mirror of
https://github.com/datascale-ai/opentalking.git
synced 2026-07-03 15:22:34 +08:00
feat: add local F5-TTS provider (#128)
This commit is contained in:
@@ -326,6 +326,7 @@ Join the QQ community to discuss real-time digital humans, FlashTalk, OmniRT, mo
|
||||
|
||||
OpenTalking references and benefits from excellent projects in the real-time digital-human ecosystem:
|
||||
|
||||
- Thanks to the [LINUX DO](https://linux.do/) community for their support and discussions.
|
||||
- [SoulX-FlashTalk](https://github.com/Soul-AILab/SoulX-FlashTalk) and [SoulX-FlashTalk-14B](https://huggingface.co/Soul-AILab/SoulX-FlashTalk-14B)
|
||||
- [LiveTalking](https://github.com/lipku/LiveTalking)
|
||||
- [OmniRT](https://github.com/datascale-ai/omnirt)
|
||||
|
||||
@@ -326,6 +326,7 @@ Join the QQ community to discuss real-time digital humans, FlashTalk, OmniRT, mo
|
||||
|
||||
OpenTalking references and benefits from excellent projects in the real-time digital-human ecosystem:
|
||||
|
||||
- Thanks to the [LINUX DO](https://linux.do/) community for their support and discussions.
|
||||
- [SoulX-FlashTalk](https://github.com/Soul-AILab/SoulX-FlashTalk) and [SoulX-FlashTalk-14B](https://huggingface.co/Soul-AILab/SoulX-FlashTalk-14B)
|
||||
- [LiveTalking](https://github.com/lipku/LiveTalking)
|
||||
- [OmniRT](https://github.com/datascale-ai/omnirt)
|
||||
|
||||
@@ -324,6 +324,7 @@ OpenTalking 的 LLM 走 OpenAI-compatible 接口,把 `OPENTALKING_LLM_BASE_URL
|
||||
|
||||
OpenTalking 参考并受益于实时数字人生态中的优秀项目:
|
||||
|
||||
- 感谢 [LINUX DO](https://linux.do/) 社区的支持与讨论。
|
||||
- [SoulX-FlashTalk](https://github.com/Soul-AILab/SoulX-FlashTalk) 和 [SoulX-FlashTalk-14B](https://huggingface.co/Soul-AILab/SoulX-FlashTalk-14B)
|
||||
- [LiveTalking](https://github.com/lipku/LiveTalking)
|
||||
- [OmniRT](https://github.com/datascale-ai/omnirt)
|
||||
|
||||
@@ -53,6 +53,7 @@ def _runtime_status_payload(request: Request) -> dict[str, Any]:
|
||||
"mock",
|
||||
"local_cosyvoice",
|
||||
"indextts",
|
||||
"local_f5_tts",
|
||||
"dashscope",
|
||||
"xiaomi_mimo",
|
||||
"openai_compatible",
|
||||
|
||||
@@ -24,6 +24,7 @@ from opentalking.providers.tts.voice_assets import (
|
||||
INDEXTTS_PROVIDER,
|
||||
INDEXTTS_PROVIDERS,
|
||||
LOCAL_COSYVOICE_PROVIDER,
|
||||
LOCAL_F5_TTS_PROVIDER,
|
||||
bundled_system_voice_root,
|
||||
iter_voice_assets,
|
||||
local_audio_model_root,
|
||||
@@ -117,6 +118,19 @@ def _write_local_cosyvoice_prompt(
|
||||
return voice_dir
|
||||
|
||||
|
||||
def _write_local_f5_tts_prompt(*, voice_id: str, wav: bytes, prompt_text: str, display_label: str, target_model: str, validation: dict[str, Any] | None = None) -> Path:
|
||||
if not re.fullmatch(r"[A-Za-z0-9_-]{3,80}", voice_id):
|
||||
raise ValueError("invalid local voice id")
|
||||
voice_dir = _local_audio_model_root() / "voices" / "clones" / voice_id
|
||||
voice_dir.mkdir(parents=True, exist_ok=True)
|
||||
clean_prompt_text = prompt_text.strip()
|
||||
(voice_dir / "prompt.wav").write_bytes(wav)
|
||||
if clean_prompt_text:
|
||||
(voice_dir / "prompt.txt").write_text(clean_prompt_text, encoding="utf-8")
|
||||
(voice_dir / "meta.json").write_text(json.dumps({"voice_id": voice_id, "display_label": display_label, "provider": LOCAL_F5_TTS_PROVIDER, "target_model": target_model, "prompt_audio": str(voice_dir / "prompt.wav"), "prompt_text": clean_prompt_text, "validation": validation or {}, "source": "clone"}, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
||||
return voice_dir
|
||||
|
||||
|
||||
def _write_local_indextts_prompt(
|
||||
*,
|
||||
provider: str,
|
||||
@@ -187,6 +201,15 @@ def _wav_audio_stats(wav: bytes) -> dict[str, float]:
|
||||
}
|
||||
|
||||
|
||||
def _validate_local_f5_tts_prompt(wav: bytes) -> dict[str, Any]:
    """Check that reference audio is long and loud enough for F5-TTS cloning.

    Returns the stats produced by ``_wav_audio_stats`` when the clip passes.
    Raises ``HTTPException`` with status 400 when the clip is shorter than one
    second, has less than half a second of active audio, or has an RMS level
    below ``LOCAL_COSYVOICE_MIN_RMS_DBFS``.
    """
    audio_stats = _wav_audio_stats(wav)

    too_short = audio_stats["duration_sec"] < 1.0
    if too_short:
        raise HTTPException(
            status_code=400,
            detail="F5-TTS 参考音频过短,请录制 3-15 秒清晰人声。",
        )

    # Evaluated only after the duration check, in the same order as before.
    too_quiet = (
        audio_stats["active_sec"] < 0.5
        or audio_stats["rms_dbfs"] < LOCAL_COSYVOICE_MIN_RMS_DBFS
    )
    if too_quiet:
        raise HTTPException(
            status_code=400,
            detail="F5-TTS 参考音频声音太小或静音太多,请靠近麦克风重录。",
        )

    return audio_stats
|
||||
|
||||
|
||||
def _validate_local_indextts_prompt(wav: bytes) -> dict[str, Any]:
|
||||
stats = _wav_audio_stats(wav)
|
||||
if stats["duration_sec"] < 1.0:
|
||||
@@ -307,6 +330,26 @@ def _local_cosyvoice_system_voice_items() -> list[VoiceItem]:
|
||||
return items
|
||||
|
||||
|
||||
def _local_f5_tts_voice_items(source: str) -> list[VoiceItem]:
    """List F5-TTS voices found on disk for one asset source.

    ``source`` must be ``"system"`` or ``"clones"``; any other value yields an
    empty list. Assets whose id does not match ``[A-Za-z0-9_-]{3,80}`` are
    skipped. Items get synthetic negative ids (-1, -2, ...) in discovery
    order, and a prompt transcript is required only for system voices.
    """
    if source not in {"system", "clones"}:
        return []

    wants_system = source == "system"
    # After the guard above, source is either "system" or "clones".
    origin = "system" if wants_system else "clone"

    discovered = iter_voice_assets(
        provider=LOCAL_F5_TTS_PROVIDER,
        sources=(source,),
        model_root=_local_audio_model_root(),
        require_prompt_text=wants_system,
    )

    collected: list[VoiceItem] = []
    for entry in discovered:
        vid = entry.voice_id
        if re.fullmatch(r"[A-Za-z0-9_-]{3,80}", vid) is None:
            continue

        info = entry.meta
        raw_label = str(info.get("display_label") or info.get("label") or vid)
        shown_label = _public_voice_label(raw_label, fallback=vid)
        model_name = str(info.get("target_model") or "").strip()

        collected.append(
            {
                "id": -len(collected) - 1,
                "user_id": 1,
                "provider": LOCAL_F5_TTS_PROVIDER,
                "voice_id": vid,
                "display_label": shown_label,
                "target_model": model_name or None,
                "source": origin,
            }
        )
    return collected
|
||||
|
||||
|
||||
def _local_indextts_voice_items(source: str) -> list[VoiceItem]:
|
||||
if source not in {"system", "clones"}:
|
||||
return []
|
||||
@@ -399,6 +442,13 @@ async def get_voices(provider: str | None = None) -> JSONResponse:
|
||||
if key not in existing:
|
||||
items.append(item)
|
||||
existing.add(key)
|
||||
if public_p is None or public_p == LOCAL_F5_TTS_PROVIDER:
|
||||
for source in ("system", "clones"):
|
||||
for item in _local_f5_tts_voice_items(source):
|
||||
key = (item["provider"], item["voice_id"])
|
||||
if key not in existing:
|
||||
items.append(item)
|
||||
existing.add(key)
|
||||
if public_p is None or public_p == INDEXTTS_PROVIDER:
|
||||
for source in ("system", "clones"):
|
||||
for item in _local_indextts_voice_items(source):
|
||||
@@ -445,10 +495,10 @@ async def post_voice_clone(
|
||||
prov = provider.strip().lower()
|
||||
if prov in {"xiaomi", "mimo"}:
|
||||
prov = "xiaomi_mimo"
|
||||
if prov not in {"local_cosyvoice", "cosyvoice", "dashscope", "xiaomi_mimo", *INDEXTTS_PROVIDERS}:
|
||||
if prov not in {"local_cosyvoice", "local_f5_tts", "cosyvoice", "dashscope", "xiaomi_mimo", *INDEXTTS_PROVIDERS}:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="provider 须为 local_cosyvoice、indextts、cosyvoice、dashscope 或 xiaomi_mimo",
|
||||
detail="provider 须为 local_cosyvoice、local_f5_tts、indextts、cosyvoice、dashscope 或 xiaomi_mimo",
|
||||
)
|
||||
|
||||
raw = await audio.read()
|
||||
@@ -471,6 +521,14 @@ async def post_voice_clone(
|
||||
)
|
||||
|
||||
try:
|
||||
if prov == LOCAL_F5_TTS_PROVIDER:
|
||||
voice_id = _safe_local_voice_id(label)
|
||||
effective_model = tm or "SWivid/F5-TTS/F5TTS_v1_Base"
|
||||
validation = _validate_local_f5_tts_prompt(wav)
|
||||
_write_local_f5_tts_prompt(voice_id=voice_id, wav=wav, prompt_text=(prompt_text or "").strip(), display_label=label, target_model=effective_model, validation=validation)
|
||||
eid = insert_clone(provider=LOCAL_F5_TTS_PROVIDER, voice_id=voice_id, display_label=label, target_model=effective_model)
|
||||
return JSONResponse({"ok": True, "entry_id": eid, "voice_id": voice_id, "display_label": label, "provider": LOCAL_F5_TTS_PROVIDER, "target_model": effective_model, "message": "F5-TTS 复刻音色已保存,可用于 F5-TTS 合成。"})
|
||||
|
||||
if prov in INDEXTTS_PROVIDERS:
|
||||
voice_id = _safe_local_voice_id(label)
|
||||
effective_model = tm or "IndexTeam/IndexTTS-2"
|
||||
@@ -641,7 +699,7 @@ async def delete_voice_entry(entry_id: int) -> JSONResponse:
|
||||
if row.get("source") != "clone":
|
||||
raise HTTPException(status_code=400, detail="不能删除系统预设音色")
|
||||
if delete_entry(entry_id):
|
||||
if row.get("provider") in {"local_cosyvoice", *INDEXTTS_PROVIDERS}:
|
||||
if row.get("provider") in {"local_cosyvoice", "local_f5_tts", *INDEXTTS_PROVIDERS}:
|
||||
_remove_local_prompt(str(row.get("voice_id") or ""))
|
||||
return JSONResponse({"ok": True})
|
||||
raise HTTPException(status_code=404, detail="not found")
|
||||
|
||||
@@ -71,6 +71,7 @@ def test_local_cosyvoice_clone_stores_prompt_locally(tmp_path, monkeypatch):
|
||||
|
||||
monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(tmp_path / "models"))
|
||||
monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
|
||||
monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])
|
||||
monkeypatch.setattr(
|
||||
voices_routes.bailian_clone,
|
||||
"convert_audio_to_wav_24k_mono",
|
||||
@@ -117,6 +118,106 @@ def test_local_cosyvoice_clone_stores_prompt_locally(tmp_path, monkeypatch):
|
||||
assert inserted["display_label"] == "本地客服女声"
|
||||
|
||||
|
||||
|
||||
def test_local_f5_tts_clone_stores_prompt_audio_and_text(tmp_path, monkeypatch):
    """Cloning with provider ``local_f5_tts`` writes the prompt files and records the voice."""
    model_root = tmp_path / "models"
    inserted: dict[str, object] = {}

    def fake_insert_clone(**kwargs):
        # Capture what the route passes to the voice store.
        inserted.update(kwargs)
        return 77

    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(model_root))
    monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
    monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])
    monkeypatch.setattr(
        voices_routes.bailian_clone,
        "convert_audio_to_wav_24k_mono",
        lambda raw, suffix: _wav_bytes(),
    )
    monkeypatch.setattr(voices_routes, "insert_clone", fake_insert_clone)

    app = FastAPI()
    app.include_router(voices_routes.router)
    client = TestClient(app)

    form = {
        "provider": "local_f5_tts",
        "target_model": "SWivid/F5-TTS/F5TTS_v1_Base",
        "display_label": "F5 客服女声",
        "prompt_text": "您好,欢迎使用实时数字人。",
    }
    upload = {"audio": ("sample.wav", _wav_bytes(), "audio/wav")}
    response = client.post("/voices/clone", data=form, files=upload)

    assert response.status_code == 200, response.text
    body = response.json()
    voice_id = body["voice_id"]
    voice_dir = model_root / "voices" / "clones" / voice_id

    # Response payload.
    assert body["provider"] == "local_f5_tts"
    assert body["entry_id"] == 77
    assert body["target_model"] == "SWivid/F5-TTS/F5TTS_v1_Base"

    # Files on disk.
    assert (voice_dir / "prompt.wav").is_file()
    assert (voice_dir / "prompt.txt").read_text(encoding="utf-8") == "您好,欢迎使用实时数字人。"
    meta = (voice_dir / "meta.json").read_text(encoding="utf-8")
    assert '"provider": "local_f5_tts"' in meta

    # Voice-store record.
    assert inserted["provider"] == "local_f5_tts"
    assert inserted["voice_id"] == voice_id
    assert inserted["display_label"] == "F5 客服女声"
|
||||
|
||||
|
||||
def test_get_voices_includes_local_f5_tts_system_voice_dirs(tmp_path, monkeypatch):
    """A system voice directory tagged ``local_f5_tts`` is listed by ``GET /voices``."""
    model_root = tmp_path / "models"
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(model_root))

    # Lay out a system voice on disk: audio, transcript and metadata.
    voice_dir = model_root / "voices" / "system" / "f5-clear-cn"
    voice_dir.mkdir(parents=True)
    (voice_dir / "prompt.wav").write_bytes(b"RIFFtest")
    (voice_dir / "prompt.txt").write_text("这是一段 F5 参考音色。", encoding="utf-8")
    (voice_dir / "meta.json").write_text(
        '{"provider":"local_f5_tts","display_label":"F5 清晰中文","target_model":"SWivid/F5-TTS/F5TTS_v1_Base"}',
        encoding="utf-8",
    )

    monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
    monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])

    app = FastAPI()
    app.include_router(voices_routes.router)
    client = TestClient(app)
    response = client.get("/voices?provider=local_f5_tts")

    assert response.status_code == 200
    expected_item = {
        "id": -1,
        "user_id": 1,
        "provider": "local_f5_tts",
        "voice_id": "f5-clear-cn",
        "display_label": "F5 清晰中文",
        "target_model": "SWivid/F5-TTS/F5TTS_v1_Base",
        "source": "system",
    }
    assert expected_item in response.json()["items"]
|
||||
|
||||
|
||||
def test_get_voices_shares_bundled_zero_shot_system_voices_with_local_f5_tts(tmp_path, monkeypatch):
    """With an empty model root, ``GET /voices`` still lists the expected system voices for ``local_f5_tts``."""
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(tmp_path / "models"))
    monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
    monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])

    app = FastAPI()
    app.include_router(voices_routes.router)
    client = TestClient(app)
    response = client.get("/voices?provider=local_f5_tts")

    assert response.status_code == 200
    items = response.json()["items"]

    expected_item = {
        "id": -1,
        "user_id": 1,
        "provider": "local_f5_tts",
        "voice_id": "cosyvoice-official-zero-shot",
        "display_label": "官方示例女声",
        "target_model": "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
        "source": "system",
    }
    assert expected_item in items

    listed_ids = [item["voice_id"] for item in items]
    assert any(vid == "local-office-serena" for vid in listed_ids)
|
||||
|
||||
|
||||
def test_local_cosyvoice_clone_rejects_silent_prompt(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(tmp_path / "models"))
|
||||
monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
|
||||
|
||||
@@ -70,6 +70,7 @@ import {
|
||||
COSYVOICE_MODEL_OPTIONS,
|
||||
COSYVOICE_VOICE_OPTIONS,
|
||||
LOCAL_COSYVOICE_MODEL_OPTIONS,
|
||||
LOCAL_F5_TTS_MODEL_OPTIONS,
|
||||
LOCAL_INDEXTTS_MODEL_OPTIONS,
|
||||
LOCAL_TTS_VOICE_OPTIONS,
|
||||
SAMBERT_MODEL_OPTIONS,
|
||||
@@ -104,6 +105,8 @@ function bailianModelOptions(provider: TtsProviderExtended): { id: string; label
|
||||
return LOCAL_COSYVOICE_MODEL_OPTIONS;
|
||||
case "indextts":
|
||||
return LOCAL_INDEXTTS_MODEL_OPTIONS;
|
||||
case "local_f5_tts":
|
||||
return LOCAL_F5_TTS_MODEL_OPTIONS;
|
||||
case "xiaomi_mimo":
|
||||
return XIAOMI_MIMO_MODEL_OPTIONS;
|
||||
default:
|
||||
@@ -121,6 +124,7 @@ function bailianVoiceOptions(provider: TtsProviderExtended): { id: string; label
|
||||
return [];
|
||||
case "local_cosyvoice":
|
||||
case "indextts":
|
||||
case "local_f5_tts":
|
||||
return LOCAL_TTS_VOICE_OPTIONS;
|
||||
case "xiaomi_mimo":
|
||||
return XIAOMI_MIMO_VOICE_OPTIONS;
|
||||
@@ -134,6 +138,7 @@ function catalogProviderKey(p: TtsProviderExtended): string | null {
|
||||
if (p === "cosyvoice") return "cosyvoice";
|
||||
if (p === "local_cosyvoice") return "local_cosyvoice";
|
||||
if (p === "indextts") return "indextts";
|
||||
if (p === "local_f5_tts") return "local_f5_tts";
|
||||
if (p === "xiaomi_mimo") return "xiaomi_mimo";
|
||||
return null;
|
||||
}
|
||||
@@ -166,13 +171,15 @@ function mergeVoiceCatalogIntoOptions(
|
||||
const extras: VoiceOpt[] = [];
|
||||
for (const r of catalog) {
|
||||
if (r.provider !== cp) continue;
|
||||
if (activeModel && r.target_model && r.target_model !== activeModel && !(ttsProvider === "local_cosyvoice" && r.source === "system")) continue;
|
||||
const sharedSystemPrompt =
|
||||
r.source === "system" && (ttsProvider === "local_cosyvoice" || ttsProvider === "local_f5_tts");
|
||||
if (activeModel && r.target_model && r.target_model !== activeModel && !sharedSystemPrompt) continue;
|
||||
if (cloneOnly && r.source !== "clone") continue;
|
||||
if (staticIds.has(r.voice_id)) continue;
|
||||
extras.push({
|
||||
id: r.voice_id,
|
||||
label: r.source === "clone" ? `复刻 · ${r.display_label}` : r.display_label,
|
||||
targetModel: ttsProvider === "local_cosyvoice" && r.source === "system" ? undefined : r.target_model,
|
||||
targetModel: sharedSystemPrompt ? undefined : r.target_model,
|
||||
});
|
||||
staticIds.add(r.voice_id);
|
||||
}
|
||||
@@ -489,6 +496,7 @@ function normalizeTtsProvider(value: string | null | undefined, fallback: TtsPro
|
||||
normalized === "sambert" ||
|
||||
normalized === "local_cosyvoice" ||
|
||||
normalized === "indextts" ||
|
||||
normalized === "local_f5_tts" ||
|
||||
normalized === "xiaomi_mimo" ||
|
||||
normalized === "openai_compatible"
|
||||
) {
|
||||
@@ -2791,7 +2799,8 @@ export default function App() {
|
||||
|| runtimeConfigTtsProvider === "local_cosyvoice"
|
||||
|| runtimeConfigTtsProvider === "indextts"
|
||||
|| runtimeConfigTtsProvider === "local_indextts"
|
||||
|| runtimeConfigTtsProvider === "omnirt_indextts",
|
||||
|| runtimeConfigTtsProvider === "omnirt_indextts"
|
||||
|| runtimeConfigTtsProvider === "local_f5_tts",
|
||||
);
|
||||
const runtimeConfigReady = Boolean(
|
||||
runtimeConfig?.llm.api_key_set
|
||||
|
||||
@@ -5,6 +5,7 @@ import {
|
||||
COSYVOICE_MODEL_OPTIONS,
|
||||
LOCAL_COSYVOICE_MODEL_OPTIONS,
|
||||
LOCAL_INDEXTTS_MODEL_OPTIONS,
|
||||
LOCAL_F5_TTS_MODEL_OPTIONS,
|
||||
XIAOMI_MIMO_MODEL_OPTIONS,
|
||||
} from "../constants/ttsBailian";
|
||||
import { QWEN_VOICE_CLONE_TARGET_OPTIONS } from "../constants/ttsQwen";
|
||||
@@ -51,6 +52,7 @@ type CloneProvider =
|
||||
| "cosyvoice"
|
||||
| "local_cosyvoice"
|
||||
| "indextts"
|
||||
| "local_f5_tts"
|
||||
| "xiaomi_mimo";
|
||||
type RecorderPhase = "idle" | "recording" | "paused" | "recorded";
|
||||
|
||||
@@ -61,6 +63,7 @@ function defaultTargetModelForProvider(provider: CloneProvider): string {
|
||||
if (provider === "indextts") {
|
||||
return LOCAL_INDEXTTS_MODEL_OPTIONS[0]?.id ?? "";
|
||||
}
|
||||
if (provider === "local_f5_tts") return LOCAL_F5_TTS_MODEL_OPTIONS[0]?.id ?? "";
|
||||
return COSYVOICE_MODEL_OPTIONS[0]?.id ?? "";
|
||||
}
|
||||
|
||||
@@ -392,6 +395,7 @@ export function BailianVoiceClone({ onSuccess, onClose }: BailianVoiceCloneProps
|
||||
<option value="xiaomi_mimo">小米 MiMo VoiceClone</option>
|
||||
<option value="local_cosyvoice">本地 CosyVoice</option>
|
||||
<option value="indextts">Local IndexTTS</option>
|
||||
<option value="local_f5_tts">Local F5-TTS</option>
|
||||
<option value="cosyvoice">云端 CosyVoice</option>
|
||||
</select>
|
||||
</label>
|
||||
@@ -411,6 +415,8 @@ export function BailianVoiceClone({ onSuccess, onClose }: BailianVoiceCloneProps
|
||||
? LOCAL_COSYVOICE_MODEL_OPTIONS
|
||||
: provider === "indextts"
|
||||
? LOCAL_INDEXTTS_MODEL_OPTIONS
|
||||
: provider === "local_f5_tts"
|
||||
? LOCAL_F5_TTS_MODEL_OPTIONS
|
||||
: COSYVOICE_MODEL_OPTIONS
|
||||
).map((o) => (
|
||||
<option key={o.id} value={o.id}>
|
||||
|
||||
@@ -67,6 +67,7 @@ const TTS_PROVIDER_LABELS: Record<TtsProviderExtended, string> = {
|
||||
sambert: "Sambert",
|
||||
local_cosyvoice: "Local CosyVoice",
|
||||
indextts: "Local IndexTTS",
|
||||
local_f5_tts: "Local F5-TTS",
|
||||
xiaomi_mimo: "小米 MiMo",
|
||||
openai_compatible: "OpenAI API",
|
||||
};
|
||||
@@ -78,6 +79,7 @@ const TTS_PROVIDER_SUBTITLES: Record<TtsProviderExtended, string> = {
|
||||
sambert: "Bailian",
|
||||
local_cosyvoice: "本地模型",
|
||||
indextts: "本地部署",
|
||||
local_f5_tts: "本地模型",
|
||||
xiaomi_mimo: "OpenAI 兼容",
|
||||
openai_compatible: "OpenAI-compatible",
|
||||
};
|
||||
@@ -491,7 +493,7 @@ export function SettingsPanel({
|
||||
subtitle: option.id,
|
||||
hasChildren: true,
|
||||
}));
|
||||
const providerOptions: ColumnOption[] = (["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((p) => ({
|
||||
const providerOptions: ColumnOption[] = (["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "local_f5_tts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((p) => ({
|
||||
id: p,
|
||||
label: TTS_PROVIDER_LABELS[p],
|
||||
subtitle: TTS_PROVIDER_SUBTITLES[p],
|
||||
@@ -517,7 +519,7 @@ export function SettingsPanel({
|
||||
}));
|
||||
const providerHasSingleModel = (provider: TtsProviderExtended) => {
|
||||
if (provider === "edge" || provider === "openai_compatible") return true;
|
||||
if (provider === "local_cosyvoice" || provider === "indextts") return true;
|
||||
if (provider === "local_cosyvoice" || provider === "indextts" || provider === "local_f5_tts") return true;
|
||||
if (provider !== ttsProvider) return false;
|
||||
return qwenModelColumnOptions.length <= 1;
|
||||
};
|
||||
|
||||
@@ -234,6 +234,7 @@ function providerLabel(provider: TtsProviderExtended): string {
|
||||
if (provider === "cosyvoice") return "CosyVoice";
|
||||
if (provider === "sambert") return "Sambert";
|
||||
if (provider === "indextts") return "Local IndexTTS";
|
||||
if (provider === "local_f5_tts") return "Local F5-TTS";
|
||||
if (provider === "xiaomi_mimo") return "小米 MiMo";
|
||||
if (provider === "openai_compatible") return "OpenAI-compatible TTS";
|
||||
return "Local CosyVoice";
|
||||
@@ -737,7 +738,7 @@ export function VideoCreationWorkspace({
|
||||
<label className="block text-sm font-medium text-slate-700">
|
||||
TTS
|
||||
<select value={ttsProvider} onChange={(event) => onTtsProviderChange(event.target.value as TtsProviderExtended)} className="mt-2 w-full rounded-lg border border-slate-200 bg-white px-3 py-2 text-sm">
|
||||
{(["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((item) => <option key={item} value={item}>{providerLabel(item)}</option>)}
|
||||
{(["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "local_f5_tts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((item) => <option key={item} value={item}>{providerLabel(item)}</option>)}
|
||||
</select>
|
||||
</label>
|
||||
<label className="block text-sm font-medium text-slate-700">
|
||||
|
||||
@@ -28,6 +28,10 @@ export const LOCAL_INDEXTTS_MODEL_OPTIONS: { id: string; label: string }[] = [
|
||||
{ id: "IndexTeam/IndexTTS-2", label: "IndexTTS-2(本地模型)" },
|
||||
];
|
||||
|
||||
export const LOCAL_F5_TTS_MODEL_OPTIONS: { id: string; label: string }[] = [
|
||||
{ id: "SWivid/F5-TTS/F5TTS_v1_Base", label: "F5-TTS v1 Base(本地模型)" },
|
||||
];
|
||||
|
||||
export const LOCAL_TTS_VOICE_OPTIONS: { id: string; label: string }[] = [];
|
||||
|
||||
export const XIAOMI_MIMO_MODEL_OPTIONS: { id: string; label: string }[] = [
|
||||
@@ -54,6 +58,7 @@ export type TtsProviderExtended =
|
||||
| "sambert"
|
||||
| "local_cosyvoice"
|
||||
| "indextts"
|
||||
| "local_f5_tts"
|
||||
| "xiaomi_mimo"
|
||||
| "openai_compatible";
|
||||
|
||||
|
||||
@@ -15,12 +15,14 @@ live in the model pages.
|
||||
| `elevenlabs` | Hosted API | Hosted multilingual voices | `.env` provider config |
|
||||
| `local_cosyvoice` | Local deployment | Local Chinese TTS, built-in voices, and cloned voices | [CosyVoice](tts/cosyvoice.md) |
|
||||
| `indextts` | Local deployment / OmniRT | Controllable dubbing, emotion control, and voice cloning | [IndexTTS](tts/indextts.md) |
|
||||
| `local_f5_tts` | Local deployment | Local F5-TTS Base voice cloning | [F5-TTS](tts/f5-tts.md) |
|
||||
| `local_qwen3_tts` | Local deployment | Local Qwen3-TTS Base voice cloning | [Qwen3-TTS](tts/qwen3-tts.md) |
|
||||
|
||||
## Local Model Entries
|
||||
|
||||
- [CosyVoice Local Deployment](tts/cosyvoice.md)
|
||||
- [IndexTTS Local Deployment](tts/indextts.md)
|
||||
- [F5-TTS Local Deployment](tts/f5-tts.md)
|
||||
- [Qwen3-TTS Local Deployment](tts/qwen3-tts.md)
|
||||
|
||||
Each local model page contains use cases, weight preparation, startup commands,
|
||||
|
||||
116
docs/en/speech_models/tts/f5-tts.md
Normal file
116
docs/en/speech_models/tts/f5-tts.md
Normal file
@@ -0,0 +1,116 @@
|
||||
# F5-TTS Local Deployment
|
||||
|
||||
F5-TTS is integrated through OpenTalking's `local_f5_tts` provider. Use it for local voice cloning, short realtime replies, and offline video dubbing. The integration runs as a same-machine HTTP sidecar so the OpenTalking main process stays isolated from F5-TTS runtime and CUDA dependencies.
|
||||
|
||||
## Use Cases
|
||||
|
||||
- Local voice cloning without a hosted TTS API.
|
||||
- You already have a 3-15 second reference clip with a matching transcript and want to clone that speaker's voice.
|
||||
- F5-TTS dependencies should stay outside the main OpenTalking venv.
|
||||
|
||||
## Weight Preparation
|
||||
|
||||
Use a single local audio model root, for example `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT`:
|
||||
|
||||
```bash title="Terminal"
|
||||
cd "$OPENTALKING_HOME"
|
||||
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
|
||||
|
||||
python scripts/download_local_audio_models.py \
|
||||
--root "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT" \
|
||||
--model f5-tts-v1-base
|
||||
```
|
||||
|
||||
The downloader maps `SWivid/F5-TTS` `F5TTS_v1_Base/model_1250000.safetensors` to:
|
||||
|
||||
```text
|
||||
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/SWivid__F5-TTS__F5TTS_v1_Base/model_1250000.safetensors
|
||||
```
|
||||
|
||||
Prepare the runtime and sidecar venv:
|
||||
|
||||
```bash title="Terminal"
|
||||
mkdir -p "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
|
||||
cd "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
|
||||
|
||||
if [ ! -d F5-TTS/.git ]; then
|
||||
git clone https://github.com/SWivid/F5-TTS.git F5-TTS
|
||||
fi
|
||||
|
||||
python3 -m venv --system-site-packages "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
|
||||
. "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system/bin/activate"
|
||||
pip install -U pip wheel setuptools
|
||||
pip install --no-deps -e "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/F5-TTS"
|
||||
pip install fastapi "uvicorn[standard]" soundfile cached_path hydra-core ema_pytorch vocos x_transformers transformers_stream_generator rjieba pypinyin tomli bitsandbytes pydub torchcodec torchdiffeq unidecode wandb
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
```env title=".env"
|
||||
OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
|
||||
OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
|
||||
OPENTALKING_LOCAL_AUDIO_MODEL_ROOT=./models/local-audio
|
||||
OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR=./models/local-audio/runtime/F5-TTS
|
||||
OPENTALKING_TTS_LOCAL_F5_TTS_DEVICE=cuda
|
||||
```
|
||||
|
||||
## Voice Cloning
|
||||
|
||||
`local_f5_tts` requires reference audio. Upload a clone voice through the API, or prepare the directory manually:
|
||||
|
||||
```text
|
||||
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/voices/clones/my-f5-voice/
|
||||
prompt.wav
|
||||
prompt.txt
|
||||
meta.json
|
||||
```
|
||||
|
||||
Example `meta.json`:
|
||||
|
||||
```json
|
||||
{"provider":"local_f5_tts"}
|
||||
```
|
||||
|
||||
After upload, `/api/voices?provider=local_f5_tts` returns the voice id. TTS preview, realtime dialogue, and video generation can all use that voice.
|
||||
|
||||
## Start Command
|
||||
|
||||
Start the F5-TTS sidecar first, then OpenTalking:
|
||||
|
||||
```bash title="Terminal"
|
||||
cd "$OPENTALKING_HOME"
|
||||
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
|
||||
export OPENTALKING_F5_TTS_VENV_DIR="$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
|
||||
bash scripts/quickstart/start_local_f5_tts.sh --port 19095
|
||||
|
||||
export OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
|
||||
export OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
|
||||
python -m apps.api.main
|
||||
```
|
||||
|
||||
## Verification
|
||||
|
||||
```bash title="Terminal"
|
||||
curl -fsS http://127.0.0.1:19095/health
|
||||
curl -fsS http://127.0.0.1:8000/health
|
||||
```
|
||||
|
||||
TTS preview should use provider `local_f5_tts` and a clone voice with `prompt.wav`. Save the result as WAV and verify the spoken text and voice by ASR or listening.
|
||||
|
||||
## Benchmark Log
|
||||
|
||||
| Item | Command / API | Target | Measured |
|
||||
|------|---------------|--------|----------|
|
||||
| TTS preview | `/tts/preview` + SenseVoiceSmall ASR | Playable WAV, correct text | Passed: preview returned a 16 kHz mono WAV; SenseVoiceSmall ASR matched the target text. |
|
||||
| Realtime dialogue | local mode dialogue / warm TTS | RTF < 1.0 | Passed: warm RTF 0.278 for 3.31s audio in 0.918s; prior warm checks were 0.386 and 0.518, all below 1. |
|
||||
| Offline video | video generation API / CLI | Generation succeeds, audio drives avatar | Passed: QuickTalk + F5 clone voice generated an MP4; ffprobe showed H.264 video and 16 kHz mono AAC audio. |
|
||||
|
||||
## Common Errors
|
||||
|
||||
| Symptom | Action |
|
||||
|---------|--------|
|
||||
| `Missing F5-TTS checkpoint` | Confirm `model_1250000.safetensors` is under `SWivid__F5-TTS__F5TTS_v1_Base`. |
|
||||
| `requires prompt_audio` | Select a clone voice or set `OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_AUDIO`. |
|
||||
| Dependency conflicts | Do not run the sidecar from OpenTalking's main `.venv`; use a separate venv such as `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system` and reuse the host PyTorch/CUDA environment when appropriate. |
|
||||
| Slow first request | Set `OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD=1` and run a short warm-up request after startup. |
|
||||
| QuickTalk v3 reshape error | Keep `OPENTALKING_QUICKTALK_RESOLUTION=256` for the current TorchScript export when generating video; 160/128 resolution makes internal feature shapes mismatch. |
|
||||
@@ -13,12 +13,14 @@ talking-head backend。本文只做选型和入口导航;权重、启动、验
|
||||
| `elevenlabs` | 托管 API | 多语言托管音色 | `.env` provider 配置 |
|
||||
| `local_cosyvoice` | 本地部署 | 本地中文 TTS、内置音色和复刻音色 | [CosyVoice](tts/cosyvoice.md) |
|
||||
| `indextts` | 本地部署 / OmniRT | 可控配音、情绪控制和复刻音色 | [IndexTTS](tts/indextts.md) |
|
||||
| `local_f5_tts` | 本地部署 | 本地 F5-TTS Base 音色克隆 | [F5-TTS](tts/f5-tts.md) |
|
||||
| `local_qwen3_tts` | 本地部署 | 本地 Qwen3-TTS Base 复刻音色 | [Qwen3-TTS](tts/qwen3-tts.md) |
|
||||
|
||||
## 本地模型入口
|
||||
|
||||
- [CosyVoice 本地部署](tts/cosyvoice.md)
|
||||
- [IndexTTS 本地部署](tts/indextts.md)
|
||||
- [F5-TTS 本地部署](tts/f5-tts.md)
|
||||
- [Qwen3-TTS 本地部署](tts/qwen3-tts.md)
|
||||
|
||||
每个本地模型页面都包含适用场景、权重准备、启动命令、验证命令和常见错误。
|
||||
|
||||
116
docs/zh/speech_models/tts/f5-tts.md
Normal file
116
docs/zh/speech_models/tts/f5-tts.md
Normal file
@@ -0,0 +1,116 @@
|
||||
# F5-TTS 本地部署
|
||||
|
||||
F5-TTS 通过 OpenTalking 的 `local_f5_tts` provider 接入,适合本地音色克隆、短句实时回复和离线视频配音。当前实现采用同机 HTTP sidecar:OpenTalking 主进程只负责调度,F5-TTS 运行在独立 venv 中,避免依赖和 CUDA 包冲突。
|
||||
|
||||
## 适用场景
|
||||
|
||||
- 需要本地音色克隆,不希望调用托管 TTS API。
|
||||
- 已有 3-15 秒参考音频和对应文本,希望复刻说话人音色。
|
||||
- 需要把 F5-TTS runtime 与 OpenTalking 主进程隔离。
|
||||
|
||||
## 权重准备
|
||||
|
||||
推荐把权重放到统一的本地音频模型目录,例如 `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT`:
|
||||
|
||||
```bash title="终端"
|
||||
cd "$OPENTALKING_HOME"
|
||||
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
|
||||
|
||||
python scripts/download_local_audio_models.py \
|
||||
--root "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT" \
|
||||
--model f5-tts-v1-base
|
||||
```
|
||||
|
||||
脚本会把 `SWivid/F5-TTS` 的 `F5TTS_v1_Base/model_1250000.safetensors` 映射到:
|
||||
|
||||
```text
|
||||
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/SWivid__F5-TTS__F5TTS_v1_Base/model_1250000.safetensors
|
||||
```
|
||||
|
||||
准备 runtime 和独立 venv:
|
||||
|
||||
```bash title="终端"
|
||||
mkdir -p "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
|
||||
cd "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
|
||||
|
||||
if [ ! -d F5-TTS/.git ]; then
|
||||
git clone https://github.com/SWivid/F5-TTS.git F5-TTS
|
||||
fi
|
||||
|
||||
python3 -m venv --system-site-packages "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
|
||||
. "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system/bin/activate"
|
||||
pip install -U pip wheel setuptools
|
||||
pip install --no-deps -e "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/F5-TTS"
|
||||
pip install fastapi "uvicorn[standard]" soundfile cached_path hydra-core ema_pytorch vocos x_transformers transformers_stream_generator rjieba pypinyin tomli bitsandbytes pydub torchcodec torchdiffeq unidecode wandb
|
||||
```
|
||||
|
||||
## 配置项
|
||||
|
||||
```env title=".env"
|
||||
OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
|
||||
OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
|
||||
OPENTALKING_LOCAL_AUDIO_MODEL_ROOT=./models/local-audio
|
||||
OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR=./models/local-audio/runtime/F5-TTS
|
||||
OPENTALKING_TTS_LOCAL_F5_TTS_DEVICE=cuda
|
||||
```
|
||||
|
||||
## 音色克隆
|
||||
|
||||
`local_f5_tts` 请求必须带参考音频。可以通过 API 上传 clone voice,也可以手工准备目录:
|
||||
|
||||
```text
|
||||
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/voices/clones/my-f5-voice/
|
||||
prompt.wav
|
||||
prompt.txt
|
||||
meta.json
|
||||
```
|
||||
|
||||
`meta.json` 示例:
|
||||
|
||||
```json
|
||||
{"provider":"local_f5_tts"}
|
||||
```
|
||||
|
||||
上传后 `/api/voices?provider=local_f5_tts` 会返回可选 voice id;TTS preview、实时对话和视频生成都可以使用这个 voice。
|
||||
|
||||
## 启动命令
|
||||
|
||||
先启动 F5-TTS sidecar,再启动 OpenTalking:
|
||||
|
||||
```bash title="终端"
|
||||
cd "$OPENTALKING_HOME"
|
||||
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
|
||||
export OPENTALKING_F5_TTS_VENV_DIR="$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
|
||||
bash scripts/quickstart/start_local_f5_tts.sh --port 19095
|
||||
|
||||
export OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
|
||||
export OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
|
||||
python -m apps.api.main
|
||||
```
|
||||
|
||||
## 验证命令
|
||||
|
||||
```bash title="终端"
|
||||
curl -fsS http://127.0.0.1:19095/health
|
||||
curl -fsS http://127.0.0.1:8000/health
|
||||
```
|
||||
|
||||
TTS preview 应使用 `local_f5_tts` provider 和一个带 `prompt.wav` 的 clone voice。生成结果可以保存成 WAV 后用 ASR 或人工听检确认文本和音色。
|
||||
|
||||
## 实测记录
|
||||
|
||||
| 项目 | 命令 / 接口 | 目标 | 实测 |
|
||||
|------|-------------|------|------|
|
||||
| TTS preview | `/tts/preview` + SenseVoiceSmall ASR | 可播放 WAV,文本正确 | 通过:试听接口返回 16 kHz mono WAV;SenseVoiceSmall ASR 识别文本与目标文本一致。 |
|
||||
| 实时对话 | local mode dialogue / warm TTS | RTF < 1.0 | 通过:warm RTF 0.278(3.31s 音频,0.918s 合成);历史复测 0.386/0.518,均低于 1。 |
|
||||
| 离线视频 | video generation API / CLI | 生成成功,音频驱动正常 | 通过:QuickTalk + F5 clone voice 可生成 MP4;ffprobe 显示 H.264 视频和 16 kHz mono AAC 音频。 |
|
||||
|
||||
## 常见错误
|
||||
|
||||
| 现象 | 处理 |
|
||||
|------|------|
|
||||
| `Missing F5-TTS checkpoint` | 确认 `model_1250000.safetensors` 位于 `SWivid__F5-TTS__F5TTS_v1_Base` 目录。 |
|
||||
| `requires prompt_audio` | 选择 clone voice,或设置 `OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_AUDIO`。 |
|
||||
| 依赖冲突 | 不要用 OpenTalking 主 `.venv` 启动 sidecar;建议使用 `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system` 这样的独立 venv,并复用宿主机已有 PyTorch/CUDA 环境。 |
|
||||
| 首次请求慢 | 设置 `OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD=1` 并在服务启动后先做一次短句预热。 |
|
||||
| QuickTalk v3 reshape 报错 | 使用当前 TorchScript 导出模型生成视频时保持 `OPENTALKING_QUICKTALK_RESOLUTION=256`;160/128 分辨率会让模型内部特征尺寸不匹配。 |
|
||||
@@ -149,6 +149,15 @@ def _flatten_config(raw: dict[str, Any] | None) -> dict[str, Any]:
|
||||
"local_cosyvoice_max_token_text_ratio": "tts_local_cosyvoice_max_token_text_ratio",
|
||||
"local_cosyvoice_min_token_text_ratio": "tts_local_cosyvoice_min_token_text_ratio",
|
||||
"local_cosyvoice_mask_stop_tokens": "tts_local_cosyvoice_mask_stop_tokens",
|
||||
"local_f5_tts_model": "tts_local_f5_tts_model",
|
||||
"local_f5_tts_model_dir": "tts_local_f5_tts_model_dir",
|
||||
"local_f5_tts_runtime_dir": "tts_local_f5_tts_runtime_dir",
|
||||
"local_f5_tts_service_url": "tts_local_f5_tts_service_url",
|
||||
"local_f5_tts_ckpt_file": "tts_local_f5_tts_ckpt_file",
|
||||
"local_f5_tts_vocoder_local_path": "tts_local_f5_tts_vocoder_local_path",
|
||||
"local_f5_tts_prompt_audio": "tts_local_f5_tts_prompt_audio",
|
||||
"local_f5_tts_prompt_text": "tts_local_f5_tts_prompt_text",
|
||||
"local_f5_tts_device": "tts_local_f5_tts_device",
|
||||
"local_indextts_model": "tts_local_indextts_model",
|
||||
"local_indextts_model_dir": "tts_local_indextts_model_dir",
|
||||
"local_indextts_cfg_path": "tts_local_indextts_cfg_path",
|
||||
@@ -516,6 +525,15 @@ class Settings(BaseSettings):
|
||||
tts_local_cosyvoice_max_token_text_ratio: float = 6.0
|
||||
tts_local_cosyvoice_min_token_text_ratio: float = 0.0
|
||||
tts_local_cosyvoice_mask_stop_tokens: bool = True
|
||||
tts_local_f5_tts_model: str = "SWivid/F5-TTS/F5TTS_v1_Base"
|
||||
tts_local_f5_tts_model_dir: str = ""
|
||||
tts_local_f5_tts_runtime_dir: str = ""
|
||||
tts_local_f5_tts_service_url: str = ""
|
||||
tts_local_f5_tts_ckpt_file: str = ""
|
||||
tts_local_f5_tts_vocoder_local_path: str = ""
|
||||
tts_local_f5_tts_prompt_audio: str = ""
|
||||
tts_local_f5_tts_prompt_text: str = ""
|
||||
tts_local_f5_tts_device: str = "auto"
|
||||
tts_local_indextts_model: str = "IndexTeam/IndexTTS-2"
|
||||
tts_local_indextts_model_dir: str = ""
|
||||
tts_local_indextts_cfg_path: str = ""
|
||||
|
||||
@@ -202,6 +202,42 @@ def _local_audio_asset_file_dir(name: str, relative_file: str, *fallback_names:
|
||||
return _local_audio_asset_dir(name, relative_file, *fallback_names)
|
||||
|
||||
|
||||
def _local_f5_tts_model() -> str:
    """Resolve the F5-TTS model identifier.

    Precedence: the ``_provider_env("local_f5_tts", "MODEL")`` lookup, then the
    ``tts_local_f5_tts_model`` setting, then the built-in default id.
    """
    override = _provider_env("local_f5_tts", "MODEL")
    if override:
        return override
    configured = _settings_value("tts_local_f5_tts_model", "")
    if configured:
        return configured
    return "SWivid/F5-TTS/F5TTS_v1_Base"
|
||||
|
||||
|
||||
def _local_f5_tts_model_dir(model: str) -> str:
    """Resolve the directory that holds the F5-TTS weights for *model*.

    Precedence: the ``_provider_env("local_f5_tts", "MODEL_DIR")`` lookup, then
    the ``tts_local_f5_tts_model_dir`` setting, then a directory under the local
    audio model root named after *model* with ``/`` replaced by ``__``.
    """
    override = _provider_env("local_f5_tts", "MODEL_DIR")
    if override:
        return override
    configured = _settings_value("tts_local_f5_tts_model_dir", "")
    if configured:
        return configured
    folder_name = model.replace("/", "__")
    return str(Path(_local_audio_model_root()) / folder_name)
|
||||
|
||||
|
||||
def _local_f5_tts_runtime_dir() -> str:
    """Resolve the F5-TTS runtime checkout directory.

    Precedence: the ``_provider_env("local_f5_tts", "RUNTIME_DIR")`` lookup,
    then the ``tts_local_f5_tts_runtime_dir`` setting, then
    ``<local audio model root>/runtime/F5-TTS``.
    """
    override = _provider_env("local_f5_tts", "RUNTIME_DIR")
    if override:
        return override
    configured = _settings_value("tts_local_f5_tts_runtime_dir", "")
    if configured:
        return configured
    return str(Path(_local_audio_model_root()) / "runtime" / "F5-TTS")
|
||||
|
||||
|
||||
def _local_f5_tts_service_url() -> str:
    """Resolve the F5-TTS sidecar URL.

    The ``_provider_env("local_f5_tts", "SERVICE_URL")`` lookup wins; otherwise
    the ``tts_local_f5_tts_service_url`` setting is returned as-is (it may be
    empty, in which case no sidecar is configured).
    """
    override = _provider_env("local_f5_tts", "SERVICE_URL")
    if override:
        return override
    return _settings_value("tts_local_f5_tts_service_url", "")
|
||||
|
||||
|
||||
def _local_f5_tts_ckpt_file(model_dir: str) -> str:
    """Resolve the F5-TTS checkpoint file path.

    Precedence: the ``_provider_env("local_f5_tts", "CKPT_FILE")`` lookup, then
    the ``tts_local_f5_tts_ckpt_file`` setting, then
    ``model_1250000.safetensors`` inside *model_dir*.
    """
    override = _provider_env("local_f5_tts", "CKPT_FILE")
    if override:
        return override
    configured = _settings_value("tts_local_f5_tts_ckpt_file", "")
    if configured:
        return configured
    return str(Path(model_dir) / "model_1250000.safetensors")
|
||||
|
||||
|
||||
def _local_f5_tts_vocoder_local_path() -> str:
    """Resolve the optional local vocoder path.

    The ``_provider_env("local_f5_tts", "VOCODER_LOCAL_PATH")`` lookup wins;
    otherwise the ``tts_local_f5_tts_vocoder_local_path`` setting is returned
    as-is (possibly empty).
    """
    override = _provider_env("local_f5_tts", "VOCODER_LOCAL_PATH")
    if override:
        return override
    return _settings_value("tts_local_f5_tts_vocoder_local_path", "")
|
||||
|
||||
|
||||
def _local_f5_tts_prompt_audio() -> str:
    """Resolve the default reference-audio path for voice cloning.

    The ``_provider_env("local_f5_tts", "PROMPT_AUDIO")`` lookup wins; otherwise
    the ``tts_local_f5_tts_prompt_audio`` setting is returned as-is (possibly
    empty).
    """
    override = _provider_env("local_f5_tts", "PROMPT_AUDIO")
    if override:
        return override
    return _settings_value("tts_local_f5_tts_prompt_audio", "")
|
||||
|
||||
|
||||
def _local_f5_tts_prompt_text() -> str:
    """Resolve the default reference transcript for voice cloning.

    The ``_provider_env("local_f5_tts", "PROMPT_TEXT")`` lookup wins; otherwise
    the ``tts_local_f5_tts_prompt_text`` setting is returned as-is (possibly
    empty).
    """
    override = _provider_env("local_f5_tts", "PROMPT_TEXT")
    if override:
        return override
    return _settings_value("tts_local_f5_tts_prompt_text", "")
|
||||
|
||||
|
||||
def _local_f5_tts_device() -> str:
    """Resolve the device string for local F5-TTS.

    Sources are consulted lazily, in order, and the first non-empty value wins;
    ``"auto"`` is returned when every source is empty.
    """
    # NOTE(review): the Settings default for ``tts_local_f5_tts_device`` appears
    # to be "auto"; if ``_settings_value`` returns that default, the shared
    # OPENTALKING_LOCAL_*_DEVICE fallbacks below are never reached. Confirm.
    lookups = (
        lambda: _provider_env("local_f5_tts", "DEVICE"),
        lambda: _settings_value("tts_local_f5_tts_device", ""),
        lambda: os.environ.get("OPENTALKING_LOCAL_TTS_DEVICE", "").strip(),
        lambda: os.environ.get("OPENTALKING_LOCAL_AUDIO_DEVICE", "").strip(),
        lambda: _settings_value("local_audio_device", ""),
    )
    for lookup in lookups:
        candidate = lookup()
        if candidate:
            return candidate
    return "auto"
|
||||
|
||||
|
||||
def _local_indextts_model() -> str:
|
||||
return (
|
||||
_provider_env("local_indextts", "MODEL")
|
||||
@@ -671,6 +707,11 @@ def tts_provider_config(provider: str) -> dict[str, str | bool | int | float]:
|
||||
"key_set": False,
|
||||
"service_url_set": bool(service_url),
|
||||
}
|
||||
if p == "local_f5_tts":
|
||||
model = _local_f5_tts_model()
|
||||
model_dir = _local_f5_tts_model_dir(model)
|
||||
service_url = _local_f5_tts_service_url()
|
||||
return {"provider": p, "model": model, "model_dir": model_dir, "voice": "local-default", "device": _local_f5_tts_device(), "key_set": False, "service_url": service_url, "service_url_set": bool(service_url), "runtime_dir": _local_f5_tts_runtime_dir(), "ckpt_file": _local_f5_tts_ckpt_file(model_dir), "vocoder_local_path": _local_f5_tts_vocoder_local_path(), "prompt_audio_set": bool(_local_f5_tts_prompt_audio())}
|
||||
if p == "local_indextts":
|
||||
model = _local_indextts_model()
|
||||
model_dir = _local_indextts_model_dir(model)
|
||||
@@ -904,6 +945,10 @@ def tts_log_profile(
|
||||
service = os.environ.get("OPENTALKING_LOCAL_QWEN3_TTS_SERVICE_URL", "").strip() or "(unset)"
|
||||
return f"TTS_API=local_qwen3_tts | model={model!r} service={service!r} | {req_part}"
|
||||
|
||||
if p == "local_f5_tts":
|
||||
model = (tts_model_override or "").strip() or _local_f5_tts_model()
|
||||
return f"TTS_API=local_f5_tts | model={model!r} device={_local_f5_tts_device()!r} prompt_audio_set={bool(_local_f5_tts_prompt_audio())} | {req_part}"
|
||||
|
||||
if p == "local_indextts":
|
||||
model = (tts_model_override or "").strip() or _local_indextts_model()
|
||||
return (
|
||||
@@ -1038,6 +1083,11 @@ def create_tts_adapter(
|
||||
chunk_ms=chunk_ms,
|
||||
model=tts_model,
|
||||
)
|
||||
if p == "local_f5_tts":
|
||||
from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter
|
||||
model = (tts_model or "").strip() or _local_f5_tts_model()
|
||||
model_dir = _local_f5_tts_model_dir(model)
|
||||
return LocalF5TTSAdapter(default_voice=default_voice, sample_rate=sample_rate, chunk_ms=chunk_ms, model=model, model_dir=model_dir, runtime_dir=_local_f5_tts_runtime_dir(), ckpt_file=_local_f5_tts_ckpt_file(model_dir), vocoder_local_path=_local_f5_tts_vocoder_local_path(), service_url=_local_f5_tts_service_url(), prompt_audio=_local_f5_tts_prompt_audio(), prompt_text=_local_f5_tts_prompt_text(), device=_local_f5_tts_device())
|
||||
if p == "local_indextts":
|
||||
from opentalking.providers.tts.local_indextts.adapter import LocalIndexTTSAdapter
|
||||
|
||||
|
||||
5
opentalking/providers/tts/local_f5_tts/__init__.py
Normal file
5
opentalking/providers/tts/local_f5_tts/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .adapter import LocalF5TTSAdapter
|
||||
|
||||
__all__ = ["LocalF5TTSAdapter"]
|
||||
182
opentalking/providers/tts/local_f5_tts/adapter.py
Normal file
182
opentalking/providers/tts/local_f5_tts/adapter.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import wave
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
import numpy as np
|
||||
|
||||
from opentalking.core.types.frames import AudioChunk
|
||||
from opentalking.providers.tts.voice_assets import LOCAL_F5_TTS_PROVIDER, resolve_voice_asset
|
||||
|
||||
|
||||
def _settings_value(name: str, default: str = "") -> str:
|
||||
try:
|
||||
from opentalking.core.config import get_settings
|
||||
value = getattr(get_settings(), name, default)
|
||||
if value is not None and str(value).strip():
|
||||
return str(value).strip()
|
||||
except Exception:
|
||||
pass
|
||||
return default
|
||||
|
||||
|
||||
def _local_audio_model_root() -> Path:
|
||||
raw = os.environ.get("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "").strip()
|
||||
try:
|
||||
from opentalking.core.config import get_settings
|
||||
raw = raw or (get_settings().local_audio_model_root or "").strip()
|
||||
except Exception:
|
||||
pass
|
||||
return Path(raw or "./models/local-audio").expanduser().resolve()
|
||||
|
||||
|
||||
def _audio_format_from_content_type(content_type: str | None) -> str | None:
|
||||
value = (content_type or "").split(";", 1)[0].strip().lower()
|
||||
if value in {"audio/wav", "audio/wave", "audio/x-wav"}:
|
||||
return "wav"
|
||||
if value in {"audio/l16", "audio/pcm", "application/octet-stream"}:
|
||||
return "pcm"
|
||||
if value in {"audio/mpeg", "audio/mp3"}:
|
||||
return "mp3"
|
||||
return None
|
||||
|
||||
|
||||
def _source_sample_rate_from_headers(headers: Any, fallback: int) -> int:
|
||||
direct = str(headers.get("x-audio-sample-rate", "") or "").strip()
|
||||
if direct.isdigit():
|
||||
return int(direct)
|
||||
content_type = str(headers.get("content-type", "") or "")
|
||||
for part in content_type.split(";")[1:]:
|
||||
key, sep, value = part.strip().partition("=")
|
||||
if sep and key.strip().lower() == "rate" and value.strip().isdigit():
|
||||
return int(value.strip())
|
||||
return fallback
|
||||
|
||||
|
||||
def _resample_linear(pcm: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
|
||||
pcm = np.asarray(pcm, dtype=np.int16).reshape(-1)
|
||||
if pcm.size == 0 or src_sr == dst_sr:
|
||||
return pcm.copy()
|
||||
pcm_f = pcm.astype(np.float32) / 32768.0
|
||||
n_dst = max(1, int(round(pcm.size * dst_sr / src_sr)))
|
||||
xi = np.linspace(0.0, pcm.size - 1.0, num=n_dst)
|
||||
out = np.interp(xi, np.arange(pcm.size), pcm_f)
|
||||
return np.clip(np.round(out * 32768.0), -32768, 32767).astype(np.int16)
|
||||
|
||||
|
||||
def _split_pcm_chunks(pcm: np.ndarray, sr: int, chunk_ms: float) -> list[AudioChunk]:
|
||||
samples_per_chunk = max(1, int(sr * (chunk_ms / 1000.0)))
|
||||
out: list[AudioChunk] = []
|
||||
for i in range(0, len(pcm), samples_per_chunk):
|
||||
part = pcm[i : i + samples_per_chunk]
|
||||
if part.size == 0:
|
||||
continue
|
||||
out.append(AudioChunk(data=part.astype(np.int16), sample_rate=sr, duration_ms=1000.0 * part.size / sr))
|
||||
return out
|
||||
|
||||
|
||||
def _read_wav_bytes_i16(raw: bytes) -> tuple[np.ndarray, int]:
|
||||
with wave.open(io.BytesIO(raw), "rb") as wf:
|
||||
source_sr = int(wf.getframerate())
|
||||
channels = int(wf.getnchannels())
|
||||
sample_width = int(wf.getsampwidth())
|
||||
pcm_bytes = wf.readframes(wf.getnframes())
|
||||
if sample_width != 2:
|
||||
raise RuntimeError(f"Unsupported WAV sample width for local F5-TTS: {sample_width}")
|
||||
pcm = np.frombuffer(pcm_bytes, dtype="<i2").astype(np.int16, copy=False)
|
||||
if channels > 1:
|
||||
frame_count = pcm.size // channels
|
||||
pcm = pcm[: frame_count * channels].reshape(frame_count, channels).mean(axis=1).astype(np.int16)
|
||||
return pcm, source_sr
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class F5VoicePrompt:
    """Immutable reference prompt used for F5-TTS voice cloning."""

    # Path of the reference recording; sent to the sidecar as ``prompt_audio``.
    prompt_audio: Path
    # Text sent to the sidecar as ``prompt_text`` (presumably the transcript of
    # the reference recording); may be an empty string.
    prompt_text: str
|
||||
|
||||
|
||||
class LocalF5TTSAdapter:
    """TTS adapter that delegates synthesis to a local F5-TTS HTTP sidecar.

    The adapter does not load the model itself: it POSTs a JSON request to
    ``service_url`` and converts the audio response into ``AudioChunk`` objects
    at ``sample_rate``, each covering at most ``chunk_ms`` milliseconds.
    """

    def __init__(
        self,
        default_voice: str | None = None,
        sample_rate: int = 16000,
        chunk_ms: float = 20.0,
        *,
        model: str | None = None,
        model_dir: str | None = None,
        runtime_dir: str | None = None,
        ckpt_file: str | None = None,
        vocoder_local_path: str | None = None,
        service_url: str | None = None,
        prompt_audio: str | None = None,
        prompt_text: str | None = None,
        device: str = "auto",
    ) -> None:
        """Store the configuration, filling unset paths from the model root.

        Args:
            default_voice: Voice used when a request names none.
            sample_rate: Output sample rate of the produced chunks.
            chunk_ms: Maximum duration of each produced chunk.
            model: Model id; defaults to ``SWivid/F5-TTS/F5TTS_v1_Base``.
            model_dir: Weights directory; defaults to a folder under the local
                audio model root named after the model id.
            runtime_dir: F5-TTS checkout; defaults to ``runtime/F5-TTS`` under
                the local audio model root.
            ckpt_file: Checkpoint path; defaults to
                ``model_1250000.safetensors`` inside ``model_dir``.
            vocoder_local_path: Optional local vocoder directory.
            service_url: Sidecar endpoint; falls back to
                ``OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL`` and then the
                ``tts_local_f5_tts_service_url`` setting.
            prompt_audio: Default reference audio for voice cloning.
            prompt_text: Text accompanying the default reference audio.
            device: Device string; empty values become ``"auto"``.
        """
        self.default_voice = default_voice or "local-default"
        self.sample_rate = sample_rate
        self.chunk_ms = chunk_ms
        self.model = (model or "SWivid/F5-TTS/F5TTS_v1_Base").strip()
        self.model_dir = str(
            Path(model_dir or _local_audio_model_root() / self.model.replace("/", "__")).expanduser()
        )
        self.runtime_dir = str(
            Path(runtime_dir or _local_audio_model_root() / "runtime" / "F5-TTS").expanduser()
        )
        self.ckpt_file = str(
            Path(ckpt_file or Path(self.model_dir) / "model_1250000.safetensors").expanduser()
        )
        self.vocoder_local_path = str(Path(vocoder_local_path).expanduser()) if vocoder_local_path else ""
        self.service_url = (
            service_url
            or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL", "").strip()
            or _settings_value("tts_local_f5_tts_service_url", "")
        ).strip()
        self.prompt_audio = str(Path(prompt_audio).expanduser()) if prompt_audio else ""
        self.prompt_text = (prompt_text or "").strip()
        self.device = device or "auto"

    async def synthesize_stream(self, text: str, voice: str | None = None) -> AsyncIterator[AudioChunk]:
        """Yield audio chunks for *text*; blank text yields nothing.

        Raises:
            RuntimeError: If no sidecar service URL is configured.
        """
        if not text.strip():
            return
        if not self.service_url:
            raise RuntimeError("Local F5-TTS requires OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL. Run scripts/quickstart/start_local_f5_tts.sh first.")
        async for chunk in self._synthesize_via_service(text, voice=voice):
            yield chunk

    def _resolve_voice_prompt(self, voice: str | None) -> F5VoicePrompt | None:
        """Find the reference prompt for *voice*.

        A stored voice asset is looked up only for ids other than
        ``local-default`` that match ``[A-Za-z0-9_-]{3,80}``. If none is found
        the adapter-level ``prompt_audio`` is used; ``None`` means no reference
        audio is available.
        """
        voice_id = (voice or "").strip()
        if voice_id and voice_id != "local-default" and re.fullmatch(r"[A-Za-z0-9_-]{3,80}", voice_id):
            asset = resolve_voice_asset(
                voice_id,
                provider=LOCAL_F5_TTS_PROVIDER,
                sources=("clones", "system"),
                model_root=_local_audio_model_root(),
                require_prompt_text=False,
            )
            if asset is not None:
                text = asset.prompt_text.read_text(encoding="utf-8").strip() if asset.prompt_text else ""
                return F5VoicePrompt(prompt_audio=asset.prompt_audio, prompt_text=text)
        if self.prompt_audio:
            return F5VoicePrompt(prompt_audio=Path(self.prompt_audio), prompt_text=self.prompt_text)
        return None

    async def _synthesize_via_service(self, text: str, voice: str | None = None) -> AsyncIterator[AudioChunk]:
        """POST the request to the sidecar and yield the decoded audio chunks.

        Raw PCM responses are converted as the bytes arrive, WAV responses are
        read in full before decoding, and any other content type is handed to
        the shared streaming decoder.

        Raises:
            httpx.HTTPStatusError: If the sidecar answers with an error status.
                The response body is read first so the sidecar's error detail
                is available on ``exc.response``.
        """
        timeout = httpx.Timeout(connect=30.0, read=180.0, write=30.0, pool=30.0)
        voice_name = voice or self.default_voice
        payload: dict[str, Any] = {
            "text": text,
            "voice": voice_name,
            "model": self.model,
            "sample_rate": self.sample_rate,
        }
        prompt = self._resolve_voice_prompt(voice_name)
        if prompt is not None:
            payload["prompt_audio"] = str(prompt.prompt_audio)
            payload["prompt_text"] = prompt.prompt_text
        async with httpx.AsyncClient(timeout=timeout) as client:
            async with client.stream("POST", self.service_url, json=payload) as resp:
                if resp.is_error:
                    # A streamed body is not loaded automatically; read it so
                    # the raised error carries the sidecar's explanation
                    # instead of failing with ResponseNotRead on access.
                    await resp.aread()
                resp.raise_for_status()
                input_format = _audio_format_from_content_type(resp.headers.get("content-type"))
                if input_format == "pcm":
                    source_sr = _source_sample_rate_from_headers(resp.headers, self.sample_rate)
                    pending = b""
                    # NOTE(review): each network read is resampled and chunked
                    # independently, so chunk boundaries follow read boundaries
                    # and resampling does not interpolate across reads.
                    async for data in resp.aiter_bytes():
                        if not data:
                            continue
                        data = pending + data
                        if len(data) % 2:
                            # Hold back the odd byte: a 16-bit sample may be
                            # split across two reads.
                            pending = data[-1:]
                            data = data[:-1]
                        else:
                            pending = b""
                        if not data:
                            continue
                        pcm = np.frombuffer(data, dtype="<i2").astype(np.int16, copy=False)
                        pcm = _resample_linear(pcm, source_sr, self.sample_rate)
                        for chunk in _split_pcm_chunks(pcm, self.sample_rate, self.chunk_ms):
                            yield chunk
                    return
                if input_format == "wav":
                    pcm, source_sr = _read_wav_bytes_i16(await resp.aread())
                    pcm = _resample_linear(pcm, source_sr, self.sample_rate)
                    for chunk in _split_pcm_chunks(pcm, self.sample_rate, self.chunk_ms):
                        yield chunk
                    return
                from opentalking.providers.tts.edge.adapter import _stream_decode_audio_to_pcm_chunks

                async def _audio_iter() -> AsyncIterator[bytes]:
                    async for data in resp.aiter_bytes():
                        if data:
                            yield data

                async for chunk in _stream_decode_audio_to_pcm_chunks(
                    _audio_iter(), self.sample_rate, self.chunk_ms, input_format=input_format
                ):
                    yield chunk
|
||||
@@ -6,7 +6,7 @@ XIAOMI_MIMO_TTS_PROVIDERS = frozenset({"xiaomi_mimo", "xiaomi", "mimo"})
|
||||
QWEN_TTS_PROVIDERS = frozenset({"dashscope", "bailian", "qwen", "qwen_tts"})
|
||||
COSYVOICE_TTS_PROVIDERS = frozenset({"cosyvoice", "cosyvoice_http"})
|
||||
SAMBERT_TTS_PROVIDERS = frozenset({"sambert", "dashscope_sambert"})
|
||||
LOCAL_TTS_PROVIDERS = frozenset({"local_cosyvoice", "local_qwen3_tts", "local_indextts"})
|
||||
LOCAL_TTS_PROVIDERS = frozenset({"local_cosyvoice", "local_qwen3_tts", "local_indextts", "local_f5_tts"})
|
||||
OMNIRT_TTS_PROVIDERS = frozenset({"omnirt_indextts"})
|
||||
INDEXTTS_TTS_PROVIDERS = frozenset({"indextts"})
|
||||
BAILIAN_TTS_PROVIDERS = (
|
||||
|
||||
@@ -11,6 +11,7 @@ INDEXTTS_PROVIDER = "indextts"
|
||||
INDEXTTS_LEGACY_PROVIDERS = {"local_indextts", "omnirt_indextts"}
|
||||
INDEXTTS_PROVIDERS = {INDEXTTS_PROVIDER, *INDEXTTS_LEGACY_PROVIDERS}
|
||||
LOCAL_COSYVOICE_PROVIDER = "local_cosyvoice"
|
||||
LOCAL_F5_TTS_PROVIDER = "local_f5_tts"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -66,6 +67,8 @@ def _provider_aliases(provider: str) -> set[str]:
|
||||
normalized = provider.strip().lower()
|
||||
if normalized in INDEXTTS_PROVIDERS:
|
||||
return {INDEXTTS_PROVIDER, *INDEXTTS_LEGACY_PROVIDERS}
|
||||
if normalized == LOCAL_F5_TTS_PROVIDER:
|
||||
return {LOCAL_F5_TTS_PROVIDER}
|
||||
return {normalized}
|
||||
|
||||
|
||||
@@ -83,6 +86,8 @@ def voice_applies_to_provider(meta: dict[str, Any], provider: str, *, bundled_sy
|
||||
return True
|
||||
if bundled_system and normalized == LOCAL_COSYVOICE_PROVIDER:
|
||||
return True
|
||||
if bundled_system and normalized == LOCAL_F5_TTS_PROVIDER:
|
||||
return True
|
||||
if any(_truthy_meta_flag(meta.get(key)) for key in ("universal", "compatible", "zero_shot_compatible")):
|
||||
return True
|
||||
aliases = _provider_aliases(normalized)
|
||||
|
||||
@@ -18,6 +18,8 @@ DEFAULT_REUSE_ROOTS = (
|
||||
MODELS: dict[str, tuple[str, str]] = {
|
||||
"sensevoice-small": ("modelscope", "iic/SenseVoiceSmall"),
|
||||
"fun-cosyvoice3-0.5b-2512": ("modelscope", "FunAudioLLM/Fun-CosyVoice3-0.5B-2512"),
|
||||
"f5-tts-v1-base": ("hf", "SWivid/F5-TTS"),
|
||||
"f5-tts-vocos": ("hf", "charactr/vocos-mel-24khz"),
|
||||
"indextts2": ("modelscope", "IndexTeam/IndexTTS-2"),
|
||||
"indextts2-w2v-bert": ("hf", "facebook/w2v-bert-2.0"),
|
||||
"indextts2-maskgct": ("hf", "amphion/MaskGCT"),
|
||||
@@ -26,6 +28,12 @@ MODELS: dict[str, tuple[str, str]] = {
|
||||
}
|
||||
|
||||
HF_ALLOW_PATTERNS: dict[str, list[str]] = {
|
||||
"f5-tts-v1-base": [
|
||||
"README.md",
|
||||
"F5TTS_v1_Base/model_1250000.safetensors",
|
||||
"F5TTS_v1_Base/vocab.txt",
|
||||
],
|
||||
"f5-tts-vocos": ["config.yaml", "pytorch_model.bin"],
|
||||
# IndexTTS2 only needs the feature extractor, model weights, and conformer shim.
|
||||
"indextts2-w2v-bert": [
|
||||
"README.md",
|
||||
@@ -65,6 +73,8 @@ HF_ALLOW_PATTERNS: dict[str, list[str]] = {
|
||||
|
||||
MODEL_HINTS: dict[str, tuple[str, ...]] = {
|
||||
"sensevoice-small": ("iic__SenseVoiceSmall", "sensevoice", "SenseVoiceSmall"),
|
||||
"f5-tts-v1-base": ("SWivid__F5-TTS__F5TTS_v1_Base", "F5TTS_v1_Base", "SWivid__F5-TTS"),
|
||||
"f5-tts-vocos": ("charactr__vocos-mel-24khz", "vocos-mel-24khz"),
|
||||
"fun-cosyvoice3-0.5b-2512": (
|
||||
"FunAudioLLM__Fun-CosyVoice3-0.5B-2512",
|
||||
"Fun-CosyVoice3-0.5B-2512",
|
||||
@@ -79,6 +89,8 @@ MODEL_HINTS: dict[str, tuple[str, ...]] = {
|
||||
|
||||
MODEL_REQUIRED_FILES: dict[str, tuple[str, ...]] = {
|
||||
"sensevoice-small": ("model.pt", "config.yaml", "configuration.json"),
|
||||
"f5-tts-v1-base": ("model_1250000.safetensors",),
|
||||
"f5-tts-vocos": ("config.yaml", "pytorch_model.bin"),
|
||||
"fun-cosyvoice3-0.5b-2512": ("cosyvoice3.yaml", "flow.pt", "hift.pt", "llm.pt"),
|
||||
"indextts2": ("config.yaml", "model.pt"),
|
||||
"indextts2-w2v-bert": ("model.safetensors", "conformer_shaw.pt"),
|
||||
@@ -96,7 +108,9 @@ def local_audio_model_ids() -> tuple[str, ...]:
|
||||
return tuple(model_id for _, model_id in MODELS.values())
|
||||
|
||||
|
||||
def _target(root: Path, model_id: str) -> Path:
|
||||
def _target(root: Path, model_id: str, *, model_key: str | None = None) -> Path:
|
||||
if model_key == "f5-tts-v1-base":
|
||||
return root / "SWivid__F5-TTS__F5TTS_v1_Base"
|
||||
return root / model_id.replace("/", "__")
|
||||
|
||||
|
||||
@@ -167,12 +181,26 @@ def _download_hf(model_id: str, target: Path, *, model_key: str) -> None:
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
endpoint = os.environ.get("HF_ENDPOINT", "").strip()
|
||||
kwargs = {"repo_id": model_id, "local_dir": str(target)}
|
||||
download_dir = target.parent / "SWivid__F5-TTS" if model_key == "f5-tts-v1-base" else target
|
||||
kwargs = {"repo_id": model_id, "local_dir": str(download_dir)}
|
||||
if endpoint:
|
||||
kwargs["endpoint"] = endpoint
|
||||
if patterns := HF_ALLOW_PATTERNS.get(model_key):
|
||||
kwargs["allow_patterns"] = patterns
|
||||
snapshot_download(**kwargs)
|
||||
if model_key == "f5-tts-v1-base":
|
||||
nested = download_dir / "F5TTS_v1_Base"
|
||||
if nested.exists() and not _is_model_ready(target, model_key=model_key):
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
if target.exists() and target.is_dir() and not any(target.iterdir()):
|
||||
target.rmdir()
|
||||
if not target.exists():
|
||||
try:
|
||||
target.symlink_to(nested, target_is_directory=True)
|
||||
except Exception:
|
||||
shutil.copytree(nested, target)
|
||||
else:
|
||||
shutil.copytree(nested, target, dirs_exist_ok=True)
|
||||
|
||||
|
||||
def _git_lfs_pull_if_needed(target: Path) -> None:
|
||||
@@ -207,7 +235,7 @@ def main() -> None:
|
||||
failures: list[tuple[str, str]] = []
|
||||
for key in selected:
|
||||
source, model_id = MODELS[key]
|
||||
target = _target(root, model_id)
|
||||
target = _target(root, model_id, model_key=key)
|
||||
print(f"[{key}] {source}:{model_id} -> {target}", flush=True)
|
||||
target.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
|
||||
265
scripts/local_f5_tts_service.py
Normal file
265
scripts/local_f5_tts_service.py
Normal file
@@ -0,0 +1,265 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
DEFAULT_MODEL = "SWivid/F5-TTS/F5TTS_v1_Base"
|
||||
DEFAULT_SERVICE_SAMPLE_RATE = 24000
|
||||
|
||||
|
||||
def _local_audio_model_root() -> Path:
|
||||
return Path(os.environ.get("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "./models/local-audio")).expanduser().resolve()
|
||||
|
||||
|
||||
def _default_model_dir(root: Path) -> Path:
|
||||
return root / "SWivid__F5-TTS__F5TTS_v1_Base"
|
||||
|
||||
|
||||
def _default_runtime_dir(root: Path) -> Path:
|
||||
return root / "runtime" / "F5-TTS"
|
||||
|
||||
|
||||
def _env_bool(name: str, default: bool) -> bool:
|
||||
raw = os.environ.get(name, "").strip().lower()
|
||||
if not raw:
|
||||
return default
|
||||
return raw in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _select_device(value: str) -> str | None:
|
||||
value = value.strip()
|
||||
if value and value != "auto":
|
||||
return value
|
||||
try:
|
||||
import torch
|
||||
|
||||
return "cuda" if torch.cuda.is_available() else "cpu"
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _pcm_bytes(wav: Any) -> bytes:
|
||||
arr = np.asarray(wav)
|
||||
if arr.ndim > 1:
|
||||
arr = arr.mean(axis=1)
|
||||
if np.issubdtype(arr.dtype, np.floating):
|
||||
arr = np.clip(arr, -1.0, 1.0)
|
||||
arr = np.round(arr * 32767.0).astype("<i2")
|
||||
else:
|
||||
arr = np.clip(arr, -32768, 32767).astype("<i2")
|
||||
return arr.reshape(-1).tobytes()
|
||||
|
||||
|
||||
def _soundfile_torchaudio_load(path: str | Path, *args: Any, **kwargs: Any):
    """Replacement for ``torchaudio.load`` that decodes with ``soundfile``.

    Honours ``frame_offset``, ``num_frames``, ``normalize`` and
    ``channels_first`` whether they are passed positionally (in that order, as
    in ``torchaudio.load``) or by keyword. Other options are accepted and
    ignored.

    Returns:
        ``(tensor, sample_rate)``. The tensor is float32 shaped
        ``(channels, frames)`` by default, int16 when ``normalize`` is false,
        and ``(frames, channels)`` when ``channels_first`` is false.
    """
    import soundfile as sf
    import torch

    # Map positional arguments onto torchaudio.load's leading optional
    # parameters so they are not silently dropped; keywords take precedence.
    options: dict[str, Any] = dict(
        zip(("frame_offset", "num_frames", "normalize", "channels_first"), args)
    )
    options.update(kwargs)

    audio, sr = sf.read(str(path), dtype="float32", always_2d=True)
    # soundfile returns (frames, channels); transpose to channels-first.
    tensor = torch.from_numpy(audio.T.copy())
    normalize = options.get("normalize", True)
    if not normalize:
        tensor = (tensor.clamp(-1.0, 1.0) * 32767.0).to(torch.int16)
    frame_offset = int(options.get("frame_offset", 0) or 0)
    # None and 0 are both treated as "read everything", like -1.
    num_frames = int(options.get("num_frames", -1) or -1)
    if frame_offset > 0 or num_frames >= 0:
        end = None if num_frames < 0 else frame_offset + num_frames
        tensor = tensor[:, frame_offset:end]
    channels_first = options.get("channels_first", True)
    if channels_first is not None and not channels_first:
        tensor = tensor.transpose(0, 1)
    return tensor, int(sr)
|
||||
|
||||
|
||||
def _patch_torchaudio_load() -> None:
    """Replace ``torchaudio.load`` with the soundfile-backed loader, once.

    Does nothing when torchaudio cannot be imported. A marker attribute on the
    torchaudio module records that the patch has already been applied.
    """
    try:
        import torchaudio
    except Exception:
        return
    already_patched = getattr(torchaudio, "_opentalking_soundfile_load_patched", False)
    if not already_patched:
        torchaudio.load = _soundfile_torchaudio_load
        torchaudio._opentalking_soundfile_load_patched = True
|
||||
|
||||
|
||||
class SynthesizeRequest(BaseModel):
    """Pydantic model for one synthesis request to the F5-TTS sidecar.

    Only ``text`` is required; every other field defaults to ``None``.
    """

    # Text to synthesize.
    text: str
    # Voice identifier supplied by the caller.
    voice: str | None = None
    # Model identifier supplied by the caller.
    model: str | None = None
    # Sample rate requested by the caller (presumably the desired output rate;
    # verify against the endpoint handler).
    sample_rate: int | None = None
    # Path of the reference audio used for voice cloning.
    prompt_audio: str | None = None
    # Text accompanying the reference audio.
    prompt_text: str | None = None
    # Optional speed override (presumably forwarded to the synthesis call).
    speed: float | None = None
    # Optional ``nfe_step`` override (presumably forwarded to the synthesis call).
    nfe_step: int | None = None
|
||||
|
||||
|
||||
class F5TTSService:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
model: str | None = None,
|
||||
model_dir: str | Path | None = None,
|
||||
runtime_dir: str | Path | None = None,
|
||||
ckpt_file: str | Path | None = None,
|
||||
vocoder_local_path: str | Path | None = None,
|
||||
prompt_audio: str | Path | None = None,
|
||||
prompt_text: str | None = None,
|
||||
device: str | None = None,
|
||||
preload: bool = True,
|
||||
) -> None:
|
||||
root = _local_audio_model_root()
|
||||
self.model = (model or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_MODEL") or DEFAULT_MODEL).strip()
|
||||
self.model_dir = Path(
|
||||
model_dir or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_MODEL_DIR") or _default_model_dir(root)
|
||||
).expanduser()
|
||||
self.runtime_dir = Path(
|
||||
runtime_dir or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR") or _default_runtime_dir(root)
|
||||
).expanduser()
|
||||
self.ckpt_file = Path(
|
||||
ckpt_file
|
||||
or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_CKPT_FILE")
|
||||
or self.model_dir / "model_1250000.safetensors"
|
||||
).expanduser()
|
||||
vocoder = vocoder_local_path or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_VOCODER_LOCAL_PATH") or ""
|
||||
self.vocoder_local_path = Path(vocoder).expanduser() if str(vocoder).strip() else None
|
||||
prompt = prompt_audio or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_AUDIO") or ""
|
||||
self.prompt_audio = Path(prompt).expanduser() if str(prompt).strip() else None
|
||||
self.prompt_text = (prompt_text or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_TEXT") or "").strip()
|
||||
self.device = device or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_DEVICE") or "auto"
|
||||
self._engine: Any | None = None
|
||||
self._lock = threading.Lock()
|
||||
if preload:
|
||||
self.engine()
|
||||
|
||||
def engine(self) -> Any:
    """Return the F5-TTS engine, constructing it on first use.

    The whole lookup/construct sequence runs while holding ``self._lock``.
    Once built, the engine is stored on ``self._engine`` and returned by
    every later call.

    Raises:
        RuntimeError: If the configured checkpoint file does not exist.
    """
    with self._lock:
        loaded = self._engine
        if loaded is not None:
            return loaded

        checkpoint = self.ckpt_file
        if not checkpoint.exists():
            raise RuntimeError(f"Missing F5-TTS checkpoint: {checkpoint}")

        # Make the F5-TTS runtime checkout importable. Each existing directory
        # is pushed to the front of sys.path, so the one visited last
        # (runtime_dir itself) ends up ahead of runtime_dir/src.
        for import_root in (self.runtime_dir / "src", self.runtime_dir):
            entry = str(import_root)
            if import_root.exists() and entry not in sys.path:
                sys.path.insert(0, entry)

        _patch_torchaudio_load()
        # Imported lazily: f5_tts is only resolvable after the sys.path setup.
        from f5_tts.api import F5TTS

        options: dict[str, Any] = {"model": "F5TTS_v1_Base", "ckpt_file": str(checkpoint)}
        vocoder = self.vocoder_local_path
        if vocoder is not None:
            options["vocoder_local_path"] = str(vocoder)
        device = _select_device(self.device)
        if device:
            # Only forward a device when the selector produced a truthy value.
            options["device"] = device

        loaded = F5TTS(**options)
        self._engine = loaded
        return loaded
|
||||
|
||||
def health(self) -> dict[str, Any]:
    """Describe the service configuration and whether the engine is loaded.

    Returns:
        A dict with a constant ``ok=True`` flag, the provider id, the
        configured model/paths rendered as strings, whether the checkpoint
        file currently exists, whether the engine has been constructed, and
        the configured device string.
    """
    checkpoint = self.ckpt_file
    report: dict[str, Any] = dict(
        ok=True,
        provider="local_f5_tts",
        model=self.model,
        model_dir=str(self.model_dir),
        runtime_dir=str(self.runtime_dir),
        ckpt_file=str(checkpoint),
        ckpt_exists=checkpoint.exists(),
        loaded=self._engine is not None,
        device=self.device,
    )
    return report
|
||||
|
||||
def prewarm(self, text: str = "你好。") -> None:
    """Warm the service up before the first real request.

    When a default prompt audio file is configured and present on disk, one
    full synthesis of ``text`` is run and its output discarded. Otherwise only
    the engine is loaded.
    """
    reference = self.prompt_audio
    if reference is not None and reference.exists():
        # Drain the generator so the synthesis actually executes.
        for _chunk in self.synthesize(text=text, prompt_audio=reference, prompt_text=self.prompt_text):
            pass
        return
    self.engine()
|
||||
|
||||
def synthesize(
|
||||
self,
|
||||
*,
|
||||
text: str,
|
||||
prompt_audio: str | Path | None = None,
|
||||
prompt_text: str | None = None,
|
||||
speed: float | None = None,
|
||||
nfe_step: int | None = None,
|
||||
) -> Iterator[bytes]:
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return
|
||||
ref_audio = Path(prompt_audio).expanduser() if prompt_audio else self.prompt_audio
|
||||
if ref_audio is None or not ref_audio.exists():
|
||||
raise RuntimeError("Local F5-TTS requires prompt_audio for voice cloning.")
|
||||
ref_text = (prompt_text if prompt_text is not None else self.prompt_text).strip()
|
||||
infer_kwargs: dict[str, Any] = {
|
||||
"ref_file": str(ref_audio),
|
||||
"ref_text": ref_text,
|
||||
"gen_text": text,
|
||||
"show_info": lambda *_args, **_kwargs: None,
|
||||
"progress": None,
|
||||
}
|
||||
if speed is not None:
|
||||
infer_kwargs["speed"] = float(speed)
|
||||
if nfe_step is not None:
|
||||
infer_kwargs["nfe_step"] = int(nfe_step)
|
||||
wav, _sr, _spec = self.engine().infer(**infer_kwargs)
|
||||
yield _pcm_bytes(wav)
|
||||
|
||||
|
||||
def create_app(service: F5TTSService | None = None) -> FastAPI:
    """Build the FastAPI application for the local F5-TTS sidecar.

    Args:
        service: Service instance to expose. When omitted, one is created and
            preloaded according to ``OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD``
            (default on).

    Returns:
        An app with ``GET /health`` and ``POST /synthesize`` routes.
    """
    service = service or F5TTSService(preload=_env_bool("OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD", True))
    app = FastAPI(title="OpenTalking Local F5-TTS Service")

    @app.get("/health")
    def health() -> dict[str, Any]:
        return service.health()

    @app.post("/synthesize")
    def synthesize(request: SynthesizeRequest) -> StreamingResponse:
        t0 = time.perf_counter()
        try:
            # The generator is drained up front so any synthesis failure
            # surfaces here as an HTTP 500 rather than mid-stream.
            chunks = [
                *service.synthesize(
                    text=request.text,
                    prompt_audio=request.prompt_audio,
                    prompt_text=request.prompt_text,
                    speed=request.speed,
                    nfe_step=request.nfe_step,
                )
            ]
            # Floor the reported duration at one millisecond.
            elapsed = max(time.perf_counter() - t0, 0.001)
        except Exception as exc:
            raise HTTPException(status_code=500, detail=str(exc)) from exc

        rate = DEFAULT_SERVICE_SAMPLE_RATE
        return StreamingResponse(
            iter(chunks),
            media_type=f"audio/L16; rate={rate}; channels=1",
            headers={
                "X-Audio-Sample-Rate": str(rate),
                "X-OpenTalking-Elapsed": f"{elapsed:.3f}",
            },
        )

    return app
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI arguments and serve the local F5-TTS sidecar with uvicorn.

    ``--host`` and ``--port`` default to the
    ``OPENTALKING_TTS_LOCAL_F5_TTS_HOST`` / ``..._PORT`` environment variables
    (falling back to ``127.0.0.1`` and ``19095``). Preloading is disabled by
    either ``--no-preload`` or ``OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD``.
    """
    env = os.environ
    default_host = env.get("OPENTALKING_TTS_LOCAL_F5_TTS_HOST", "127.0.0.1")
    default_port = int(env.get("OPENTALKING_TTS_LOCAL_F5_TTS_PORT", "19095"))

    parser = argparse.ArgumentParser(description="Run the OpenTalking local F5-TTS sidecar.")
    parser.add_argument("--host", default=default_host)
    parser.add_argument("--port", type=int, default=default_port)
    parser.add_argument("--no-preload", action="store_true")
    args = parser.parse_args()

    # Imported here so uvicorn is only needed when running as a server.
    import uvicorn

    # The CLI flag wins; the environment switch is consulted only without it.
    if args.no_preload:
        preload = False
    else:
        preload = _env_bool("OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD", True)

    app = create_app(F5TTSService(preload=preload))
    uvicorn.run(app, host=args.host, port=args.port)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
191
scripts/quickstart/start_local_f5_tts.sh
Executable file
191
scripts/quickstart/start_local_f5_tts.sh
Executable file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env bash
# Start the OpenTalking local F5-TTS sidecar in the background and wait until
# its /health endpoint answers.
set -euo pipefail

# Directory containing this script.
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# Repository root: two levels above scripts/quickstart.
repo_root="$(cd -- "$script_dir/../.." && pwd)"
# Parent of the repository; used as the default DIGITAL_HUMAN_HOME.
default_home="$(cd -- "$repo_root/.." && pwd)"
# shellcheck disable=SC1091
source "$script_dir/_helpers.sh"
|
||||
|
||||
# Print command-line help. The defaults listed here mirror the resolution
# order implemented further down in this script.
usage() {
  cat <<'USAGE'
Usage:
  bash scripts/quickstart/start_local_f5_tts.sh [--host HOST] [--port PORT] [--env FILE]

Options:
  --host HOST   Bind host for the local F5-TTS sidecar. Defaults to
                OPENTALKING_TTS_LOCAL_F5_TTS_HOST or 127.0.0.1.
  --port PORT   Bind port. Defaults to OPENTALKING_TTS_LOCAL_F5_TTS_PORT, then
                the port in OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL, then 19095.
  --env FILE    Source a quickstart env file before starting the sidecar.
  --help, -h    Show this help.
USAGE
}
|
||||
|
||||
# --- Argument parsing --------------------------------------------------------
# host/port stay empty unless given on the command line; their defaults are
# resolved only after the env file has been sourced, so values defined in that
# file are honoured for both settings.
env_file="${OPENTALKING_QUICKSTART_ENV:-$script_dir/env}"
host=""
port=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --host)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --host" >&2
        exit 2
      fi
      host="$2"
      shift 2
      ;;
    --port)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --port" >&2
        exit 2
      fi
      port="$2"
      shift 2
      ;;
    --env)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --env" >&2
        exit 2
      fi
      env_file="$2"
      export OPENTALKING_QUICKSTART_ENV="$env_file"
      shift 2
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# --- Environment ---------------------------------------------------------------
quickstart_source_env "$env_file"

export DIGITAL_HUMAN_HOME="${DIGITAL_HUMAN_HOME:-$default_home}"
run_dir="$DIGITAL_HUMAN_HOME/run"
log_dir="$DIGITAL_HUMAN_HOME/logs"
mkdir -p "$run_dir" "$log_dir"

# --- Bind address ----------------------------------------------------------------
# Host precedence: --host, then OPENTALKING_TTS_LOCAL_F5_TTS_HOST, then loopback.
if [[ -z "$host" ]]; then
  host="${OPENTALKING_TTS_LOCAL_F5_TTS_HOST:-127.0.0.1}"
fi

# Port precedence: --port, then OPENTALKING_TTS_LOCAL_F5_TTS_PORT, then the port
# embedded in OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL, then 19095.
if [[ -z "$port" ]]; then
  port="${OPENTALKING_TTS_LOCAL_F5_TTS_PORT:-}"
fi
if [[ -z "$port" && -n "${OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL:-}" ]]; then
  port="$(
    python3 - <<'PY'
import os
from urllib.parse import urlparse

url = os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL", "")
parsed = urlparse(url)
print(parsed.port or "")
PY
  )"
fi
port="${port:-19095}"
|
||||
|
||||
# Print the path of the Python interpreter used to run the F5-TTS sidecar.
#
# Resolution order:
#   1. OPENTALKING_F5_TTS_PYTHON, if set. It is rejected when it lives inside
#      the OpenTalking main venv ($repo_root/.venv) or is not executable.
#   2. The first venv directory that contains an executable bin/python among:
#      OPENTALKING_F5_TTS_VENV_DIR, $repo_root/.venv-f5-tts,
#      $DIGITAL_HUMAN_HOME/.venv-f5-tts,
#      $OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts (when that
#      variable is set), and a legacy fixed location kept for compatibility.
#
# Returns 1 with a diagnostic on stderr when nothing usable is found.
resolve_f5_python() {
  local explicit_python="${OPENTALKING_F5_TTS_PYTHON:-}"
  if [[ -n "$explicit_python" ]]; then
    case "$explicit_python" in
      "$repo_root/.venv/"*)
        echo "Refusing to start local F5-TTS from the OpenTalking main venv: $explicit_python" >&2
        echo "Use OPENTALKING_F5_TTS_VENV_DIR or OPENTALKING_F5_TTS_PYTHON for the sidecar venv." >&2
        return 1
        ;;
    esac
    if [[ -x "$explicit_python" ]]; then
      printf '%s\n' "$explicit_python"
      return 0
    fi
    echo "OPENTALKING_F5_TTS_PYTHON is not executable: $explicit_python" >&2
    return 1
  fi

  local audio_root="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-}"
  local candidate_dir=""
  for candidate_dir in \
    "${OPENTALKING_F5_TTS_VENV_DIR:-}" \
    "$repo_root/.venv-f5-tts" \
    "$DIGITAL_HUMAN_HOME/.venv-f5-tts" \
    "${audio_root:+$audio_root/runtime/.venv-f5-tts}" \
    "/home/zhongyi/models/local-audio/runtime/.venv-f5-tts"
  do
    # Unset/empty candidates expand to "" and are skipped. The last entry is
    # a legacy machine-specific fallback retained for backward compatibility.
    [[ -n "$candidate_dir" ]] || continue
    if [[ -x "$candidate_dir/bin/python" ]]; then
      printf '%s\n' "$candidate_dir/bin/python"
      return 0
    fi
  done

  # Point the install hint at the configured F5-TTS checkout instead of a
  # fixed home directory; fall back to a placeholder when nothing is set.
  local runtime_src="${OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR:-}"
  if [[ -z "$runtime_src" ]]; then
    runtime_src="${audio_root:-<OPENTALKING_LOCAL_AUDIO_MODEL_ROOT>}/runtime/F5-TTS"
  fi

  echo "Missing F5-TTS sidecar venv." >&2
  echo "Create it first: python3 -m venv $repo_root/.venv-f5-tts && $repo_root/.venv-f5-tts/bin/pip install -e $runtime_src fastapi 'uvicorn[standard]' soundfile" >&2
  return 1
}
|
||||
|
||||
# --- Launch ----------------------------------------------------------------------
f5_python="$(resolve_f5_python)"

pid_file="$run_dir/local-f5-tts-$port.pid"
log_file="$log_dir/local-f5-tts-$port.log"

# Address used for readiness probes. A wildcard bind is reachable via
# loopback, but a sidecar bound to one specific interface is not, so probe the
# address it was actually asked to listen on. IPv6 literals need brackets in a
# URL.
case "$host" in
  ""|0.0.0.0) probe_host="127.0.0.1" ;;
  "::") probe_host="[::1]" ;;
  \[*\]) probe_host="$host" ;;
  *:*) probe_host="[$host]" ;;
  *) probe_host="$host" ;;
esac
health_url="http://$probe_host:$port/health"

# Reuse a healthy instance recorded in the pid file; otherwise discard the
# pid file and start fresh.
if [[ -f "$pid_file" ]]; then
  old_pid="$(cat "$pid_file" 2>/dev/null || true)"
  if [[ -n "$old_pid" ]] && kill -0 "$old_pid" >/dev/null 2>&1; then
    if curl --max-time 2 -fsS "$health_url" >/dev/null 2>&1; then
      echo "Local F5-TTS is already running: pid=$old_pid port=$port"
      echo "Log: $log_file"
      exit 0
    fi
    echo "Stale Local F5-TTS pid file: pid=$old_pid port=$port" >&2
  fi
  rm -f "$pid_file"
fi

if quickstart_port_in_use "$port"; then
  echo "Local F5-TTS port $port is already in use." >&2
  quickstart_describe_port "$port" >&2 || true
  exit 1
fi

echo "Starting Local F5-TTS"
echo "  repo: $repo_root"
echo "  python: $f5_python"
echo "  host: $host"
echo "  port: $port"
echo "  log: $log_file"

# Run in a subshell so the cd and exports do not leak into this script.
(
  cd "$repo_root"
  export PYTHONPATH="$repo_root${PYTHONPATH:+:$PYTHONPATH}"
  export OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD="${OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD:-1}"
  if declare -F quickstart_detach >/dev/null 2>&1; then
    # Preferred path: the shared helper detaches the process; its stdout is
    # captured as the pid file contents.
    quickstart_detach "$log_file" "$f5_python" scripts/local_f5_tts_service.py --host "$host" --port "$port" >"$pid_file"
  else
    setsid "$f5_python" scripts/local_f5_tts_service.py --host "$host" --port "$port" >"$log_file" 2>&1 < /dev/null &
    echo "$!" >"$pid_file"
  fi
)

pid="$(cat "$pid_file" 2>/dev/null || true)"
if [[ -z "$pid" ]]; then
  echo "Failed to capture Local F5-TTS pid." >&2
  exit 1
fi

# --- Readiness wait --------------------------------------------------------------
# Poll once per second for up to 180 attempts, failing fast if the process dies.
for _ in {1..180}; do
  if ! kill -0 "$pid" >/dev/null 2>&1; then
    echo "Local F5-TTS exited during startup. Last log lines:" >&2
    tail -80 "$log_file" >&2 || true
    rm -f "$pid_file"
    exit 1
  fi
  if curl --max-time 2 -fsS "$health_url" >/dev/null 2>&1; then
    echo "Local F5-TTS is up: http://$probe_host:$port"
    exit 0
  fi
  sleep 1
done

echo "Local F5-TTS did not become ready in 180s. Last log lines:" >&2
tail -80 "$log_file" >&2 || true
exit 1
|
||||
@@ -15,19 +15,28 @@ def test_frontend_lists_local_tts_models_and_labels():
|
||||
|
||||
assert "local_cosyvoice" in constants
|
||||
assert "indextts" in constants
|
||||
assert "local_f5_tts" in constants
|
||||
assert "Local CosyVoice" in settings
|
||||
assert "IndexTTS" in settings
|
||||
assert "Local IndexTTS" in settings
|
||||
assert "Local F5-TTS" in settings
|
||||
assert "OmniRT IndexTTS" not in settings
|
||||
assert "Local IndexTTS" in workspace
|
||||
assert "Local F5-TTS" in workspace
|
||||
assert "OmniRT IndexTTS" not in workspace
|
||||
assert "Local IndexTTS" in clone
|
||||
assert "Local F5-TTS" in clone
|
||||
assert "OmniRT IndexTTS" not in clone
|
||||
assert "本地模型" in constants
|
||||
assert "local_cosyvoice" in app
|
||||
assert "indextts" in app
|
||||
assert "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" in constants
|
||||
assert "IndexTeam/IndexTTS-2" in constants
|
||||
assert "SWivid/F5-TTS/F5TTS_v1_Base" in constants
|
||||
assert "LOCAL_F5_TTS_MODEL_OPTIONS" in app
|
||||
assert "local_f5_tts" in app[app.index("function normalizeTtsProvider"):app.index("if (normalized === \"local_indextts\"")]
|
||||
assert "if (p === \"local_f5_tts\") return \"local_f5_tts\"" in app
|
||||
assert 'ttsProvider === "local_f5_tts"' in app[app.index("const sharedSystemPrompt"):app.index("targetModel: sharedSystemPrompt")]
|
||||
assert "iic/CosyVoice-300M" not in constants
|
||||
assert "local_qwen3_tts" not in settings
|
||||
|
||||
@@ -45,6 +54,7 @@ def test_single_model_tts_provider_opens_voice_picker_first():
|
||||
assert "const providerOptions" in settings
|
||||
assert "hasChildren: true," in settings[settings.index("const providerOptions"):settings.index("const selectedProvider")]
|
||||
assert "hasChildren: p !== ttsProvider" not in settings
|
||||
assert "provider === \"local_f5_tts\"" in settings[settings.index("providerHasSingleModel"):settings.index("handleProviderSelect")]
|
||||
assert settings.index("const qwenModelColumnOptions") < settings.index("const providerOptions")
|
||||
|
||||
|
||||
|
||||
@@ -2,10 +2,12 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
import queue
|
||||
import wave
|
||||
import importlib
|
||||
import sys
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
@@ -33,6 +35,7 @@ def _settings(**overrides):
|
||||
("local_cosyvoice", "LocalCosyVoiceTTSAdapter"),
|
||||
("local_qwen3_tts", "LocalQwen3TTSAdapter"),
|
||||
("local_indextts", "LocalIndexTTSAdapter"),
|
||||
("local_f5_tts", "LocalF5TTSAdapter"),
|
||||
],
|
||||
)
|
||||
def test_local_tts_providers_are_supported(provider: str, expected_cls: str, monkeypatch):
|
||||
@@ -53,6 +56,44 @@ def test_local_tts_providers_are_supported(provider: str, expected_cls: str, mon
|
||||
assert adapter.model == "test-model"
|
||||
|
||||
|
||||
def test_local_f5_tts_status_uses_local_model_root(monkeypatch):
    """Status for local_f5_tts derives every path from the local audio model root."""
    from opentalking.providers.tts.factory import tts_provider_config

    root = "/tmp/opentalking-local-audio"
    service_url = "http://127.0.0.1:19095/synthesize"
    model_dir = f"{root}/SWivid__F5-TTS__F5TTS_v1_Base"

    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", root)
    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL", service_url)
    # Make every settings lookup fall through to its default.
    monkeypatch.setattr(
        "opentalking.providers.tts.factory._settings_value",
        lambda _name, default="": default,
    )
    # Clear per-provider overrides so only the model root drives the result.
    overrides = (
        "MODEL",
        "MODEL_DIR",
        "RUNTIME_DIR",
        "CKPT_FILE",
        "VOCODER_LOCAL_PATH",
        "PROMPT_AUDIO",
        "DEVICE",
    )
    for suffix in overrides:
        monkeypatch.delenv(f"OPENTALKING_TTS_LOCAL_F5_TTS_{suffix}", raising=False)

    expected = {
        "provider": "local_f5_tts",
        "model": "SWivid/F5-TTS/F5TTS_v1_Base",
        "model_dir": model_dir,
        "voice": "local-default",
        "device": "auto",
        "key_set": False,
        "service_url": service_url,
        "service_url_set": True,
        "runtime_dir": f"{root}/runtime/F5-TTS",
        "ckpt_file": f"{model_dir}/model_1250000.safetensors",
        "vocoder_local_path": "",
        "prompt_audio_set": False,
    }

    assert tts_provider_config("local_f5_tts") == expected
|
||||
|
||||
|
||||
def test_local_indextts_status_uses_local_model_root(monkeypatch):
|
||||
from opentalking.providers.tts.factory import tts_provider_config
|
||||
|
||||
@@ -1974,3 +2015,143 @@ def test_local_cosyvoice_service_prewarm_loads_model_and_runs_short_synthesis(mo
|
||||
service.prewarm(text="你好")
|
||||
|
||||
assert calls == ["model", "synth:你好:True"]
|
||||
|
||||
|
||||
def test_local_f5_tts_streams_pcm_from_service(monkeypatch):
    """The adapter POSTs to the sidecar and re-chunks its PCM at the requested rate."""
    from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter

    service_url = "http://127.0.0.1:19095/synthesize"
    # Four little-endian int16 samples returned by the stubbed sidecar.
    pcm_payload = np.array([0, 1200, -1200, 0], dtype="<i2").tobytes()

    class StubHeaders(dict):
        # Lower-cases the lookup key before delegating to dict.get.
        def get(self, key, default=None):
            return super().get(key.lower(), default)

    class StubResponse:
        headers = StubHeaders(
            {
                "content-type": "audio/L16; rate=24000; channels=1",
                "x-audio-sample-rate": "24000",
            }
        )

        def raise_for_status(self):
            return None

        async def aiter_bytes(self):
            yield pcm_payload

    class StubStream:
        async def __aenter__(self):
            return StubResponse()

        async def __aexit__(self, exc_type, exc, tb):
            return False

    class StubClient:
        def __init__(self, *args, **kwargs):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            return False

        def stream(self, method, url, json):
            # Verify the request the adapter issues to the sidecar.
            assert method == "POST"
            assert url == "http://127.0.0.1:19095/synthesize"
            assert json["text"] == "你好"
            assert json["sample_rate"] == 16000
            return StubStream()

    monkeypatch.setattr("opentalking.providers.tts.local_f5_tts.adapter.httpx.AsyncClient", StubClient)
    adapter = LocalF5TTSAdapter(service_url=service_url, sample_rate=16000, chunk_ms=20.0)

    async def collect():
        received = []
        async for piece in adapter.synthesize_stream("你好", voice="local-default"):
            received.append(piece)
        return received

    chunks = asyncio.run(collect())

    assert chunks
    assert all(piece.sample_rate == 16000 for piece in chunks)
    total_samples = sum(int(piece.data.size) for piece in chunks)
    assert total_samples > 0
|
||||
|
||||
|
||||
def test_local_f5_tts_resolves_prompt_text_and_audio(tmp_path, monkeypatch):
    """A cloned voice directory under the model root supplies prompt audio and text."""
    from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter

    model_root = tmp_path / "models"
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(model_root))

    # Lay out a cloned voice: reference audio, its transcript, and metadata.
    voice_dir = model_root / "voices" / "clones" / "local-f5-voice"
    voice_dir.mkdir(parents=True)
    prompt_wav = voice_dir / "prompt.wav"
    prompt_wav.write_bytes(b"RIFFtest")
    (voice_dir / "prompt.txt").write_text("这是一段参考文本。", encoding="utf-8")
    (voice_dir / "meta.json").write_text('{"provider":"local_f5_tts"}', encoding="utf-8")

    adapter = LocalF5TTSAdapter(service_url="http://127.0.0.1:19095/synthesize")
    resolved = adapter._resolve_voice_prompt("local-f5-voice")

    assert resolved is not None
    assert resolved.prompt_audio == prompt_wav
    assert resolved.prompt_text == "这是一段参考文本。"
|
||||
|
||||
|
||||
def test_local_f5_tts_resolves_bundled_zero_shot_system_voice(monkeypatch):
    """A bundled system voice id resolves to its shipped prompt audio and transcript."""
    from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter
    from opentalking.providers.tts.voice_assets import bundled_system_voice_root

    voice_id = "local-office-serena"
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "/tmp/opentalking-local-audio")

    adapter = LocalF5TTSAdapter(service_url="http://127.0.0.1:19095/synthesize")
    resolved = adapter._resolve_voice_prompt(voice_id)

    assert resolved is not None
    expected_audio = bundled_system_voice_root() / voice_id / "prompt.wav"
    assert resolved.prompt_audio == expected_audio
    assert resolved.prompt_text == "你好,欢迎来到OpenTalking。我会用自然清晰的声音,为你介绍今天的内容。"
|
||||
|
||||
|
||||
def test_download_script_knows_f5_tts_base_model():
    """The downloader registers the F5-TTS base model, its files, and its target dir."""
    from scripts import download_local_audio_models as downloader

    key = "f5-tts-v1-base"
    repo_id = "SWivid/F5-TTS"

    assert downloader.MODELS[key] == ("hf", repo_id)

    expected_patterns = [
        "README.md",
        "F5TTS_v1_Base/model_1250000.safetensors",
        "F5TTS_v1_Base/vocab.txt",
    ]
    assert downloader.HF_ALLOW_PATTERNS[key] == expected_patterns

    assert downloader.MODEL_REQUIRED_FILES[key] == ("model_1250000.safetensors",)

    resolved = downloader._target(Path("/models"), repo_id, model_key=key)
    assert resolved == Path("/models/SWivid__F5-TTS__F5TTS_v1_Base")
|
||||
|
||||
|
||||
def test_local_f5_tts_service_module_exposes_routes(monkeypatch, tmp_path):
    """The sidecar module builds an app with both routes and default model paths."""
    model_root = tmp_path / "models"
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(model_root))
    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD", "0")
    module = importlib.import_module("scripts.local_f5_tts_service")

    # preload=False keeps the test from loading the real engine.
    service = module.F5TTSService(preload=False)
    app = module.create_app(service)
    route_paths = set()
    for route in app.routes:
        route_paths.add(getattr(route, "path", ""))

    assert "/health" in route_paths
    assert "/synthesize" in route_paths

    expected_model_dir = model_root / "SWivid__F5-TTS__F5TTS_v1_Base"
    assert service.model_dir == expected_model_dir
    assert service.ckpt_file == service.model_dir / "model_1250000.safetensors"
|
||||
|
||||
|
||||
def test_start_local_f5_tts_script_refuses_main_venv(tmp_path):
    """The start script exits 1 when told to use the main venv's interpreter.

    The script creates ``$DIGITAL_HUMAN_HOME/run`` and ``.../logs`` before it
    resolves the interpreter, and defaults that home to the repository's
    parent directory. ``DIGITAL_HUMAN_HOME`` is therefore pinned to
    ``tmp_path`` so the test leaves nothing behind outside its sandbox.
    """
    repo = Path(__file__).resolve().parents[2]
    script = repo / "scripts" / "quickstart" / "start_local_f5_tts.sh"

    result = subprocess.run(
        ["bash", str(script), "--port", "19995"],
        cwd=repo,
        env={
            **os.environ,
            "OPENTALKING_F5_TTS_PYTHON": str(repo / ".venv" / "bin" / "python"),
            # Point at a non-existent env file so no local config is picked up.
            "OPENTALKING_QUICKSTART_ENV": str(tmp_path / "missing-env"),
            "DIGITAL_HUMAN_HOME": str(tmp_path),
        },
        text=True,
        capture_output=True,
        timeout=10,
    )

    assert result.returncode == 1
    assert "Refusing to start local F5-TTS from the OpenTalking main venv" in result.stderr
|
||||
|
||||
Reference in New Issue
Block a user