feat: add local F5-TTS provider (#128)

This commit is contained in:
zyairehhh
2026-06-28 23:21:10 +08:00
committed by GitHub
parent 5516cd5675
commit 5473bb2665
26 changed files with 1370 additions and 13 deletions

View File

@@ -326,6 +326,7 @@ Join the QQ community to discuss real-time digital humans, FlashTalk, OmniRT, mo
OpenTalking references and benefits from excellent projects in the real-time digital-human ecosystem:
- Thanks to the [LINUX DO](https://linux.do/) community for their support and discussions.
- [SoulX-FlashTalk](https://github.com/Soul-AILab/SoulX-FlashTalk) and [SoulX-FlashTalk-14B](https://huggingface.co/Soul-AILab/SoulX-FlashTalk-14B)
- [LiveTalking](https://github.com/lipku/LiveTalking)
- [OmniRT](https://github.com/datascale-ai/omnirt)

View File

@@ -326,6 +326,7 @@ Join the QQ community to discuss real-time digital humans, FlashTalk, OmniRT, mo
OpenTalking references and benefits from excellent projects in the real-time digital-human ecosystem:
- Thanks to the [LINUX DO](https://linux.do/) community for their support and discussions.
- [SoulX-FlashTalk](https://github.com/Soul-AILab/SoulX-FlashTalk) and [SoulX-FlashTalk-14B](https://huggingface.co/Soul-AILab/SoulX-FlashTalk-14B)
- [LiveTalking](https://github.com/lipku/LiveTalking)
- [OmniRT](https://github.com/datascale-ai/omnirt)

View File

@@ -324,6 +324,7 @@ OpenTalking 的 LLM 走 OpenAI-compatible 接口,把 `OPENTALKING_LLM_BASE_URL
OpenTalking 参考并受益于实时数字人生态中的优秀项目:
- 感谢 [LINUX DO](https://linux.do/) 社区的支持与讨论。
- [SoulX-FlashTalk](https://github.com/Soul-AILab/SoulX-FlashTalk) 和 [SoulX-FlashTalk-14B](https://huggingface.co/Soul-AILab/SoulX-FlashTalk-14B)
- [LiveTalking](https://github.com/lipku/LiveTalking)
- [OmniRT](https://github.com/datascale-ai/omnirt)

View File

@@ -53,6 +53,7 @@ def _runtime_status_payload(request: Request) -> dict[str, Any]:
"mock",
"local_cosyvoice",
"indextts",
"local_f5_tts",
"dashscope",
"xiaomi_mimo",
"openai_compatible",

View File

@@ -24,6 +24,7 @@ from opentalking.providers.tts.voice_assets import (
INDEXTTS_PROVIDER,
INDEXTTS_PROVIDERS,
LOCAL_COSYVOICE_PROVIDER,
LOCAL_F5_TTS_PROVIDER,
bundled_system_voice_root,
iter_voice_assets,
local_audio_model_root,
@@ -117,6 +118,19 @@ def _write_local_cosyvoice_prompt(
return voice_dir
def _write_local_f5_tts_prompt(
    *,
    voice_id: str,
    wav: bytes,
    prompt_text: str,
    display_label: str,
    target_model: str,
    validation: dict[str, Any] | None = None,
) -> Path:
    """Persist an F5-TTS clone prompt (audio, transcript, metadata) on disk.

    Files are written under ``<local audio model root>/voices/clones/<voice_id>/``:
    ``prompt.wav`` always, ``prompt.txt`` only when a non-blank transcript is
    supplied, and ``meta.json`` describing the voice.

    Args:
        voice_id: Directory name for the clone; must match ``[A-Za-z0-9_-]{3,80}``.
        wav: Raw bytes written verbatim to ``prompt.wav``.
        prompt_text: Transcript of the reference audio; surrounding whitespace
            is stripped before it is stored.
        display_label: Human-readable label recorded in ``meta.json``.
        target_model: Model identifier recorded in ``meta.json``.
        validation: Optional validation stats recorded in ``meta.json``
            (stored as ``{}`` when omitted).

    Returns:
        The directory that holds the prompt files.

    Raises:
        ValueError: If ``voice_id`` does not match the allowed pattern.
    """
    # The id becomes a path component, so the whitelist also keeps separators
    # and dots out of the resulting path.
    if not re.fullmatch(r"[A-Za-z0-9_-]{3,80}", voice_id):
        raise ValueError("invalid local voice id")

    voice_dir = _local_audio_model_root() / "voices" / "clones" / voice_id
    voice_dir.mkdir(parents=True, exist_ok=True)

    clean_prompt_text = prompt_text.strip()
    prompt_audio_path = voice_dir / "prompt.wav"
    prompt_text_path = voice_dir / "prompt.txt"

    prompt_audio_path.write_bytes(wav)
    if clean_prompt_text:
        prompt_text_path.write_text(clean_prompt_text, encoding="utf-8")
    else:
        # The directory may already exist (exist_ok=True above). Drop any
        # transcript left by an earlier clone so it cannot be paired with the
        # new audio or contradict the empty "prompt_text" written to meta.json.
        prompt_text_path.unlink(missing_ok=True)

    meta = {
        "voice_id": voice_id,
        "display_label": display_label,
        "provider": LOCAL_F5_TTS_PROVIDER,
        "target_model": target_model,
        "prompt_audio": str(prompt_audio_path),
        "prompt_text": clean_prompt_text,
        "validation": validation or {},
        "source": "clone",
    }
    (voice_dir / "meta.json").write_text(
        json.dumps(meta, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    return voice_dir
def _write_local_indextts_prompt(
*,
provider: str,
@@ -187,6 +201,15 @@ def _wav_audio_stats(wav: bytes) -> dict[str, float]:
}
def _validate_local_f5_tts_prompt(wav: bytes) -> dict[str, Any]:
    """Reject F5-TTS reference audio that is too short or too quiet.

    Args:
        wav: WAV bytes handed to ``_wav_audio_stats``.

    Returns:
        The stats mapping produced by ``_wav_audio_stats`` when the clip passes.

    Raises:
        HTTPException: 400 when the clip is shorter than 1.0 s, has less than
            0.5 s of active audio, or its RMS level is below
            ``LOCAL_COSYVOICE_MIN_RMS_DBFS`` (the CosyVoice threshold is reused).
    """
    audio_stats = _wav_audio_stats(wav)

    too_short = audio_stats["duration_sec"] < 1.0
    if too_short:
        raise HTTPException(status_code=400, detail="F5-TTS 参考音频过短,请录制 3-15 秒清晰人声。")

    too_quiet = (
        audio_stats["active_sec"] < 0.5
        or audio_stats["rms_dbfs"] < LOCAL_COSYVOICE_MIN_RMS_DBFS
    )
    if too_quiet:
        raise HTTPException(status_code=400, detail="F5-TTS 参考音频声音太小或静音太多,请靠近麦克风重录。")

    return audio_stats
def _validate_local_indextts_prompt(wav: bytes) -> dict[str, Any]:
stats = _wav_audio_stats(wav)
if stats["duration_sec"] < 1.0:
@@ -307,6 +330,26 @@ def _local_cosyvoice_system_voice_items() -> list[VoiceItem]:
return items
def _local_f5_tts_voice_items(source: str) -> list[VoiceItem]:
    """List on-disk F5-TTS voices for one asset source.

    Args:
        source: ``"system"`` or ``"clones"``; any other value yields ``[]``.

    Returns:
        One item per voice asset whose id matches ``[A-Za-z0-9_-]{3,80}``.
        Item ids are negative and count down from -1 in iteration order.
    """
    if source not in {"system", "clones"}:
        return []

    # The asset source "clones" is exposed to clients as "clone".
    public_source = "clone" if source == "clones" else "system"
    assets = iter_voice_assets(
        provider=LOCAL_F5_TTS_PROVIDER,
        sources=(source,),
        model_root=_local_audio_model_root(),
        # Only system voices are required to have a prompt transcript.
        require_prompt_text=source == "system",
    )

    collected: list[VoiceItem] = []
    for asset in assets:
        voice_id = asset.voice_id
        if re.fullmatch(r"[A-Za-z0-9_-]{3,80}", voice_id) is None:
            continue
        meta = asset.meta
        raw_label = str(meta.get("display_label") or meta.get("label") or voice_id)
        target_model = str(meta.get("target_model") or "").strip()
        collected.append(
            {
                "id": -(len(collected) + 1),
                "user_id": 1,
                "provider": LOCAL_F5_TTS_PROVIDER,
                "voice_id": voice_id,
                "display_label": _public_voice_label(raw_label, fallback=voice_id),
                "target_model": target_model or None,
                "source": public_source,
            }
        )
    return collected
def _local_indextts_voice_items(source: str) -> list[VoiceItem]:
if source not in {"system", "clones"}:
return []
@@ -399,6 +442,13 @@ async def get_voices(provider: str | None = None) -> JSONResponse:
if key not in existing:
items.append(item)
existing.add(key)
if public_p is None or public_p == LOCAL_F5_TTS_PROVIDER:
for source in ("system", "clones"):
for item in _local_f5_tts_voice_items(source):
key = (item["provider"], item["voice_id"])
if key not in existing:
items.append(item)
existing.add(key)
if public_p is None or public_p == INDEXTTS_PROVIDER:
for source in ("system", "clones"):
for item in _local_indextts_voice_items(source):
@@ -445,10 +495,10 @@ async def post_voice_clone(
prov = provider.strip().lower()
if prov in {"xiaomi", "mimo"}:
prov = "xiaomi_mimo"
if prov not in {"local_cosyvoice", "cosyvoice", "dashscope", "xiaomi_mimo", *INDEXTTS_PROVIDERS}:
if prov not in {"local_cosyvoice", "local_f5_tts", "cosyvoice", "dashscope", "xiaomi_mimo", *INDEXTTS_PROVIDERS}:
raise HTTPException(
status_code=400,
detail="provider 须为 local_cosyvoice、indextts、cosyvoice、dashscope 或 xiaomi_mimo",
detail="provider 须为 local_cosyvoice、local_f5_tts、indextts、cosyvoice、dashscope 或 xiaomi_mimo",
)
raw = await audio.read()
@@ -471,6 +521,14 @@ async def post_voice_clone(
)
try:
if prov == LOCAL_F5_TTS_PROVIDER:
voice_id = _safe_local_voice_id(label)
effective_model = tm or "SWivid/F5-TTS/F5TTS_v1_Base"
validation = _validate_local_f5_tts_prompt(wav)
_write_local_f5_tts_prompt(voice_id=voice_id, wav=wav, prompt_text=(prompt_text or "").strip(), display_label=label, target_model=effective_model, validation=validation)
eid = insert_clone(provider=LOCAL_F5_TTS_PROVIDER, voice_id=voice_id, display_label=label, target_model=effective_model)
return JSONResponse({"ok": True, "entry_id": eid, "voice_id": voice_id, "display_label": label, "provider": LOCAL_F5_TTS_PROVIDER, "target_model": effective_model, "message": "F5-TTS 复刻音色已保存,可用于 F5-TTS 合成。"})
if prov in INDEXTTS_PROVIDERS:
voice_id = _safe_local_voice_id(label)
effective_model = tm or "IndexTeam/IndexTTS-2"
@@ -641,7 +699,7 @@ async def delete_voice_entry(entry_id: int) -> JSONResponse:
if row.get("source") != "clone":
raise HTTPException(status_code=400, detail="不能删除系统预设音色")
if delete_entry(entry_id):
if row.get("provider") in {"local_cosyvoice", *INDEXTTS_PROVIDERS}:
if row.get("provider") in {"local_cosyvoice", "local_f5_tts", *INDEXTTS_PROVIDERS}:
_remove_local_prompt(str(row.get("voice_id") or ""))
return JSONResponse({"ok": True})
raise HTTPException(status_code=404, detail="not found")

View File

@@ -71,6 +71,7 @@ def test_local_cosyvoice_clone_stores_prompt_locally(tmp_path, monkeypatch):
monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(tmp_path / "models"))
monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])
monkeypatch.setattr(
voices_routes.bailian_clone,
"convert_audio_to_wav_24k_mono",
@@ -117,6 +118,106 @@ def test_local_cosyvoice_clone_stores_prompt_locally(tmp_path, monkeypatch):
assert inserted["display_label"] == "本地客服女声"
def test_local_f5_tts_clone_stores_prompt_audio_and_text(tmp_path, monkeypatch):
    """Cloning with provider ``local_f5_tts`` writes prompt files and records the clone."""
    model_root = tmp_path / "models"
    recorded: dict[str, object] = {}

    def _record_insert(**kwargs):
        # Capture what the route passes to the voice store; return a fixed entry id.
        recorded.update(kwargs)
        return 77

    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(model_root))
    monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
    monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])
    monkeypatch.setattr(
        voices_routes.bailian_clone,
        "convert_audio_to_wav_24k_mono",
        lambda raw, suffix: _wav_bytes(),
    )
    monkeypatch.setattr(voices_routes, "insert_clone", _record_insert)

    app = FastAPI()
    app.include_router(voices_routes.router)
    client = TestClient(app)

    form_fields = {
        "provider": "local_f5_tts",
        "target_model": "SWivid/F5-TTS/F5TTS_v1_Base",
        "display_label": "F5 客服女声",
        "prompt_text": "您好,欢迎使用实时数字人。",
    }
    upload = {"audio": ("sample.wav", _wav_bytes(), "audio/wav")}
    resp = client.post("/voices/clone", data=form_fields, files=upload)

    assert resp.status_code == 200, resp.text
    payload = resp.json()
    voice_id = payload["voice_id"]
    clone_dir = model_root / "voices" / "clones" / voice_id

    # Response body.
    assert payload["provider"] == "local_f5_tts"
    assert payload["entry_id"] == 77
    assert payload["target_model"] == "SWivid/F5-TTS/F5TTS_v1_Base"

    # Files written to the clone directory.
    assert (clone_dir / "prompt.wav").is_file()
    assert (clone_dir / "prompt.txt").read_text(encoding="utf-8") == "您好,欢迎使用实时数字人。"
    meta_text = (clone_dir / "meta.json").read_text(encoding="utf-8")
    assert '"provider": "local_f5_tts"' in meta_text

    # Arguments forwarded to the voice store.
    assert recorded["provider"] == "local_f5_tts"
    assert recorded["voice_id"] == voice_id
    assert recorded["display_label"] == "F5 客服女声"
def test_get_voices_includes_local_f5_tts_system_voice_dirs(tmp_path, monkeypatch):
    """A system voice directory tagged ``local_f5_tts`` shows up in ``GET /voices``."""
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(tmp_path / "models"))

    # Lay out a minimal system voice: audio, transcript and metadata.
    system_voice = tmp_path / "models" / "voices" / "system" / "f5-clear-cn"
    system_voice.mkdir(parents=True)
    (system_voice / "prompt.wav").write_bytes(b"RIFFtest")
    (system_voice / "prompt.txt").write_text("这是一段 F5 参考音色。", encoding="utf-8")
    (system_voice / "meta.json").write_text(
        '{"provider":"local_f5_tts","display_label":"F5 清晰中文","target_model":"SWivid/F5-TTS/F5TTS_v1_Base"}',
        encoding="utf-8",
    )

    monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
    monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])

    app = FastAPI()
    app.include_router(voices_routes.router)
    resp = TestClient(app).get("/voices?provider=local_f5_tts")

    assert resp.status_code == 200
    expected_item = {
        "id": -1,
        "user_id": 1,
        "provider": "local_f5_tts",
        "voice_id": "f5-clear-cn",
        "display_label": "F5 清晰中文",
        "target_model": "SWivid/F5-TTS/F5TTS_v1_Base",
        "source": "system",
    }
    assert expected_item in resp.json()["items"]
def test_get_voices_shares_bundled_zero_shot_system_voices_with_local_f5_tts(tmp_path, monkeypatch):
    """With an empty model root, ``GET /voices`` still lists the bundled system voices for F5-TTS."""
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(tmp_path / "models"))
    monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)
    monkeypatch.setattr(voices_routes, "list_voices", lambda provider=None: [])

    app = FastAPI()
    app.include_router(voices_routes.router)
    resp = TestClient(app).get("/voices?provider=local_f5_tts")

    assert resp.status_code == 200
    listed = resp.json()["items"]

    official_zero_shot = {
        "id": -1,
        "user_id": 1,
        "provider": "local_f5_tts",
        "voice_id": "cosyvoice-official-zero-shot",
        "display_label": "官方示例女声",
        "target_model": "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
        "source": "system",
    }
    assert official_zero_shot in listed
    assert any(entry["voice_id"] == "local-office-serena" for entry in listed)
def test_local_cosyvoice_clone_rejects_silent_prompt(tmp_path, monkeypatch):
monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(tmp_path / "models"))
monkeypatch.setattr(voices_routes, "init_voice_store", lambda: None)

View File

@@ -70,6 +70,7 @@ import {
COSYVOICE_MODEL_OPTIONS,
COSYVOICE_VOICE_OPTIONS,
LOCAL_COSYVOICE_MODEL_OPTIONS,
LOCAL_F5_TTS_MODEL_OPTIONS,
LOCAL_INDEXTTS_MODEL_OPTIONS,
LOCAL_TTS_VOICE_OPTIONS,
SAMBERT_MODEL_OPTIONS,
@@ -104,6 +105,8 @@ function bailianModelOptions(provider: TtsProviderExtended): { id: string; label
return LOCAL_COSYVOICE_MODEL_OPTIONS;
case "indextts":
return LOCAL_INDEXTTS_MODEL_OPTIONS;
case "local_f5_tts":
return LOCAL_F5_TTS_MODEL_OPTIONS;
case "xiaomi_mimo":
return XIAOMI_MIMO_MODEL_OPTIONS;
default:
@@ -121,6 +124,7 @@ function bailianVoiceOptions(provider: TtsProviderExtended): { id: string; label
return [];
case "local_cosyvoice":
case "indextts":
case "local_f5_tts":
return LOCAL_TTS_VOICE_OPTIONS;
case "xiaomi_mimo":
return XIAOMI_MIMO_VOICE_OPTIONS;
@@ -134,6 +138,7 @@ function catalogProviderKey(p: TtsProviderExtended): string | null {
if (p === "cosyvoice") return "cosyvoice";
if (p === "local_cosyvoice") return "local_cosyvoice";
if (p === "indextts") return "indextts";
if (p === "local_f5_tts") return "local_f5_tts";
if (p === "xiaomi_mimo") return "xiaomi_mimo";
return null;
}
@@ -166,13 +171,15 @@ function mergeVoiceCatalogIntoOptions(
const extras: VoiceOpt[] = [];
for (const r of catalog) {
if (r.provider !== cp) continue;
if (activeModel && r.target_model && r.target_model !== activeModel && !(ttsProvider === "local_cosyvoice" && r.source === "system")) continue;
const sharedSystemPrompt =
r.source === "system" && (ttsProvider === "local_cosyvoice" || ttsProvider === "local_f5_tts");
if (activeModel && r.target_model && r.target_model !== activeModel && !sharedSystemPrompt) continue;
if (cloneOnly && r.source !== "clone") continue;
if (staticIds.has(r.voice_id)) continue;
extras.push({
id: r.voice_id,
label: r.source === "clone" ? `复刻 · ${r.display_label}` : r.display_label,
targetModel: ttsProvider === "local_cosyvoice" && r.source === "system" ? undefined : r.target_model,
targetModel: sharedSystemPrompt ? undefined : r.target_model,
});
staticIds.add(r.voice_id);
}
@@ -489,6 +496,7 @@ function normalizeTtsProvider(value: string | null | undefined, fallback: TtsPro
normalized === "sambert" ||
normalized === "local_cosyvoice" ||
normalized === "indextts" ||
normalized === "local_f5_tts" ||
normalized === "xiaomi_mimo" ||
normalized === "openai_compatible"
) {
@@ -2791,7 +2799,8 @@ export default function App() {
|| runtimeConfigTtsProvider === "local_cosyvoice"
|| runtimeConfigTtsProvider === "indextts"
|| runtimeConfigTtsProvider === "local_indextts"
|| runtimeConfigTtsProvider === "omnirt_indextts",
|| runtimeConfigTtsProvider === "omnirt_indextts"
|| runtimeConfigTtsProvider === "local_f5_tts",
);
const runtimeConfigReady = Boolean(
runtimeConfig?.llm.api_key_set

View File

@@ -5,6 +5,7 @@ import {
COSYVOICE_MODEL_OPTIONS,
LOCAL_COSYVOICE_MODEL_OPTIONS,
LOCAL_INDEXTTS_MODEL_OPTIONS,
LOCAL_F5_TTS_MODEL_OPTIONS,
XIAOMI_MIMO_MODEL_OPTIONS,
} from "../constants/ttsBailian";
import { QWEN_VOICE_CLONE_TARGET_OPTIONS } from "../constants/ttsQwen";
@@ -51,6 +52,7 @@ type CloneProvider =
| "cosyvoice"
| "local_cosyvoice"
| "indextts"
| "local_f5_tts"
| "xiaomi_mimo";
type RecorderPhase = "idle" | "recording" | "paused" | "recorded";
@@ -61,6 +63,7 @@ function defaultTargetModelForProvider(provider: CloneProvider): string {
if (provider === "indextts") {
return LOCAL_INDEXTTS_MODEL_OPTIONS[0]?.id ?? "";
}
if (provider === "local_f5_tts") return LOCAL_F5_TTS_MODEL_OPTIONS[0]?.id ?? "";
return COSYVOICE_MODEL_OPTIONS[0]?.id ?? "";
}
@@ -392,6 +395,7 @@ export function BailianVoiceClone({ onSuccess, onClose }: BailianVoiceCloneProps
<option value="xiaomi_mimo"> MiMo VoiceClone</option>
<option value="local_cosyvoice"> CosyVoice</option>
<option value="indextts">Local IndexTTS</option>
<option value="local_f5_tts">Local F5-TTS</option>
<option value="cosyvoice"> CosyVoice</option>
</select>
</label>
@@ -411,6 +415,8 @@ export function BailianVoiceClone({ onSuccess, onClose }: BailianVoiceCloneProps
? LOCAL_COSYVOICE_MODEL_OPTIONS
: provider === "indextts"
? LOCAL_INDEXTTS_MODEL_OPTIONS
: provider === "local_f5_tts"
? LOCAL_F5_TTS_MODEL_OPTIONS
: COSYVOICE_MODEL_OPTIONS
).map((o) => (
<option key={o.id} value={o.id}>

View File

@@ -67,6 +67,7 @@ const TTS_PROVIDER_LABELS: Record<TtsProviderExtended, string> = {
sambert: "Sambert",
local_cosyvoice: "Local CosyVoice",
indextts: "Local IndexTTS",
local_f5_tts: "Local F5-TTS",
xiaomi_mimo: "小米 MiMo",
openai_compatible: "OpenAI API",
};
@@ -78,6 +79,7 @@ const TTS_PROVIDER_SUBTITLES: Record<TtsProviderExtended, string> = {
sambert: "Bailian",
local_cosyvoice: "本地模型",
indextts: "本地部署",
local_f5_tts: "本地模型",
xiaomi_mimo: "OpenAI 兼容",
openai_compatible: "OpenAI-compatible",
};
@@ -491,7 +493,7 @@ export function SettingsPanel({
subtitle: option.id,
hasChildren: true,
}));
const providerOptions: ColumnOption[] = (["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((p) => ({
const providerOptions: ColumnOption[] = (["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "local_f5_tts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((p) => ({
id: p,
label: TTS_PROVIDER_LABELS[p],
subtitle: TTS_PROVIDER_SUBTITLES[p],
@@ -517,7 +519,7 @@ export function SettingsPanel({
}));
const providerHasSingleModel = (provider: TtsProviderExtended) => {
if (provider === "edge" || provider === "openai_compatible") return true;
if (provider === "local_cosyvoice" || provider === "indextts") return true;
if (provider === "local_cosyvoice" || provider === "indextts" || provider === "local_f5_tts") return true;
if (provider !== ttsProvider) return false;
return qwenModelColumnOptions.length <= 1;
};

View File

@@ -234,6 +234,7 @@ function providerLabel(provider: TtsProviderExtended): string {
if (provider === "cosyvoice") return "CosyVoice";
if (provider === "sambert") return "Sambert";
if (provider === "indextts") return "Local IndexTTS";
if (provider === "local_f5_tts") return "Local F5-TTS";
if (provider === "xiaomi_mimo") return "小米 MiMo";
if (provider === "openai_compatible") return "OpenAI-compatible TTS";
return "Local CosyVoice";
@@ -737,7 +738,7 @@ export function VideoCreationWorkspace({
<label className="block text-sm font-medium text-slate-700">
TTS
<select value={ttsProvider} onChange={(event) => onTtsProviderChange(event.target.value as TtsProviderExtended)} className="mt-2 w-full rounded-lg border border-slate-200 bg-white px-3 py-2 text-sm">
{(["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((item) => <option key={item} value={item}>{providerLabel(item)}</option>)}
{(["edge", "dashscope", "cosyvoice", "sambert", "local_cosyvoice", "indextts", "local_f5_tts", "xiaomi_mimo", "openai_compatible"] as TtsProviderExtended[]).map((item) => <option key={item} value={item}>{providerLabel(item)}</option>)}
</select>
</label>
<label className="block text-sm font-medium text-slate-700">

View File

@@ -28,6 +28,10 @@ export const LOCAL_INDEXTTS_MODEL_OPTIONS: { id: string; label: string }[] = [
{ id: "IndexTeam/IndexTTS-2", label: "IndexTTS-2本地模型" },
];
/** Model choices for the `local_f5_tts` provider; currently a single F5-TTS v1 Base entry. */
export const LOCAL_F5_TTS_MODEL_OPTIONS: { id: string; label: string }[] = [
  { id: "SWivid/F5-TTS/F5TTS_v1_Base", label: "F5-TTS v1 Base本地模型" },
];
export const LOCAL_TTS_VOICE_OPTIONS: { id: string; label: string }[] = [];
export const XIAOMI_MIMO_MODEL_OPTIONS: { id: string; label: string }[] = [
@@ -54,6 +58,7 @@ export type TtsProviderExtended =
| "sambert"
| "local_cosyvoice"
| "indextts"
| "local_f5_tts"
| "xiaomi_mimo"
| "openai_compatible";

View File

@@ -15,12 +15,14 @@ live in the model pages.
| `elevenlabs` | Hosted API | Hosted multilingual voices | `.env` provider config |
| `local_cosyvoice` | Local deployment | Local Chinese TTS, built-in voices, and cloned voices | [CosyVoice](tts/cosyvoice.md) |
| `indextts` | Local deployment / OmniRT | Controllable dubbing, emotion control, and voice cloning | [IndexTTS](tts/indextts.md) |
| `local_f5_tts` | Local deployment | Local F5-TTS Base voice cloning | [F5-TTS](tts/f5-tts.md) |
| `local_qwen3_tts` | Local deployment | Local Qwen3-TTS Base voice cloning | [Qwen3-TTS](tts/qwen3-tts.md) |
## Local Model Entries
- [CosyVoice Local Deployment](tts/cosyvoice.md)
- [IndexTTS Local Deployment](tts/indextts.md)
- [F5-TTS Local Deployment](tts/f5-tts.md)
- [Qwen3-TTS Local Deployment](tts/qwen3-tts.md)
Each local model page contains use cases, weight preparation, startup commands,

View File

@@ -0,0 +1,116 @@
# F5-TTS Local Deployment
F5-TTS is integrated through OpenTalking's `local_f5_tts` provider. Use it for local voice cloning, short realtime replies, and offline video dubbing. The integration runs as a same-machine HTTP sidecar so the OpenTalking main process stays isolated from F5-TTS runtime and CUDA dependencies.
## Use Cases
- Local voice cloning without a hosted TTS API.
- A 3-15 second reference clip and matching transcript are available.
- F5-TTS dependencies should stay outside the main OpenTalking venv.
## Weight Preparation
Use a single local audio model root, for example `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT`:
```bash title="Terminal"
cd "$OPENTALKING_HOME"
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
python scripts/download_local_audio_models.py \
--root "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT" \
--model f5-tts-v1-base
```
The downloader maps `SWivid/F5-TTS` `F5TTS_v1_Base/model_1250000.safetensors` to:
```text
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/SWivid__F5-TTS__F5TTS_v1_Base/model_1250000.safetensors
```
Prepare the runtime and sidecar venv:
```bash title="Terminal"
mkdir -p "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
cd "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
if [ ! -d F5-TTS/.git ]; then
git clone https://github.com/SWivid/F5-TTS.git F5-TTS
fi
python3 -m venv --system-site-packages "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
. "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system/bin/activate"
pip install -U pip wheel setuptools
pip install --no-deps -e "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/F5-TTS"
pip install fastapi "uvicorn[standard]" soundfile cached_path hydra-core ema_pytorch vocos x_transformers transformers_stream_generator rjieba pypinyin tomli bitsandbytes pydub torchcodec torchdiffeq unidecode wandb
```
## Configuration
```env title=".env"
OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
OPENTALKING_LOCAL_AUDIO_MODEL_ROOT=./models/local-audio
OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR=./models/local-audio/runtime/F5-TTS
OPENTALKING_TTS_LOCAL_F5_TTS_DEVICE=cuda
```
## Voice Cloning
`local_f5_tts` requires reference audio. Upload a clone voice through the API, or prepare the directory manually:
```text
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/voices/clones/my-f5-voice/
prompt.wav
prompt.txt
meta.json
```
Example `meta.json`:
```json
{"provider":"local_f5_tts"}
```
After upload, `/api/voices?provider=local_f5_tts` returns the voice id. TTS preview, realtime dialogue, and video generation can all use that voice.
## Start Command
Start the F5-TTS sidecar first, then OpenTalking:
```bash title="Terminal"
cd "$OPENTALKING_HOME"
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
export OPENTALKING_F5_TTS_VENV_DIR="$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
bash scripts/quickstart/start_local_f5_tts.sh --port 19095
export OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
export OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
python -m apps.api.main
```
## Verification
```bash title="Terminal"
curl -fsS http://127.0.0.1:19095/health
curl -fsS http://127.0.0.1:8000/health
```
TTS preview should use provider `local_f5_tts` and a clone voice with `prompt.wav`. Save the result as WAV and verify the spoken text and voice by ASR or listening.
## Benchmark Log
| Item | Command / API | Target | Measured |
|------|---------------|--------|----------|
| TTS preview | `/tts/preview` + SenseVoiceSmall ASR | Playable WAV, correct text | Passed: preview returned a 16 kHz mono WAV; SenseVoiceSmall ASR matched the target text. |
| Realtime dialogue | local mode dialogue / warm TTS | RTF < 1.0 | Passed: warm RTF 0.278 for 3.31s audio in 0.918s; prior warm checks were 0.386 and 0.518, all below 1 |
| Offline video | video generation API / CLI | Generation succeeds, audio drives avatar | Passed: QuickTalk + F5 clone voice generated an MP4; ffprobe showed H.264 video and 16 kHz mono AAC audio. |
## Common Errors
| Symptom | Action |
|---------|--------|
| `Missing F5-TTS checkpoint` | Confirm `model_1250000.safetensors` is under `SWivid__F5-TTS__F5TTS_v1_Base`. |
| `requires prompt_audio` | Select a clone voice or set `OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_AUDIO`. |
| Dependency conflicts | Do not run the sidecar from OpenTalking's main `.venv`; use a separate venv such as `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system` and reuse the host PyTorch/CUDA environment when appropriate. |
| Slow first request | Set `OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD=1` and run a short warm-up request after startup. |
| QuickTalk v3 reshape error | Keep `OPENTALKING_QUICKTALK_RESOLUTION=256` for the current TorchScript export when generating video; 160/128 resolution makes internal feature shapes mismatch. |

View File

@@ -13,12 +13,14 @@ talking-head backend。本文只做选型和入口导航权重、启动、验
| `elevenlabs` | 托管 API | 多语言托管音色 | `.env` provider 配置 |
| `local_cosyvoice` | 本地部署 | 本地中文 TTS、内置音色和复刻音色 | [CosyVoice](tts/cosyvoice.md) |
| `indextts` | 本地部署 / OmniRT | 可控配音、情绪控制和复刻音色 | [IndexTTS](tts/indextts.md) |
| `local_f5_tts` | 本地部署 | 本地 F5-TTS Base 音色克隆 | [F5-TTS](tts/f5-tts.md) |
| `local_qwen3_tts` | 本地部署 | 本地 Qwen3-TTS Base 复刻音色 | [Qwen3-TTS](tts/qwen3-tts.md) |
## 本地模型入口
- [CosyVoice 本地部署](tts/cosyvoice.md)
- [IndexTTS 本地部署](tts/indextts.md)
- [F5-TTS 本地部署](tts/f5-tts.md)
- [Qwen3-TTS 本地部署](tts/qwen3-tts.md)
每个本地模型页面都包含适用场景、权重准备、启动命令、验证命令和常见错误。

View File

@@ -0,0 +1,116 @@
# F5-TTS 本地部署
F5-TTS 通过 OpenTalking 的 `local_f5_tts` provider 接入,适合本地音色克隆、短句实时回复和离线视频配音。当前实现采用同机 HTTP sidecar:OpenTalking 主进程只负责调度,F5-TTS 运行在独立 venv 中,避免依赖和 CUDA 包冲突。
## 适用场景
- 需要本地音色克隆,不希望调用托管 TTS API。
- 已有 3-15 秒参考音频和对应文本,希望复刻说话人音色。
- 需要把 F5-TTS runtime 与 OpenTalking 主进程隔离。
## 权重准备
推荐把权重放到统一的本地音频模型目录,例如 `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT`:
```bash title="终端"
cd "$OPENTALKING_HOME"
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
python scripts/download_local_audio_models.py \
--root "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT" \
--model f5-tts-v1-base
```
脚本会把 `SWivid/F5-TTS` 的 `F5TTS_v1_Base/model_1250000.safetensors` 映射到:
```text
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/SWivid__F5-TTS__F5TTS_v1_Base/model_1250000.safetensors
```
准备 runtime 和独立 venv:
```bash title="终端"
mkdir -p "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
cd "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime"
if [ ! -d F5-TTS/.git ]; then
git clone https://github.com/SWivid/F5-TTS.git F5-TTS
fi
python3 -m venv --system-site-packages "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
. "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system/bin/activate"
pip install -U pip wheel setuptools
pip install --no-deps -e "$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/F5-TTS"
pip install fastapi "uvicorn[standard]" soundfile cached_path hydra-core ema_pytorch vocos x_transformers transformers_stream_generator rjieba pypinyin tomli bitsandbytes pydub torchcodec torchdiffeq unidecode wandb
```
## 配置项
```env title=".env"
OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
OPENTALKING_LOCAL_AUDIO_MODEL_ROOT=./models/local-audio
OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR=./models/local-audio/runtime/F5-TTS
OPENTALKING_TTS_LOCAL_F5_TTS_DEVICE=cuda
```
## 音色克隆
`local_f5_tts` 请求必须带参考音频。可以通过 API 上传 clone voice,也可以手工准备目录:
```text
$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/voices/clones/my-f5-voice/
prompt.wav
prompt.txt
meta.json
```
`meta.json` 示例:
```json
{"provider":"local_f5_tts"}
```
上传后 `/api/voices?provider=local_f5_tts` 会返回可选 voice id;TTS preview、实时对话和视频生成都可以使用这个 voice。
## 启动命令
先启动 F5-TTS sidecar,再启动 OpenTalking:
```bash title="终端"
cd "$OPENTALKING_HOME"
export OPENTALKING_LOCAL_AUDIO_MODEL_ROOT="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$OPENTALKING_HOME/models/local-audio}"
export OPENTALKING_F5_TTS_VENV_DIR="$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system"
bash scripts/quickstart/start_local_f5_tts.sh --port 19095
export OPENTALKING_TTS_DEFAULT_PROVIDER=local_f5_tts
export OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL=http://127.0.0.1:19095/synthesize
python -m apps.api.main
```
## 验证命令
```bash title="终端"
curl -fsS http://127.0.0.1:19095/health
curl -fsS http://127.0.0.1:8000/health
```
TTS preview 应使用 `local_f5_tts` provider 和一个带 `prompt.wav` 的 clone voice。生成结果可以保存成 WAV 后用 ASR 或人工听检确认文本和音色。
## 实测记录
| 项目 | 命令 / 接口 | 目标 | 实测 |
|------|-------------|------|------|
| TTS preview | `/tts/preview` + SenseVoiceSmall ASR | 可播放 WAV,文本正确 | 通过:试听接口返回 16 kHz mono WAV;SenseVoiceSmall ASR 识别文本与目标文本一致。 |
| 实时对话 | local mode dialogue / warm TTS | RTF < 1.0 | 通过:warm RTF 0.278(3.31s 音频,0.918s 合成);历史复测 0.386/0.518,均低于 1 |
| 离线视频 | video generation API / CLI | 生成成功,音频驱动正常 | 通过:QuickTalk + F5 clone voice 可生成 MP4;ffprobe 显示 H.264 视频和 16 kHz mono AAC 音频。 |
## 常见错误
| 现象 | 处理 |
|------|------|
| `Missing F5-TTS checkpoint` | 确认 `model_1250000.safetensors` 位于 `SWivid__F5-TTS__F5TTS_v1_Base` 目录。 |
| `requires prompt_audio` | 选择 clone voice,或设置 `OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_AUDIO`。 |
| 依赖冲突 | 不要用 OpenTalking 主 `.venv` 启动 sidecar;建议使用 `$OPENTALKING_LOCAL_AUDIO_MODEL_ROOT/runtime/.venv-f5-tts-system` 这样的独立 venv,并复用宿主机已有 PyTorch/CUDA 环境。 |
| 首次请求慢 | 设置 `OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD=1` 并在服务启动后先做一次短句预热。 |
| QuickTalk v3 reshape 报错 | 使用当前 TorchScript 导出模型生成视频时保持 `OPENTALKING_QUICKTALK_RESOLUTION=256`;160/128 分辨率会让模型内部特征尺寸不匹配。 |

View File

@@ -149,6 +149,15 @@ def _flatten_config(raw: dict[str, Any] | None) -> dict[str, Any]:
"local_cosyvoice_max_token_text_ratio": "tts_local_cosyvoice_max_token_text_ratio",
"local_cosyvoice_min_token_text_ratio": "tts_local_cosyvoice_min_token_text_ratio",
"local_cosyvoice_mask_stop_tokens": "tts_local_cosyvoice_mask_stop_tokens",
"local_f5_tts_model": "tts_local_f5_tts_model",
"local_f5_tts_model_dir": "tts_local_f5_tts_model_dir",
"local_f5_tts_runtime_dir": "tts_local_f5_tts_runtime_dir",
"local_f5_tts_service_url": "tts_local_f5_tts_service_url",
"local_f5_tts_ckpt_file": "tts_local_f5_tts_ckpt_file",
"local_f5_tts_vocoder_local_path": "tts_local_f5_tts_vocoder_local_path",
"local_f5_tts_prompt_audio": "tts_local_f5_tts_prompt_audio",
"local_f5_tts_prompt_text": "tts_local_f5_tts_prompt_text",
"local_f5_tts_device": "tts_local_f5_tts_device",
"local_indextts_model": "tts_local_indextts_model",
"local_indextts_model_dir": "tts_local_indextts_model_dir",
"local_indextts_cfg_path": "tts_local_indextts_cfg_path",
@@ -516,6 +525,15 @@ class Settings(BaseSettings):
tts_local_cosyvoice_max_token_text_ratio: float = 6.0
tts_local_cosyvoice_min_token_text_ratio: float = 0.0
tts_local_cosyvoice_mask_stop_tokens: bool = True
tts_local_f5_tts_model: str = "SWivid/F5-TTS/F5TTS_v1_Base"
tts_local_f5_tts_model_dir: str = ""
tts_local_f5_tts_runtime_dir: str = ""
tts_local_f5_tts_service_url: str = ""
tts_local_f5_tts_ckpt_file: str = ""
tts_local_f5_tts_vocoder_local_path: str = ""
tts_local_f5_tts_prompt_audio: str = ""
tts_local_f5_tts_prompt_text: str = ""
tts_local_f5_tts_device: str = "auto"
tts_local_indextts_model: str = "IndexTeam/IndexTTS-2"
tts_local_indextts_model_dir: str = ""
tts_local_indextts_cfg_path: str = ""

View File

@@ -202,6 +202,42 @@ def _local_audio_asset_file_dir(name: str, relative_file: str, *fallback_names:
return _local_audio_asset_dir(name, relative_file, *fallback_names)
def _local_f5_tts_model() -> str:
    """Resolve the F5-TTS model id: provider override, then settings, then the built-in default."""
    override = _provider_env("local_f5_tts", "MODEL")
    if override:
        return override
    configured = _settings_value("tts_local_f5_tts_model", "")
    if configured:
        return configured
    return "SWivid/F5-TTS/F5TTS_v1_Base"
def _local_f5_tts_model_dir(model: str) -> str:
    """Resolve the model directory for ``model``.

    An explicit provider override or settings value wins. Otherwise the
    directory is ``<local audio model root>/<model with "/" replaced by "__">``.
    """
    override = _provider_env("local_f5_tts", "MODEL_DIR")
    if override:
        return override
    configured = _settings_value("tts_local_f5_tts_model_dir", "")
    if configured:
        return configured
    root = Path(_local_audio_model_root())
    return str(root / model.replace("/", "__"))
def _local_f5_tts_runtime_dir() -> str:
    """Return the F5-TTS runtime (source checkout) directory.

    Falls back to ``<local audio model root>/runtime/F5-TTS`` when neither the
    ``RUNTIME_DIR`` provider env value nor the setting is populated.
    """
    from_env = _provider_env("local_f5_tts", "RUNTIME_DIR")
    if from_env:
        return from_env
    from_settings = _settings_value("tts_local_f5_tts_runtime_dir", "")
    if from_settings:
        return from_settings
    return str(Path(_local_audio_model_root()) / "runtime" / "F5-TTS")
def _local_f5_tts_service_url() -> str:
    """Return the sidecar service URL, preferring the provider env value.

    When the env value is falsy the ``tts_local_f5_tts_service_url`` setting
    is returned as-is (which may be an empty string).
    """
    from_env = _provider_env("local_f5_tts", "SERVICE_URL")
    if from_env:
        return from_env
    return _settings_value("tts_local_f5_tts_service_url", "")
def _local_f5_tts_ckpt_file(model_dir: str) -> str:
    """Return the checkpoint path for the F5-TTS model.

    Defaults to ``model_1250000.safetensors`` inside ``model_dir`` when no
    ``CKPT_FILE`` provider env value or setting is configured.
    """
    from_env = _provider_env("local_f5_tts", "CKPT_FILE")
    if from_env:
        return from_env
    from_settings = _settings_value("tts_local_f5_tts_ckpt_file", "")
    if from_settings:
        return from_settings
    return str(Path(model_dir) / "model_1250000.safetensors")
def _local_f5_tts_vocoder_local_path() -> str:
    """Return the optional local vocoder path (provider env first, then setting)."""
    from_env = _provider_env("local_f5_tts", "VOCODER_LOCAL_PATH")
    if from_env:
        return from_env
    return _settings_value("tts_local_f5_tts_vocoder_local_path", "")
def _local_f5_tts_prompt_audio() -> str:
    """Return the default reference-audio path (provider env first, then setting)."""
    from_env = _provider_env("local_f5_tts", "PROMPT_AUDIO")
    if from_env:
        return from_env
    return _settings_value("tts_local_f5_tts_prompt_audio", "")
def _local_f5_tts_prompt_text() -> str:
    """Return the default reference transcript (provider env first, then setting)."""
    from_env = _provider_env("local_f5_tts", "PROMPT_TEXT")
    if from_env:
        return from_env
    return _settings_value("tts_local_f5_tts_prompt_text", "")
def _local_f5_tts_device() -> str:
    """Return the device string for local F5-TTS.

    Sources are consulted lazily in this order, and the first truthy value
    wins: provider env ``DEVICE``, the ``tts_local_f5_tts_device`` setting,
    ``OPENTALKING_LOCAL_TTS_DEVICE``, ``OPENTALKING_LOCAL_AUDIO_DEVICE``, the
    ``local_audio_device`` setting, and finally ``"auto"``.
    """
    from_env = _provider_env("local_f5_tts", "DEVICE")
    if from_env:
        return from_env
    from_settings = _settings_value("tts_local_f5_tts_device", "")
    if from_settings:
        return from_settings
    shared_tts_device = os.environ.get("OPENTALKING_LOCAL_TTS_DEVICE", "").strip()
    if shared_tts_device:
        return shared_tts_device
    shared_audio_device = os.environ.get("OPENTALKING_LOCAL_AUDIO_DEVICE", "").strip()
    if shared_audio_device:
        return shared_audio_device
    audio_device_setting = _settings_value("local_audio_device", "")
    if audio_device_setting:
        return audio_device_setting
    return "auto"
def _local_indextts_model() -> str:
return (
_provider_env("local_indextts", "MODEL")
@@ -671,6 +707,11 @@ def tts_provider_config(provider: str) -> dict[str, str | bool | int | float]:
"key_set": False,
"service_url_set": bool(service_url),
}
if p == "local_f5_tts":
model = _local_f5_tts_model()
model_dir = _local_f5_tts_model_dir(model)
service_url = _local_f5_tts_service_url()
return {"provider": p, "model": model, "model_dir": model_dir, "voice": "local-default", "device": _local_f5_tts_device(), "key_set": False, "service_url": service_url, "service_url_set": bool(service_url), "runtime_dir": _local_f5_tts_runtime_dir(), "ckpt_file": _local_f5_tts_ckpt_file(model_dir), "vocoder_local_path": _local_f5_tts_vocoder_local_path(), "prompt_audio_set": bool(_local_f5_tts_prompt_audio())}
if p == "local_indextts":
model = _local_indextts_model()
model_dir = _local_indextts_model_dir(model)
@@ -904,6 +945,10 @@ def tts_log_profile(
service = os.environ.get("OPENTALKING_LOCAL_QWEN3_TTS_SERVICE_URL", "").strip() or "(unset)"
return f"TTS_API=local_qwen3_tts | model={model!r} service={service!r} | {req_part}"
if p == "local_f5_tts":
model = (tts_model_override or "").strip() or _local_f5_tts_model()
return f"TTS_API=local_f5_tts | model={model!r} device={_local_f5_tts_device()!r} prompt_audio_set={bool(_local_f5_tts_prompt_audio())} | {req_part}"
if p == "local_indextts":
model = (tts_model_override or "").strip() or _local_indextts_model()
return (
@@ -1038,6 +1083,11 @@ def create_tts_adapter(
chunk_ms=chunk_ms,
model=tts_model,
)
if p == "local_f5_tts":
from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter
model = (tts_model or "").strip() or _local_f5_tts_model()
model_dir = _local_f5_tts_model_dir(model)
return LocalF5TTSAdapter(default_voice=default_voice, sample_rate=sample_rate, chunk_ms=chunk_ms, model=model, model_dir=model_dir, runtime_dir=_local_f5_tts_runtime_dir(), ckpt_file=_local_f5_tts_ckpt_file(model_dir), vocoder_local_path=_local_f5_tts_vocoder_local_path(), service_url=_local_f5_tts_service_url(), prompt_audio=_local_f5_tts_prompt_audio(), prompt_text=_local_f5_tts_prompt_text(), device=_local_f5_tts_device())
if p == "local_indextts":
from opentalking.providers.tts.local_indextts.adapter import LocalIndexTTSAdapter

View File

@@ -0,0 +1,5 @@
from __future__ import annotations
from .adapter import LocalF5TTSAdapter
__all__ = ["LocalF5TTSAdapter"]

View File

@@ -0,0 +1,182 @@
from __future__ import annotations
import io
import os
import re
import wave
from collections.abc import AsyncIterator
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import httpx
import numpy as np
from opentalking.core.types.frames import AudioChunk
from opentalking.providers.tts.voice_assets import LOCAL_F5_TTS_PROVIDER, resolve_voice_asset
def _settings_value(name: str, default: str = "") -> str:
    """Read attribute ``name`` from the application settings as a stripped string.

    Returns ``default`` when the settings module cannot be imported or loaded,
    when the attribute is ``None``, or when it is blank after stripping.
    """
    try:
        from opentalking.core.config import get_settings

        raw = getattr(get_settings(), name, default)
        text = "" if raw is None else str(raw).strip()
        if text:
            return text
    except Exception:
        # Best-effort lookup: any failure falls through to the default.
        pass
    return default
def _local_audio_model_root() -> Path:
    """Return the absolute root directory for local audio models.

    ``OPENTALKING_LOCAL_AUDIO_MODEL_ROOT`` takes precedence; the settings'
    ``local_audio_model_root`` is consulted only when the env var is blank,
    and ``./models/local-audio`` is the final fallback.
    """
    configured = os.environ.get("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "").strip()
    try:
        from opentalking.core.config import get_settings

        if not configured:
            configured = (get_settings().local_audio_model_root or "").strip()
    except Exception:
        # Settings are optional here; keep whatever the environment provided.
        pass
    if not configured:
        configured = "./models/local-audio"
    return Path(configured).expanduser().resolve()
def _audio_format_from_content_type(content_type: str | None) -> str | None:
    """Map an HTTP ``Content-Type`` value to ``"wav"``, ``"pcm"`` or ``"mp3"``.

    Parameters after the first ``;`` are ignored and matching is
    case-insensitive. Unknown or missing types yield ``None``.
    """
    mime = (content_type or "").split(";", 1)[0].strip().lower()
    formats_by_mime = {
        "audio/wav": "wav",
        "audio/wave": "wav",
        "audio/x-wav": "wav",
        "audio/l16": "pcm",
        "audio/pcm": "pcm",
        "application/octet-stream": "pcm",
        "audio/mpeg": "mp3",
        "audio/mp3": "mp3",
    }
    return formats_by_mime.get(mime)
def _source_sample_rate_from_headers(headers: Any, fallback: int) -> int:
    """Extract the source sample rate advertised by an HTTP response.

    The ``x-audio-sample-rate`` header is preferred; otherwise a ``rate=``
    parameter on ``content-type`` is used. Only strictly positive ASCII
    decimal values are accepted: a rate of ``0`` would later divide by zero
    during resampling, and non-ASCII digit characters pass ``str.isdigit``
    but make ``int()`` raise.

    Args:
        headers: Mapping-like object exposing ``get(name, default)``.
        fallback: Rate returned when no usable value is advertised.

    Returns:
        The advertised sample rate in Hz, or ``fallback``.
    """

    def _positive_int(raw: str) -> int | None:
        # Accept plain ASCII digits only, and reject zero.
        raw = raw.strip()
        if raw.isascii() and raw.isdigit():
            parsed = int(raw)
            if parsed > 0:
                return parsed
        return None

    direct = _positive_int(str(headers.get("x-audio-sample-rate", "") or ""))
    if direct is not None:
        return direct
    content_type = str(headers.get("content-type", "") or "")
    for part in content_type.split(";")[1:]:
        key, sep, value = part.strip().partition("=")
        if sep and key.strip().lower() == "rate":
            rate = _positive_int(value)
            if rate is not None:
                return rate
    return fallback
def _resample_linear(pcm: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
    """Resample int16 PCM from ``src_sr`` to ``dst_sr`` by linear interpolation.

    The input is flattened to one dimension. Empty input, or identical rates,
    returns a copy of the flattened samples unchanged.
    """
    samples = np.asarray(pcm, dtype=np.int16).reshape(-1)
    if src_sr == dst_sr or samples.size == 0:
        return samples.copy()
    n_src = samples.size
    n_dst = max(1, int(round(n_src * dst_sr / src_sr)))
    normalized = samples.astype(np.float32) / 32768.0
    # Sample the source at evenly spaced positions spanning first..last sample.
    positions = np.linspace(0.0, n_src - 1.0, num=n_dst)
    interpolated = np.interp(positions, np.arange(n_src), normalized)
    rescaled = np.round(interpolated * 32768.0)
    return np.clip(rescaled, -32768, 32767).astype(np.int16)
def _split_pcm_chunks(pcm: np.ndarray, sr: int, chunk_ms: float) -> list[AudioChunk]:
    """Slice ``pcm`` into consecutive ``AudioChunk`` objects of ``chunk_ms`` each.

    Every chunk holds ``max(1, int(sr * chunk_ms / 1000))`` samples except
    possibly the last, which holds the remainder. Empty slices are dropped.
    """
    step = max(1, int(sr * (chunk_ms / 1000.0)))
    segments = (pcm[start : start + step] for start in range(0, len(pcm), step))
    return [
        AudioChunk(
            data=segment.astype(np.int16),
            sample_rate=sr,
            duration_ms=1000.0 * segment.size / sr,
        )
        for segment in segments
        if segment.size != 0
    ]
def _read_wav_bytes_i16(raw: bytes) -> tuple[np.ndarray, int]:
    """Decode an in-memory 16-bit WAV file into mono int16 samples.

    Multi-channel audio is downmixed by averaging the channels of each frame.

    Returns:
        ``(samples, sample_rate)``.

    Raises:
        RuntimeError: If the WAV sample width is not 2 bytes.
    """
    with wave.open(io.BytesIO(raw), "rb") as reader:
        rate = int(reader.getframerate())
        n_channels = int(reader.getnchannels())
        width = int(reader.getsampwidth())
        frames = reader.readframes(reader.getnframes())
    if width != 2:
        raise RuntimeError(f"Unsupported WAV sample width for local F5-TTS: {width}")
    samples = np.frombuffer(frames, dtype="<i2").astype(np.int16, copy=False)
    if n_channels <= 1:
        return samples, rate
    # Drop any trailing partial frame before de-interleaving.
    n_frames = samples.size // n_channels
    interleaved = samples[: n_frames * n_channels].reshape(n_frames, n_channels)
    return interleaved.mean(axis=1).astype(np.int16), rate
@dataclass(frozen=True)
class F5VoicePrompt:
    """Immutable reference-voice prompt sent to the F5-TTS service."""

    # Path to the reference audio clip.
    prompt_audio: Path
    # Transcript paired with the reference audio; may be an empty string.
    prompt_text: str
class LocalF5TTSAdapter:
    """TTS adapter that delegates synthesis to a local F5-TTS HTTP sidecar.

    The adapter posts a JSON request to ``service_url`` and converts the
    response (raw PCM, WAV, or another encoded format) into ``AudioChunk``
    objects at ``sample_rate`` with a nominal duration of ``chunk_ms``.
    """

    def __init__(
        self,
        default_voice: str | None = None,
        sample_rate: int = 16000,
        chunk_ms: float = 20.0,
        *,
        model: str | None = None,
        model_dir: str | None = None,
        runtime_dir: str | None = None,
        ckpt_file: str | None = None,
        vocoder_local_path: str | None = None,
        service_url: str | None = None,
        prompt_audio: str | None = None,
        prompt_text: str | None = None,
        device: str = "auto",
    ) -> None:
        """Store configuration, filling unset paths from the local model root.

        ``service_url`` falls back to ``OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL``
        and then to the ``tts_local_f5_tts_service_url`` setting.
        """
        self.default_voice = default_voice or "local-default"
        self.sample_rate = sample_rate
        self.chunk_ms = chunk_ms
        self.model = (model or "SWivid/F5-TTS/F5TTS_v1_Base").strip()
        self.model_dir = str(Path(model_dir or _local_audio_model_root() / self.model.replace("/", "__")).expanduser())
        self.runtime_dir = str(Path(runtime_dir or _local_audio_model_root() / "runtime" / "F5-TTS").expanduser())
        self.ckpt_file = str(Path(ckpt_file or Path(self.model_dir) / "model_1250000.safetensors").expanduser())
        self.vocoder_local_path = str(Path(vocoder_local_path).expanduser()) if vocoder_local_path else ""
        self.service_url = (
            service_url
            or os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL", "").strip()
            or _settings_value("tts_local_f5_tts_service_url", "")
        ).strip()
        self.prompt_audio = str(Path(prompt_audio).expanduser()) if prompt_audio else ""
        self.prompt_text = (prompt_text or "").strip()
        self.device = device or "auto"

    async def synthesize_stream(self, text: str, voice: str | None = None) -> AsyncIterator[AudioChunk]:
        """Yield audio chunks for ``text``; blank text yields nothing.

        Raises:
            RuntimeError: If no sidecar service URL is configured.
        """
        if not text.strip():
            return
        if not self.service_url:
            raise RuntimeError("Local F5-TTS requires OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL. Run scripts/quickstart/start_local_f5_tts.sh first.")
        async for chunk in self._synthesize_via_service(text, voice=voice):
            yield chunk

    def _resolve_voice_prompt(self, voice: str | None) -> F5VoicePrompt | None:
        """Pick the reference prompt for ``voice``.

        A voice id other than ``local-default`` that matches the id pattern is
        looked up through ``resolve_voice_asset``. When that yields nothing,
        the adapter-level ``prompt_audio`` / ``prompt_text`` is used if set.
        Returns ``None`` when no prompt is available.
        """
        voice_id = (voice or "").strip()
        if voice_id and voice_id != "local-default" and re.fullmatch(r"[A-Za-z0-9_-]{3,80}", voice_id):
            asset = resolve_voice_asset(
                voice_id,
                provider=LOCAL_F5_TTS_PROVIDER,
                sources=("clones", "system"),
                model_root=_local_audio_model_root(),
                require_prompt_text=False,
            )
            if asset is not None:
                text = asset.prompt_text.read_text(encoding="utf-8").strip() if asset.prompt_text else ""
                return F5VoicePrompt(prompt_audio=asset.prompt_audio, prompt_text=text)
        if self.prompt_audio:
            return F5VoicePrompt(prompt_audio=Path(self.prompt_audio), prompt_text=self.prompt_text)
        return None

    async def _stream_pcm_response(self, resp: Any) -> AsyncIterator[AudioChunk]:
        """Convert a raw little-endian int16 PCM response body into chunks.

        Network reads arrive at arbitrary sizes, so two kinds of leftovers are
        carried between reads: a dangling odd byte (half a sample) and any
        resampled samples that do not fill a whole ``chunk_ms`` window. This
        way only the very last chunk of the stream can be shorter than
        ``chunk_ms``.
        """
        source_sr = _source_sample_rate_from_headers(resp.headers, self.sample_rate)
        # Must mirror the window size used by _split_pcm_chunks.
        samples_per_chunk = max(1, int(self.sample_rate * (self.chunk_ms / 1000.0)))
        odd_byte = b""
        carry = np.zeros(0, dtype=np.int16)
        async for data in resp.aiter_bytes():
            if not data:
                continue
            data = odd_byte + data
            usable = len(data) - (len(data) % 2)
            odd_byte = data[usable:]
            if not usable:
                continue
            pcm = np.frombuffer(data[:usable], dtype="<i2").astype(np.int16, copy=False)
            carry = np.concatenate((carry, _resample_linear(pcm, source_sr, self.sample_rate)))
            whole = (carry.size // samples_per_chunk) * samples_per_chunk
            if whole:
                for chunk in _split_pcm_chunks(carry[:whole], self.sample_rate, self.chunk_ms):
                    yield chunk
                carry = carry[whole:]
        if carry.size:
            # Flush the final partial window once the stream is exhausted.
            for chunk in _split_pcm_chunks(carry, self.sample_rate, self.chunk_ms):
                yield chunk

    async def _synthesize_via_service(self, text: str, voice: str | None = None) -> AsyncIterator[AudioChunk]:
        """POST the synthesis request to the sidecar and yield decoded chunks.

        The response ``content-type`` selects the decode path: raw PCM is
        streamed incrementally, WAV is read fully and then split, and anything
        else is handed to the shared streaming decoder from the edge adapter.
        """
        timeout = httpx.Timeout(connect=30.0, read=180.0, write=30.0, pool=30.0)
        payload: dict[str, Any] = {
            "text": text,
            "voice": voice or self.default_voice,
            "model": self.model,
            "sample_rate": self.sample_rate,
        }
        prompt = self._resolve_voice_prompt(voice or self.default_voice)
        if prompt is not None:
            payload["prompt_audio"] = str(prompt.prompt_audio)
            payload["prompt_text"] = prompt.prompt_text
        async with httpx.AsyncClient(timeout=timeout) as client:
            async with client.stream("POST", self.service_url, json=payload) as resp:
                resp.raise_for_status()
                input_format = _audio_format_from_content_type(resp.headers.get("content-type"))
                if input_format == "pcm":
                    async for chunk in self._stream_pcm_response(resp):
                        yield chunk
                    return
                if input_format == "wav":
                    pcm, source_sr = _read_wav_bytes_i16(await resp.aread())
                    pcm = _resample_linear(pcm, source_sr, self.sample_rate)
                    for chunk in _split_pcm_chunks(pcm, self.sample_rate, self.chunk_ms):
                        yield chunk
                    return

                from opentalking.providers.tts.edge.adapter import _stream_decode_audio_to_pcm_chunks

                async def _audio_iter() -> AsyncIterator[bytes]:
                    async for data in resp.aiter_bytes():
                        if data:
                            yield data

                async for chunk in _stream_decode_audio_to_pcm_chunks(
                    _audio_iter(), self.sample_rate, self.chunk_ms, input_format=input_format
                ):
                    yield chunk

View File

@@ -6,7 +6,7 @@ XIAOMI_MIMO_TTS_PROVIDERS = frozenset({"xiaomi_mimo", "xiaomi", "mimo"})
QWEN_TTS_PROVIDERS = frozenset({"dashscope", "bailian", "qwen", "qwen_tts"})
COSYVOICE_TTS_PROVIDERS = frozenset({"cosyvoice", "cosyvoice_http"})
SAMBERT_TTS_PROVIDERS = frozenset({"sambert", "dashscope_sambert"})
LOCAL_TTS_PROVIDERS = frozenset({"local_cosyvoice", "local_qwen3_tts", "local_indextts"})
LOCAL_TTS_PROVIDERS = frozenset({"local_cosyvoice", "local_qwen3_tts", "local_indextts", "local_f5_tts"})
OMNIRT_TTS_PROVIDERS = frozenset({"omnirt_indextts"})
INDEXTTS_TTS_PROVIDERS = frozenset({"indextts"})
BAILIAN_TTS_PROVIDERS = (

View File

@@ -11,6 +11,7 @@ INDEXTTS_PROVIDER = "indextts"
INDEXTTS_LEGACY_PROVIDERS = {"local_indextts", "omnirt_indextts"}
INDEXTTS_PROVIDERS = {INDEXTTS_PROVIDER, *INDEXTTS_LEGACY_PROVIDERS}
LOCAL_COSYVOICE_PROVIDER = "local_cosyvoice"
LOCAL_F5_TTS_PROVIDER = "local_f5_tts"
@dataclass(frozen=True)
@@ -66,6 +67,8 @@ def _provider_aliases(provider: str) -> set[str]:
normalized = provider.strip().lower()
if normalized in INDEXTTS_PROVIDERS:
return {INDEXTTS_PROVIDER, *INDEXTTS_LEGACY_PROVIDERS}
if normalized == LOCAL_F5_TTS_PROVIDER:
return {LOCAL_F5_TTS_PROVIDER}
return {normalized}
@@ -83,6 +86,8 @@ def voice_applies_to_provider(meta: dict[str, Any], provider: str, *, bundled_sy
return True
if bundled_system and normalized == LOCAL_COSYVOICE_PROVIDER:
return True
if bundled_system and normalized == LOCAL_F5_TTS_PROVIDER:
return True
if any(_truthy_meta_flag(meta.get(key)) for key in ("universal", "compatible", "zero_shot_compatible")):
return True
aliases = _provider_aliases(normalized)

View File

@@ -18,6 +18,8 @@ DEFAULT_REUSE_ROOTS = (
MODELS: dict[str, tuple[str, str]] = {
"sensevoice-small": ("modelscope", "iic/SenseVoiceSmall"),
"fun-cosyvoice3-0.5b-2512": ("modelscope", "FunAudioLLM/Fun-CosyVoice3-0.5B-2512"),
"f5-tts-v1-base": ("hf", "SWivid/F5-TTS"),
"f5-tts-vocos": ("hf", "charactr/vocos-mel-24khz"),
"indextts2": ("modelscope", "IndexTeam/IndexTTS-2"),
"indextts2-w2v-bert": ("hf", "facebook/w2v-bert-2.0"),
"indextts2-maskgct": ("hf", "amphion/MaskGCT"),
@@ -26,6 +28,12 @@ MODELS: dict[str, tuple[str, str]] = {
}
HF_ALLOW_PATTERNS: dict[str, list[str]] = {
"f5-tts-v1-base": [
"README.md",
"F5TTS_v1_Base/model_1250000.safetensors",
"F5TTS_v1_Base/vocab.txt",
],
"f5-tts-vocos": ["config.yaml", "pytorch_model.bin"],
# IndexTTS2 only needs the feature extractor, model weights, and conformer shim.
"indextts2-w2v-bert": [
"README.md",
@@ -65,6 +73,8 @@ HF_ALLOW_PATTERNS: dict[str, list[str]] = {
MODEL_HINTS: dict[str, tuple[str, ...]] = {
"sensevoice-small": ("iic__SenseVoiceSmall", "sensevoice", "SenseVoiceSmall"),
"f5-tts-v1-base": ("SWivid__F5-TTS__F5TTS_v1_Base", "F5TTS_v1_Base", "SWivid__F5-TTS"),
"f5-tts-vocos": ("charactr__vocos-mel-24khz", "vocos-mel-24khz"),
"fun-cosyvoice3-0.5b-2512": (
"FunAudioLLM__Fun-CosyVoice3-0.5B-2512",
"Fun-CosyVoice3-0.5B-2512",
@@ -79,6 +89,8 @@ MODEL_HINTS: dict[str, tuple[str, ...]] = {
MODEL_REQUIRED_FILES: dict[str, tuple[str, ...]] = {
"sensevoice-small": ("model.pt", "config.yaml", "configuration.json"),
"f5-tts-v1-base": ("model_1250000.safetensors",),
"f5-tts-vocos": ("config.yaml", "pytorch_model.bin"),
"fun-cosyvoice3-0.5b-2512": ("cosyvoice3.yaml", "flow.pt", "hift.pt", "llm.pt"),
"indextts2": ("config.yaml", "model.pt"),
"indextts2-w2v-bert": ("model.safetensors", "conformer_shaw.pt"),
@@ -96,7 +108,9 @@ def local_audio_model_ids() -> tuple[str, ...]:
return tuple(model_id for _, model_id in MODELS.values())
def _target(root: Path, model_id: str) -> Path:
def _target(root: Path, model_id: str, *, model_key: str | None = None) -> Path:
    """Return the local directory under ``root`` for a model.

    The ``f5-tts-v1-base`` key maps to a fixed folder name; every other model
    uses its id with ``/`` replaced by ``__``.
    """
    if model_key == "f5-tts-v1-base":
        folder = "SWivid__F5-TTS__F5TTS_v1_Base"
    else:
        folder = model_id.replace("/", "__")
    return root / folder
@@ -167,12 +181,26 @@ def _download_hf(model_id: str, target: Path, *, model_key: str) -> None:
from huggingface_hub import snapshot_download
endpoint = os.environ.get("HF_ENDPOINT", "").strip()
kwargs = {"repo_id": model_id, "local_dir": str(target)}
download_dir = target.parent / "SWivid__F5-TTS" if model_key == "f5-tts-v1-base" else target
kwargs = {"repo_id": model_id, "local_dir": str(download_dir)}
if endpoint:
kwargs["endpoint"] = endpoint
if patterns := HF_ALLOW_PATTERNS.get(model_key):
kwargs["allow_patterns"] = patterns
snapshot_download(**kwargs)
if model_key == "f5-tts-v1-base":
nested = download_dir / "F5TTS_v1_Base"
if nested.exists() and not _is_model_ready(target, model_key=model_key):
target.parent.mkdir(parents=True, exist_ok=True)
if target.exists() and target.is_dir() and not any(target.iterdir()):
target.rmdir()
if not target.exists():
try:
target.symlink_to(nested, target_is_directory=True)
except Exception:
shutil.copytree(nested, target)
else:
shutil.copytree(nested, target, dirs_exist_ok=True)
def _git_lfs_pull_if_needed(target: Path) -> None:
@@ -207,7 +235,7 @@ def main() -> None:
failures: list[tuple[str, str]] = []
for key in selected:
source, model_id = MODELS[key]
target = _target(root, model_id)
target = _target(root, model_id, model_key=key)
print(f"[{key}] {source}:{model_id} -> {target}", flush=True)
target.mkdir(parents=True, exist_ok=True)
try:

View File

@@ -0,0 +1,265 @@
from __future__ import annotations
import argparse
import os
import sys
import threading
import time
from collections.abc import Iterator
from pathlib import Path
from typing import Any
import numpy as np
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
DEFAULT_MODEL = "SWivid/F5-TTS/F5TTS_v1_Base"
DEFAULT_SERVICE_SAMPLE_RATE = 24000
def _local_audio_model_root() -> Path:
    """Return the absolute root directory for local audio models.

    Reads ``OPENTALKING_LOCAL_AUDIO_MODEL_ROOT``. A missing *or blank* value
    falls back to ``./models/local-audio``; without the blank check an
    exported-but-empty variable would silently resolve to the current working
    directory.
    """
    raw = os.environ.get("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "").strip()
    return Path(raw or "./models/local-audio").expanduser().resolve()
def _default_model_dir(root: Path) -> Path:
    """Return the default F5-TTS weights directory beneath ``root``."""
    folder_name = "SWivid__F5-TTS__F5TTS_v1_Base"
    return root.joinpath(folder_name)
def _default_runtime_dir(root: Path) -> Path:
    """Return the default F5-TTS runtime directory beneath ``root``."""
    return root.joinpath("runtime", "F5-TTS")
def _env_bool(name: str, default: bool) -> bool:
    """Interpret environment variable ``name`` as a boolean flag.

    An unset or blank variable yields ``default``. Otherwise only ``1``,
    ``true``, ``yes`` and ``on`` (case-insensitive) count as true.
    """
    text = os.environ.get(name, "").strip().lower()
    if text:
        return text in {"1", "true", "yes", "on"}
    return default
def _select_device(value: str) -> str | None:
    """Resolve the torch device string to use.

    Any explicit value other than ``"auto"`` is returned stripped. For
    ``"auto"`` or blank input, ``"cuda"`` is chosen when torch reports CUDA is
    available, else ``"cpu"``. ``None`` is returned if torch cannot be queried.
    """
    requested = value.strip()
    if requested not in ("", "auto"):
        return requested
    try:
        import torch

        if torch.cuda.is_available():
            return "cuda"
        return "cpu"
    except Exception:
        return None
def _pcm_bytes(wav: Any) -> bytes:
    """Serialize a waveform as little-endian signed 16-bit PCM bytes.

    Inputs with more than one dimension are averaged along axis 1. Floating
    point samples are clipped to [-1, 1] and scaled by 32767; integer samples
    are clipped to the int16 range.
    """
    samples = np.asarray(wav)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    if np.issubdtype(samples.dtype, np.floating):
        quantized = np.round(np.clip(samples, -1.0, 1.0) * 32767.0)
    else:
        quantized = np.clip(samples, -32768, 32767)
    return quantized.astype("<i2").reshape(-1).tobytes()
def _soundfile_torchaudio_load(path: str | Path, *args: Any, **kwargs: Any):
    """Load audio with ``soundfile`` and return ``(tensor, sample_rate)``.

    Written as a stand-in for ``torchaudio.load``. Only the ``normalize``,
    ``frame_offset`` and ``num_frames`` keyword arguments are honoured;
    positional extras and any other keywords are accepted but ignored. The
    tensor is laid out channels-first.
    """
    import soundfile as sf
    import torch

    samples, rate = sf.read(str(path), dtype="float32", always_2d=True)
    # soundfile yields (frames, channels); transpose to (channels, frames).
    waveform = torch.from_numpy(samples.T.copy())
    if not kwargs.get("normalize", True):
        waveform = (waveform.clamp(-1.0, 1.0) * 32767.0).to(torch.int16)
    offset = int(kwargs.get("frame_offset", 0) or 0)
    count = int(kwargs.get("num_frames", -1) or -1)
    if offset > 0 or count >= 0:
        stop = offset + count if count >= 0 else None
        waveform = waveform[:, offset:stop]
    return waveform, int(rate)
def _patch_torchaudio_load() -> None:
    """Replace ``torchaudio.load`` with the soundfile-backed loader, once.

    Does nothing when torchaudio cannot be imported. A marker attribute on
    the module prevents patching more than once.
    """
    try:
        import torchaudio
    except Exception:
        return
    already_patched = getattr(torchaudio, "_opentalking_soundfile_load_patched", False)
    if not already_patched:
        torchaudio.load = _soundfile_torchaudio_load
        torchaudio._opentalking_soundfile_load_patched = True
class SynthesizeRequest(BaseModel):
    """JSON body accepted by the ``POST /synthesize`` endpoint."""

    # Text to synthesize (required).
    text: str
    # Voice id sent by the client; not read by the /synthesize handler in this file.
    voice: str | None = None
    # Model id sent by the client; not read by the /synthesize handler in this file.
    model: str | None = None
    # Requested output rate; not read by the /synthesize handler in this file.
    sample_rate: int | None = None
    # Path to the reference audio used for voice cloning.
    prompt_audio: str | None = None
    # Transcript of the reference audio.
    prompt_text: str | None = None
    # Optional speed value forwarded to the engine's infer() call.
    speed: float | None = None
    # Optional nfe_step value forwarded to the engine's infer() call.
    nfe_step: int | None = None
class F5TTSService:
    """Lazily loaded wrapper around the ``f5_tts.api.F5TTS`` engine.

    Each constructor argument falls back to its
    ``OPENTALKING_TTS_LOCAL_F5_TTS_*`` environment variable and then to a
    default derived from the local audio model root.
    """

    def __init__(
        self,
        *,
        model: str | None = None,
        model_dir: str | Path | None = None,
        runtime_dir: str | Path | None = None,
        ckpt_file: str | Path | None = None,
        vocoder_local_path: str | Path | None = None,
        prompt_audio: str | Path | None = None,
        prompt_text: str | None = None,
        device: str | None = None,
        preload: bool = True,
    ) -> None:
        """Resolve configuration and, when ``preload`` is true, load the engine."""
        env = os.environ.get
        audio_root = _local_audio_model_root()

        chosen_model = model or env("OPENTALKING_TTS_LOCAL_F5_TTS_MODEL") or DEFAULT_MODEL
        self.model = chosen_model.strip()

        chosen_model_dir = (
            model_dir or env("OPENTALKING_TTS_LOCAL_F5_TTS_MODEL_DIR") or _default_model_dir(audio_root)
        )
        self.model_dir = Path(chosen_model_dir).expanduser()

        chosen_runtime_dir = (
            runtime_dir or env("OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR") or _default_runtime_dir(audio_root)
        )
        self.runtime_dir = Path(chosen_runtime_dir).expanduser()

        chosen_ckpt = (
            ckpt_file
            or env("OPENTALKING_TTS_LOCAL_F5_TTS_CKPT_FILE")
            or self.model_dir / "model_1250000.safetensors"
        )
        self.ckpt_file = Path(chosen_ckpt).expanduser()

        # Optional paths become None when blank.
        vocoder_raw = vocoder_local_path or env("OPENTALKING_TTS_LOCAL_F5_TTS_VOCODER_LOCAL_PATH") or ""
        if str(vocoder_raw).strip():
            self.vocoder_local_path = Path(vocoder_raw).expanduser()
        else:
            self.vocoder_local_path = None

        prompt_raw = prompt_audio or env("OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_AUDIO") or ""
        if str(prompt_raw).strip():
            self.prompt_audio = Path(prompt_raw).expanduser()
        else:
            self.prompt_audio = None

        chosen_prompt_text = prompt_text or env("OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_TEXT") or ""
        self.prompt_text = chosen_prompt_text.strip()
        self.device = device or env("OPENTALKING_TTS_LOCAL_F5_TTS_DEVICE") or "auto"

        self._engine: Any | None = None
        self._lock = threading.Lock()
        if preload:
            self.engine()

    def engine(self) -> Any:
        """Return the engine, constructing it on first use under a lock.

        Raises:
            RuntimeError: If the checkpoint file does not exist.
        """
        with self._lock:
            if self._engine is None:
                if not self.ckpt_file.exists():
                    raise RuntimeError(f"Missing F5-TTS checkpoint: {self.ckpt_file}")
                # Make the runtime checkout importable before importing f5_tts.
                for candidate in (self.runtime_dir / "src", self.runtime_dir):
                    if candidate.exists() and str(candidate) not in sys.path:
                        sys.path.insert(0, str(candidate))
                _patch_torchaudio_load()
                from f5_tts.api import F5TTS

                engine_kwargs: dict[str, Any] = {
                    "model": "F5TTS_v1_Base",
                    "ckpt_file": str(self.ckpt_file),
                }
                if self.vocoder_local_path is not None:
                    engine_kwargs["vocoder_local_path"] = str(self.vocoder_local_path)
                resolved_device = _select_device(self.device)
                if resolved_device:
                    engine_kwargs["device"] = resolved_device
                self._engine = F5TTS(**engine_kwargs)
            return self._engine

    def health(self) -> dict[str, Any]:
        """Return a status snapshot describing configuration and load state."""
        status: dict[str, Any] = {"ok": True, "provider": "local_f5_tts"}
        status["model"] = self.model
        status["model_dir"] = str(self.model_dir)
        status["runtime_dir"] = str(self.runtime_dir)
        status["ckpt_file"] = str(self.ckpt_file)
        status["ckpt_exists"] = self.ckpt_file.exists()
        status["loaded"] = self._engine is not None
        status["device"] = self.device
        return status

    def prewarm(self, text: str = "你好。") -> None:
        """Load the engine and, if a default prompt exists, run one synthesis."""
        has_prompt = self.prompt_audio is not None and self.prompt_audio.exists()
        if not has_prompt:
            self.engine()
            return
        # Drain the generator so the inference actually executes.
        list(self.synthesize(text=text, prompt_audio=self.prompt_audio, prompt_text=self.prompt_text))

    def synthesize(
        self,
        *,
        text: str,
        prompt_audio: str | Path | None = None,
        prompt_text: str | None = None,
        speed: float | None = None,
        nfe_step: int | None = None,
    ) -> Iterator[bytes]:
        """Yield the synthesized audio for ``text`` as int16 PCM bytes.

        Blank text yields nothing. Per-call ``prompt_audio`` / ``prompt_text``
        override the service defaults.

        Raises:
            RuntimeError: If no existing reference audio file is available.
        """
        cleaned = text.strip()
        if not cleaned:
            return
        if prompt_audio:
            ref_audio = Path(prompt_audio).expanduser()
        else:
            ref_audio = self.prompt_audio
        if ref_audio is None or not ref_audio.exists():
            raise RuntimeError("Local F5-TTS requires prompt_audio for voice cloning.")
        if prompt_text is None:
            ref_text = self.prompt_text.strip()
        else:
            ref_text = prompt_text.strip()
        infer_kwargs: dict[str, Any] = {
            "ref_file": str(ref_audio),
            "ref_text": ref_text,
            "gen_text": cleaned,
            "show_info": lambda *_args, **_kwargs: None,
            "progress": None,
        }
        if speed is not None:
            infer_kwargs["speed"] = float(speed)
        if nfe_step is not None:
            infer_kwargs["nfe_step"] = int(nfe_step)
        wav, _sr, _spec = self.engine().infer(**infer_kwargs)
        yield _pcm_bytes(wav)
def create_app(service: F5TTSService | None = None) -> FastAPI:
    """Build the FastAPI application exposing ``/health`` and ``/synthesize``.

    When no service is supplied, one is created with preloading controlled by
    ``OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD`` (default on).
    """
    if not service:
        service = F5TTSService(preload=_env_bool("OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD", True))
    app = FastAPI(title="OpenTalking Local F5-TTS Service")

    @app.get("/health")
    def health() -> dict[str, Any]:
        return service.health()

    @app.post("/synthesize")
    def synthesize(request: SynthesizeRequest) -> StreamingResponse:
        # Synthesis is fully materialised first so failures surface as HTTP 500
        # rather than a broken stream.
        try:
            started = time.perf_counter()
            pcm_parts = list(
                service.synthesize(
                    text=request.text,
                    prompt_audio=request.prompt_audio,
                    prompt_text=request.prompt_text,
                    speed=request.speed,
                    nfe_step=request.nfe_step,
                )
            )
            elapsed = max(time.perf_counter() - started, 0.001)
        except Exception as err:
            raise HTTPException(status_code=500, detail=str(err)) from err
        response_headers = {
            "X-Audio-Sample-Rate": str(DEFAULT_SERVICE_SAMPLE_RATE),
            "X-OpenTalking-Elapsed": f"{elapsed:.3f}",
        }
        content_type = f"audio/L16; rate={DEFAULT_SERVICE_SAMPLE_RATE}; channels=1"
        return StreamingResponse(iter(pcm_parts), media_type=content_type, headers=response_headers)

    return app
def main() -> None:
    """Parse CLI flags and serve the sidecar app with uvicorn.

    Host and port default to ``OPENTALKING_TTS_LOCAL_F5_TTS_HOST`` /
    ``..._PORT`` (127.0.0.1 and 19095). Preloading is disabled by
    ``--no-preload`` and otherwise follows
    ``OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD``.
    """
    parser = argparse.ArgumentParser(description="Run the OpenTalking local F5-TTS sidecar.")
    default_host = os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_HOST", "127.0.0.1")
    parser.add_argument("--host", default=default_host)
    default_port = int(os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_PORT", "19095"))
    parser.add_argument("--port", type=int, default=default_port)
    parser.add_argument("--no-preload", action="store_true")
    args = parser.parse_args()

    import uvicorn

    if args.no_preload:
        preload = False
    else:
        preload = _env_bool("OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD", True)
    app = create_app(F5TTSService(preload=preload))
    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env bash
# Quickstart launcher for the local F5-TTS sidecar service.
# Fail fast: abort on errors, unset variables, and failures inside pipelines.
set -euo pipefail
# Directory containing this script.
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# Repository root, two levels above the script directory.
repo_root="$(cd -- "$script_dir/../.." && pwd)"
# Parent of the repository; used as the default DIGITAL_HUMAN_HOME.
default_home="$(cd -- "$repo_root/.." && pwd)"
# shellcheck disable=SC1091
source "$script_dir/_helpers.sh"
# Print command-line help to stdout.
# The heredoc is quoted ('USAGE'), so its text is emitted verbatim.
usage() {
  cat <<'USAGE'
Usage:
bash scripts/quickstart/start_local_f5_tts.sh [--host HOST] [--port PORT] [--env FILE]
Options:
--host HOST Bind host for the local F5-TTS sidecar. Defaults to 127.0.0.1.
--port PORT Bind port. Defaults to OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL or 19095.
--env FILE Source a quickstart env file before starting the sidecar.
--help Show this help.
USAGE
}
# Defaults, overridable through the environment and then the command line.
env_file="${OPENTALKING_QUICKSTART_ENV:-$script_dir/env}"
host="${OPENTALKING_TTS_LOCAL_F5_TTS_HOST:-127.0.0.1}"
port=""

# Parse command-line flags; every value-taking flag requires an argument.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --host)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --host" >&2
        exit 2
      fi
      host="$2"
      shift 2
      ;;
    --port)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --port" >&2
        exit 2
      fi
      port="$2"
      shift 2
      ;;
    --env)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --env" >&2
        exit 2
      fi
      env_file="$2"
      # Exported so the chosen env file is visible to child processes.
      export OPENTALKING_QUICKSTART_ENV="$env_file"
      shift 2
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# Load the env file (helper comes from _helpers.sh) before reading settings from it.
quickstart_source_env "$env_file"
export DIGITAL_HUMAN_HOME="${DIGITAL_HUMAN_HOME:-$default_home}"
run_dir="$DIGITAL_HUMAN_HOME/run"
log_dir="$DIGITAL_HUMAN_HOME/logs"
mkdir -p "$run_dir" "$log_dir"

# Port precedence: --port, then OPENTALKING_TTS_LOCAL_F5_TTS_PORT, then the
# port embedded in OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL, then 19095.
if [[ -z "$port" ]]; then
  port="${OPENTALKING_TTS_LOCAL_F5_TTS_PORT:-}"
fi
if [[ -z "$port" && -n "${OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL:-}" ]]; then
  port="$(
    python3 - <<'PY'
import os
from urllib.parse import urlparse
url = os.environ.get("OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL", "")
parsed = urlparse(url)
print(parsed.port or "")
PY
  )"
fi
port="${port:-19095}"
# Print the Python interpreter to use for the F5-TTS sidecar.
#
# Resolution order:
#   1. OPENTALKING_F5_TTS_PYTHON (rejected when it lives in the main repo .venv).
#   2. bin/python inside the first existing venv among:
#      OPENTALKING_F5_TTS_VENV_DIR, <repo>/.venv-f5-tts,
#      $DIGITAL_HUMAN_HOME/.venv-f5-tts, <audio model root>/runtime/.venv-f5-tts.
#
# The audio model root honours OPENTALKING_LOCAL_AUDIO_MODEL_ROOT instead of a
# machine-specific absolute path, so the lookup works on any host.
# Returns 1 (with guidance on stderr) when no interpreter is found.
resolve_f5_python() {
  if [[ -n "${OPENTALKING_F5_TTS_PYTHON:-}" ]]; then
    case "$OPENTALKING_F5_TTS_PYTHON" in
      "$repo_root/.venv/"*)
        echo "Refusing to start local F5-TTS from the OpenTalking main venv: $OPENTALKING_F5_TTS_PYTHON" >&2
        echo "Use OPENTALKING_F5_TTS_VENV_DIR or OPENTALKING_F5_TTS_PYTHON for the sidecar venv." >&2
        return 1
        ;;
    esac
    if [[ -x "$OPENTALKING_F5_TTS_PYTHON" ]]; then
      printf '%s\n' "$OPENTALKING_F5_TTS_PYTHON"
      return 0
    fi
    echo "OPENTALKING_F5_TTS_PYTHON is not executable: $OPENTALKING_F5_TTS_PYTHON" >&2
    return 1
  fi

  local audio_root="${OPENTALKING_LOCAL_AUDIO_MODEL_ROOT:-$repo_root/models/local-audio}"
  local runtime_dir="${OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR:-$audio_root/runtime/F5-TTS}"
  local candidate_dir=""
  for candidate_dir in \
    "${OPENTALKING_F5_TTS_VENV_DIR:-}" \
    "$repo_root/.venv-f5-tts" \
    "$DIGITAL_HUMAN_HOME/.venv-f5-tts" \
    "$audio_root/runtime/.venv-f5-tts"
  do
    # Skip unset candidates (e.g. OPENTALKING_F5_TTS_VENV_DIR not provided).
    [[ -n "$candidate_dir" ]] || continue
    if [[ -x "$candidate_dir/bin/python" ]]; then
      printf '%s\n' "$candidate_dir/bin/python"
      return 0
    fi
  done

  echo "Missing F5-TTS sidecar venv." >&2
  echo "Create it first: python3 -m venv $repo_root/.venv-f5-tts && $repo_root/.venv-f5-tts/bin/pip install -e $runtime_dir fastapi 'uvicorn[standard]' soundfile" >&2
  return 1
}
# Resolve the interpreter; with `set -e` a failure here aborts the script.
f5_python="$(resolve_f5_python)"
pid_file="$run_dir/local-f5-tts-$port.pid"
log_file="$log_dir/local-f5-tts-$port.log"

# Health probes must target an address the service actually listens on.
# Wildcard binds are reachable through loopback; a specific bind address has
# to be probed directly, otherwise a healthy service would be reported as
# failed after the full startup timeout.
case "$host" in
  ""|0.0.0.0|"::"|"[::]") health_host="127.0.0.1" ;;
  "["*"]") health_host="$host" ;;
  *:*) health_host="[$host]" ;;  # bare IPv6 literal needs brackets in a URL
  *) health_host="$host" ;;
esac
health_url="http://$health_host:$port/health"

# Reuse an already running, healthy instance; otherwise discard the pid file.
if [[ -f "$pid_file" ]]; then
  old_pid="$(cat "$pid_file" 2>/dev/null || true)"
  if [[ -n "$old_pid" ]] && kill -0 "$old_pid" >/dev/null 2>&1; then
    if curl --max-time 2 -fsS "$health_url" >/dev/null 2>&1; then
      echo "Local F5-TTS is already running: pid=$old_pid port=$port"
      echo "Log: $log_file"
      exit 0
    fi
    echo "Stale Local F5-TTS pid file: pid=$old_pid port=$port" >&2
  fi
  rm -f "$pid_file"
fi

if quickstart_port_in_use "$port"; then
  echo "Local F5-TTS port $port is already in use." >&2
  quickstart_describe_port "$port" >&2 || true
  exit 1
fi

echo "Starting Local F5-TTS"
echo " repo: $repo_root"
echo " python: $f5_python"
echo " host: $host"
echo " port: $port"
echo " log: $log_file"

# Launch in a subshell so the cd and exports do not leak into this script.
(
  cd "$repo_root"
  export PYTHONPATH="$repo_root${PYTHONPATH:+:$PYTHONPATH}"
  export OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD="${OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD:-1}"
  if declare -F quickstart_detach >/dev/null 2>&1; then
    quickstart_detach "$log_file" "$f5_python" scripts/local_f5_tts_service.py --host "$host" --port "$port" >"$pid_file"
  else
    # Fallback when the helper is unavailable: detach with setsid directly.
    setsid "$f5_python" scripts/local_f5_tts_service.py --host "$host" --port "$port" >"$log_file" 2>&1 < /dev/null &
    echo "$!" >"$pid_file"
  fi
)

pid="$(cat "$pid_file" 2>/dev/null || true)"
if [[ -z "$pid" ]]; then
  echo "Failed to capture Local F5-TTS pid." >&2
  exit 1
fi

# Poll once per second for up to 180 iterations until the health check passes.
for _ in {1..180}; do
  if ! kill -0 "$pid" >/dev/null 2>&1; then
    echo "Local F5-TTS exited during startup. Last log lines:" >&2
    tail -80 "$log_file" >&2 || true
    rm -f "$pid_file"
    exit 1
  fi
  if curl --max-time 2 -fsS "$health_url" >/dev/null 2>&1; then
    echo "Local F5-TTS is up: http://$health_host:$port"
    exit 0
  fi
  sleep 1
done

echo "Local F5-TTS did not become ready in 180s. Last log lines:" >&2
tail -80 "$log_file" >&2 || true
exit 1

View File

@@ -15,19 +15,28 @@ def test_frontend_lists_local_tts_models_and_labels():
assert "local_cosyvoice" in constants
assert "indextts" in constants
assert "local_f5_tts" in constants
assert "Local CosyVoice" in settings
assert "IndexTTS" in settings
assert "Local IndexTTS" in settings
assert "Local F5-TTS" in settings
assert "OmniRT IndexTTS" not in settings
assert "Local IndexTTS" in workspace
assert "Local F5-TTS" in workspace
assert "OmniRT IndexTTS" not in workspace
assert "Local IndexTTS" in clone
assert "Local F5-TTS" in clone
assert "OmniRT IndexTTS" not in clone
assert "本地模型" in constants
assert "local_cosyvoice" in app
assert "indextts" in app
assert "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" in constants
assert "IndexTeam/IndexTTS-2" in constants
assert "SWivid/F5-TTS/F5TTS_v1_Base" in constants
assert "LOCAL_F5_TTS_MODEL_OPTIONS" in app
assert "local_f5_tts" in app[app.index("function normalizeTtsProvider"):app.index("if (normalized === \"local_indextts\"")]
assert "if (p === \"local_f5_tts\") return \"local_f5_tts\"" in app
assert 'ttsProvider === "local_f5_tts"' in app[app.index("const sharedSystemPrompt"):app.index("targetModel: sharedSystemPrompt")]
assert "iic/CosyVoice-300M" not in constants
assert "local_qwen3_tts" not in settings
@@ -45,6 +54,7 @@ def test_single_model_tts_provider_opens_voice_picker_first():
assert "const providerOptions" in settings
assert "hasChildren: true," in settings[settings.index("const providerOptions"):settings.index("const selectedProvider")]
assert "hasChildren: p !== ttsProvider" not in settings
assert "provider === \"local_f5_tts\"" in settings[settings.index("providerHasSingleModel"):settings.index("handleProviderSelect")]
assert settings.index("const qwenModelColumnOptions") < settings.index("const providerOptions")

View File

@@ -2,10 +2,12 @@ from __future__ import annotations
import asyncio
import io
import os
import queue
import wave
import importlib
import sys
import subprocess
from pathlib import Path
from types import SimpleNamespace
@@ -33,6 +35,7 @@ def _settings(**overrides):
("local_cosyvoice", "LocalCosyVoiceTTSAdapter"),
("local_qwen3_tts", "LocalQwen3TTSAdapter"),
("local_indextts", "LocalIndexTTSAdapter"),
("local_f5_tts", "LocalF5TTSAdapter"),
],
)
def test_local_tts_providers_are_supported(provider: str, expected_cls: str, monkeypatch):
@@ -53,6 +56,44 @@ def test_local_tts_providers_are_supported(provider: str, expected_cls: str, mon
assert adapter.model == "test-model"
def test_local_f5_tts_status_uses_local_model_root(monkeypatch):
    """Provider status derives F5-TTS paths from the local audio model root.

    Settings lookups are stubbed to return their defaults and all
    F5-specific env overrides are cleared, so only the model root and the
    service URL influence the reported configuration.
    """
    from opentalking.providers.tts.factory import tts_provider_config

    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "/tmp/opentalking-local-audio")
    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_F5_TTS_SERVICE_URL", "http://127.0.0.1:19095/synthesize")
    monkeypatch.setattr(
        "opentalking.providers.tts.factory._settings_value",
        lambda _name, default="": default,
    )
    cleared_env_names = (
        "OPENTALKING_TTS_LOCAL_F5_TTS_MODEL",
        "OPENTALKING_TTS_LOCAL_F5_TTS_MODEL_DIR",
        "OPENTALKING_TTS_LOCAL_F5_TTS_RUNTIME_DIR",
        "OPENTALKING_TTS_LOCAL_F5_TTS_CKPT_FILE",
        "OPENTALKING_TTS_LOCAL_F5_TTS_VOCODER_LOCAL_PATH",
        "OPENTALKING_TTS_LOCAL_F5_TTS_PROMPT_AUDIO",
        "OPENTALKING_TTS_LOCAL_F5_TTS_DEVICE",
    )
    for env_name in cleared_env_names:
        monkeypatch.delenv(env_name, raising=False)

    expected = {
        "provider": "local_f5_tts",
        "model": "SWivid/F5-TTS/F5TTS_v1_Base",
        "model_dir": "/tmp/opentalking-local-audio/SWivid__F5-TTS__F5TTS_v1_Base",
        "voice": "local-default",
        "device": "auto",
        "key_set": False,
        "service_url": "http://127.0.0.1:19095/synthesize",
        "service_url_set": True,
        "runtime_dir": "/tmp/opentalking-local-audio/runtime/F5-TTS",
        "ckpt_file": "/tmp/opentalking-local-audio/SWivid__F5-TTS__F5TTS_v1_Base/model_1250000.safetensors",
        "vocoder_local_path": "",
        "prompt_audio_set": False,
    }
    actual = tts_provider_config("local_f5_tts")
    assert actual == expected
def test_local_indextts_status_uses_local_model_root(monkeypatch):
from opentalking.providers.tts.factory import tts_provider_config
@@ -1974,3 +2015,143 @@ def test_local_cosyvoice_service_prewarm_loads_model_and_runs_short_synthesis(mo
service.prewarm(text="你好")
assert calls == ["model", "synth:你好:True"]
def test_local_f5_tts_streams_pcm_from_service(monkeypatch):
    """Stream PCM from a faked F5-TTS HTTP service through ``LocalF5TTSAdapter``.

    ``httpx.AsyncClient`` inside the adapter module is replaced with a stub
    that checks the outgoing request and replies with four little-endian
    int16 samples. The test then asserts that at least one chunk is
    produced, every chunk reports a 16000 Hz sample rate, and the chunks
    carry a non-zero number of samples.
    """
    from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter

    class _LowercaseHeaders(dict):
        """Dict whose ``get`` lowercases the requested key before lookup."""

        def get(self, key, default=None):
            return super().get(key.lower(), default)

    class _Response:
        """Minimal stand-in for the streamed HTTP response."""

        # The fake service advertises 24000 Hz audio while the adapter is
        # asked for 16000 Hz below.
        # NOTE(review): presumably this exercises the adapter's resampling
        # path — confirm against the adapter implementation.
        headers = _LowercaseHeaders(
            {"content-type": "audio/L16; rate=24000; channels=1", "x-audio-sample-rate": "24000"}
        )

        def raise_for_status(self):
            return None

        async def aiter_bytes(self):
            pcm_bytes = np.array([0, 1200, -1200, 0], dtype="<i2").tobytes()
            yield pcm_bytes

    class _StreamContext:
        """Async context manager that yields the fake response."""

        async def __aenter__(self):
            return _Response()

        async def __aexit__(self, exc_type, exc, tb):
            return False

    class _Client:
        """Replacement for ``httpx.AsyncClient`` that validates the request."""

        def __init__(self, *args, **kwargs):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            return False

        def stream(self, method, url, json):
            # The parameter is named ``json`` because the caller supplies the
            # request payload under that name.
            assert method == "POST"
            assert url == "http://127.0.0.1:19095/synthesize"
            assert json["text"] == "你好"
            assert json["sample_rate"] == 16000
            return _StreamContext()

    monkeypatch.setattr("opentalking.providers.tts.local_f5_tts.adapter.httpx.AsyncClient", _Client)

    adapter = LocalF5TTSAdapter(
        service_url="http://127.0.0.1:19095/synthesize",
        sample_rate=16000,
        chunk_ms=20.0,
    )

    async def collect():
        gathered = []
        async for piece in adapter.synthesize_stream("你好", voice="local-default"):
            gathered.append(piece)
        return gathered

    chunks = asyncio.run(collect())

    assert chunks
    for piece in chunks:
        assert piece.sample_rate == 16000
    total_samples = sum(int(piece.data.size) for piece in chunks)
    assert total_samples > 0
def test_local_f5_tts_resolves_prompt_text_and_audio(tmp_path, monkeypatch):
    """Resolve a cloned voice's prompt audio and text from the local model root.

    A clone directory containing ``prompt.wav``, ``prompt.txt`` and
    ``meta.json`` is created under ``<root>/voices/clones/local-f5-voice``;
    the adapter must return that wav path and the text file's contents.
    """
    from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter

    model_root = tmp_path / "models"
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(model_root))

    voice_dir = model_root / "voices" / "clones" / "local-f5-voice"
    voice_dir.mkdir(parents=True)

    prompt_wav = voice_dir / "prompt.wav"
    # Only a RIFF-looking stub is written; the test checks path resolution,
    # not audio decoding.
    prompt_wav.write_bytes(b"RIFFtest")
    (voice_dir / "prompt.txt").write_text("这是一段参考文本。", encoding="utf-8")
    (voice_dir / "meta.json").write_text('{"provider":"local_f5_tts"}', encoding="utf-8")

    adapter = LocalF5TTSAdapter(service_url="http://127.0.0.1:19095/synthesize")
    resolved = adapter._resolve_voice_prompt("local-f5-voice")

    assert resolved is not None
    assert resolved.prompt_audio == prompt_wav
    assert resolved.prompt_text == "这是一段参考文本。"
def test_local_f5_tts_resolves_bundled_zero_shot_system_voice(monkeypatch):
    """Resolve the ``local-office-serena`` voice to the bundled system voice assets.

    The expected prompt audio is ``prompt.wav`` inside the voice's folder
    under ``bundled_system_voice_root()``, and the prompt text must equal
    the literal reference sentence asserted below.
    """
    from opentalking.providers.tts.local_f5_tts.adapter import LocalF5TTSAdapter
    from opentalking.providers.tts.voice_assets import bundled_system_voice_root

    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", "/tmp/opentalking-local-audio")

    voice_id = "local-office-serena"
    adapter = LocalF5TTSAdapter(service_url="http://127.0.0.1:19095/synthesize")
    resolved = adapter._resolve_voice_prompt(voice_id)

    assert resolved is not None
    expected_audio = bundled_system_voice_root() / voice_id / "prompt.wav"
    assert resolved.prompt_audio == expected_audio
    assert resolved.prompt_text == "你好欢迎来到OpenTalking。我会用自然清晰的声音为你介绍今天的内容。"
def test_download_script_knows_f5_tts_base_model():
    """Check the downloader's registry entries for the ``f5-tts-v1-base`` model key.

    Covers the source/repo pair in ``MODELS``, the Hugging Face allow
    patterns, the required-file tuple, and the target directory computed by
    ``_target`` for a ``/models`` root.
    """
    from scripts import download_local_audio_models as downloader

    model_key = "f5-tts-v1-base"
    repo_id = "SWivid/F5-TTS"

    assert downloader.MODELS[model_key] == ("hf", repo_id)

    expected_patterns = [
        "README.md",
        "F5TTS_v1_Base/model_1250000.safetensors",
        "F5TTS_v1_Base/vocab.txt",
    ]
    assert downloader.HF_ALLOW_PATTERNS[model_key] == expected_patterns

    expected_required = ("model_1250000.safetensors",)
    assert downloader.MODEL_REQUIRED_FILES[model_key] == expected_required

    target_dir = downloader._target(Path("/models"), repo_id, model_key=model_key)
    assert target_dir == Path("/models/SWivid__F5-TTS__F5TTS_v1_Base")
def test_local_f5_tts_service_module_exposes_routes(monkeypatch, tmp_path):
    """Import the F5-TTS service script and check its routes and default paths.

    With preloading disabled, the app built from ``F5TTSService`` must
    register ``/health`` and ``/synthesize``, and the service's model
    directory and checkpoint file must sit under the configured model root.
    """
    model_root = tmp_path / "models"
    monkeypatch.setenv("OPENTALKING_LOCAL_AUDIO_MODEL_ROOT", str(model_root))
    monkeypatch.setenv("OPENTALKING_TTS_LOCAL_F5_TTS_PRELOAD", "0")

    module = importlib.import_module("scripts.local_f5_tts_service")
    service = module.F5TTSService(preload=False)
    app = module.create_app(service)

    # Routes without a ``path`` attribute contribute an empty string.
    routes = {getattr(route, "path", "") for route in app.routes}
    for expected_path in ("/health", "/synthesize"):
        assert expected_path in routes

    expected_model_dir = model_root / "SWivid__F5-TTS__F5TTS_v1_Base"
    assert service.model_dir == expected_model_dir
    assert service.ckpt_file == service.model_dir / "model_1250000.safetensors"
def test_start_local_f5_tts_script_refuses_main_venv(tmp_path):
    """Run the F5-TTS quickstart script pointed at the main venv and expect a refusal.

    The script is launched with ``OPENTALKING_F5_TTS_PYTHON`` set to the
    repository's ``.venv`` interpreter path; it must exit with status 1 and
    print the refusal message on stderr.
    """
    repo = Path(__file__).resolve().parents[2]
    launcher = repo / "scripts" / "quickstart" / "start_local_f5_tts.sh"

    # Inherit the caller's environment, then override the two variables
    # the test controls.
    child_env = dict(os.environ)
    child_env["OPENTALKING_F5_TTS_PYTHON"] = str(repo / ".venv" / "bin" / "python")
    child_env["OPENTALKING_QUICKSTART_ENV"] = str(tmp_path / "missing-env")

    command = ["bash", str(launcher), "--port", "19995"]
    result = subprocess.run(
        command,
        cwd=repo,
        env=child_env,
        text=True,
        capture_output=True,
        timeout=10,
        check=False,  # a non-zero exit is the expected outcome, asserted below
    )

    assert result.returncode == 1
    assert "Refusing to start local F5-TTS from the OpenTalking main venv" in result.stderr