fix quicktalk local assets and support QuickTalk on Apple Silicon (#98)

This commit is contained in:
zyairehhh
2026-06-12 16:41:50 +08:00
committed by GitHub
parent b6ffab2bb4
commit 5cdcd8dd3d
40 changed files with 1640 additions and 129 deletions

View File

@@ -20,7 +20,8 @@ OPENTALKING_EXPORTS_DIR=./data/exports
# WebUI 默认展示的数字人模型；CLI --model 会覆盖该值。
OPENTALKING_DEFAULT_MODEL=mock
OPENTALKING_TORCH_DEVICE=auto
OPENTALKING_FFMPEG_BIN=ffmpeg
# Leave empty to auto-detect system ffmpeg, then fall back to imageio-ffmpeg.
OPENTALKING_FFMPEG_BIN=
OPENTALKING_DEFAULT_FPS=25
OPENTALKING_TTS_SAMPLE_RATE=16000
OPENTALKING_TTS_STREAMING_DECODE=1
@@ -204,11 +205,15 @@ OPENTALKING_TTS_EDGE_VOICE=zh-CN-XiaoxiaoNeural
# OPENTALKING_DEFAULT_MODEL=quicktalk
# OPENTALKING_QUICKTALK_BACKEND=local
# OPENTALKING_QUICKTALK_ASSET_ROOT=./models/quicktalk
# OPENTALKING_QUICKTALK_MODEL_ROOT=./models/quicktalk
# OPENTALKING_QUICKTALK_MODEL_BACKEND=auto
# OPENTALKING_QUICKTALK_DEVICE=cuda:0
# OPENTALKING_QUICKTALK_HUBERT_DEVICE=
# OPENTALKING_QUICKTALK_WORKER_CACHE=1
# OPENTALKING_QUICKTALK_SLICE_LEN=28
# Apple Silicon/MPS defaults to 12 for lower per-chunk latency; Linux CUDA keeps 28.
# Set this on Apple Silicon when smooth long-text playback matters more than 25fps motion.
# Linux CUDA keeps the model-native 25fps when this is unset.
# OPENTALKING_QUICKTALK_FPS=14
# OPENTALKING_QUICKTALK_RENDER_CHUNK_MS=500
# OPENTALKING_QUICKTALK_PREFETCH=1
# OPENTALKING_PREWARM_AVATARS=

1
.gitignore vendored
View File

@@ -53,6 +53,7 @@ data/
# Runtime output & debug artifacts
output/
outputs/
run/
.run-video-clone/
# Frontend build artifacts (regenerate with npm run build)

View File

@@ -22,6 +22,7 @@ from PIL import Image
from opentalking.avatar import mouth_metadata
from opentalking.avatar.loader import load_avatar_bundle
from opentalking.avatar.validator import list_avatar_dirs
from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
from opentalking.models.registry import get_adapter
from opentalking.providers.synthesis.backends import resolve_model_backend
from opentalking.providers.synthesis.omnirt import auth_headers
@@ -333,17 +334,10 @@ async def _post_omnirt_json(settings: Any, path: str, payload: dict[str, Any]) -
def _settings_quicktalk_model_root(settings: Any) -> Path:
raw = (
getattr(settings, "quicktalk_model_root", "")
or os.environ.get("OPENTALKING_QUICKTALK_MODEL_ROOT", "")
or os.environ.get("OMNIRT_QUICKTALK_MODEL_ROOT", "")
)
if raw:
return Path(str(raw)).expanduser().resolve()
omnirt_model_root = os.environ.get("OMNIRT_MODEL_ROOT", "").strip()
if omnirt_model_root:
return (Path(omnirt_model_root).expanduser().resolve() / "quicktalk").resolve()
return (Path(getattr(settings, "models_dir", "./models")) / "quicktalk").expanduser().resolve()
resolved = resolve_quicktalk_asset_root(settings)
if resolved is not None:
return resolved
return (Path("./models") / "quicktalk").expanduser().resolve()
def _settings_int(settings: Any, name: str, env_name: str, default: int) -> int:
@@ -381,15 +375,16 @@ def _settings_optional_float(
def _quicktalk_rebuild(settings: Any):
from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
from opentalking.models.quicktalk.runtime_v2 import QuickTalkRebuild
return QuickTalkRebuild(
asset_root=_settings_quicktalk_model_root(settings),
device=str(
device=_configured_quicktalk_device(
getattr(settings, "quicktalk_device", None)
or os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
or os.environ.get("OMNIRT_QUICKTALK_DEVICE")
or "cuda:0"
or os.environ.get("OMNIRT_QUICKTALK_DEVICE"),
getattr(settings, "torch_device", ""),
getattr(settings, "device", ""),
),
hubert_device=(
getattr(settings, "quicktalk_hubert_device", None)
@@ -409,19 +404,20 @@ def _quicktalk_rebuild(settings: Any):
class _QuickTalkCacheBuilder:
def __init__(self, settings: Any) -> None:
import torch
from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
from opentalking.models.quicktalk.runtime_v2 import ImageProcessor
asset_root = _settings_quicktalk_model_root(settings)
checkpoints = asset_root / "checkpoints"
aux_min = checkpoints / "auxiliary_min"
aux_root = aux_min if aux_min.exists() else (checkpoints / "auxiliary")
device = str(
device = _configured_quicktalk_device(
getattr(settings, "quicktalk_face_cache_device", None)
or getattr(settings, "quicktalk_device", None)
or os.environ.get("OPENTALKING_QUICKTALK_FACE_CACHE_DEVICE")
or os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
or os.environ.get("OMNIRT_QUICKTALK_DEVICE")
or "cuda:0"
or os.environ.get("OMNIRT_QUICKTALK_DEVICE"),
getattr(settings, "torch_device", ""),
getattr(settings, "device", ""),
)
torch_device = torch.device(device)
dtype = torch.float32
@@ -568,15 +564,14 @@ def _local_adapter_device(model: str, settings: Any) -> str:
or "cuda"
)
if model == "quicktalk":
return str(
getattr(settings, "quicktalk_device", "")
or os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
or os.environ.get("OPENTALKING_TORCH_DEVICE")
or getattr(settings, "torch_device", "")
or getattr(settings, "device", "")
or os.environ.get("OPENTALKING_DEVICE")
or os.environ.get("DEVICE")
or "cuda:0"
from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
return _configured_quicktalk_device(
getattr(settings, "quicktalk_device", ""),
os.environ.get("OPENTALKING_DEVICE"),
os.environ.get("DEVICE"),
getattr(settings, "torch_device", ""),
getattr(settings, "device", ""),
)
return str(
getattr(settings, "device", "")

View File

@@ -6,6 +6,7 @@ from typing import Any
from fastapi import APIRouter, Request
from opentalking.core.queue_status import get_flashtalk_queue_status
from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
from opentalking.providers.stt.factory import stt_enabled_providers, stt_provider_config, stt_status
from opentalking.providers.tts.factory import tts_enabled_providers, tts_provider_config, tts_status
@@ -66,8 +67,14 @@ def _runtime_status_payload(request: Request) -> dict[str, Any]:
getattr(settings, "llm_api_key", "") or ""
).strip()
ignored_legacy_env = [name for name in _IGNORED_LEGACY_ENV if os.environ.get(name)]
quicktalk_backend = os.environ.get("OPENTALKING_QUICKTALK_BACKEND", "").strip()
quicktalk_device = os.environ.get("OPENTALKING_TORCH_DEVICE", "").strip()
quicktalk_backend = os.environ.get("OPENTALKING_QUICKTALK_BACKEND", "").strip() or str(
getattr(settings, "quicktalk_backend", "") or ""
).strip()
quicktalk_device = os.environ.get("OPENTALKING_QUICKTALK_DEVICE", "").strip() or str(
getattr(settings, "quicktalk_device", "") or ""
).strip()
quicktalk_asset_root_path = resolve_quicktalk_asset_root(settings)
quicktalk_asset_root = str(quicktalk_asset_root_path) if quicktalk_asset_root_path else ""
return {
"status": "ok",
"llm_provider": os.environ.get("OPENTALKING_LLM_PROVIDER", "").strip()
@@ -97,7 +104,7 @@ def _runtime_status_payload(request: Request) -> dict[str, Any]:
"default_model": str(getattr(settings, "default_model", "") or ""),
"quicktalk_backend": quicktalk_backend,
"quicktalk_device": quicktalk_device,
"quicktalk_asset_root": os.environ.get("OPENTALKING_QUICKTALK_ASSET_ROOT", "").strip(),
"quicktalk_asset_root": quicktalk_asset_root,
"ignored_legacy_env": ignored_legacy_env,
}

View File

@@ -75,6 +75,7 @@ def test_create_custom_avatar_adds_listed_asset_with_preview(tmp_path):
def test_quicktalk_model_root_falls_back_to_omnirt_model_root(tmp_path, monkeypatch):
monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT", raising=False)
monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT", raising=False)
monkeypatch.delenv("OMNIRT_QUICKTALK_MODEL_ROOT", raising=False)
monkeypatch.setenv("OMNIRT_MODEL_ROOT", str(tmp_path / "shared-models"))
@@ -86,6 +87,35 @@ def test_quicktalk_model_root_falls_back_to_omnirt_model_root(tmp_path, monkeypa
).resolve()
def test_quicktalk_model_root_prefers_asset_root_setting_and_env(tmp_path, monkeypatch):
    """Check the fallback order used when resolving the QuickTalk asset root.

    Expected precedence, highest first: ``settings.quicktalk_asset_root``,
    ``OPENTALKING_QUICKTALK_ASSET_ROOT``, ``settings.quicktalk_model_root``,
    then ``OPENTALKING_QUICKTALK_MODEL_ROOT``.
    """
    from_env = tmp_path / "env-quicktalk"
    from_settings = tmp_path / "settings-quicktalk"

    # Populate every competing source so each assertion proves a real preference.
    for env_name, env_value in (
        ("OPENTALKING_QUICKTALK_ASSET_ROOT", from_env),
        ("OPENTALKING_QUICKTALK_MODEL_ROOT", tmp_path / "legacy-env-root"),
        ("OMNIRT_QUICKTALK_MODEL_ROOT", tmp_path / "omnirt-env-root"),
        ("OMNIRT_MODEL_ROOT", tmp_path / "shared-models"),
    ):
        monkeypatch.setenv(env_name, str(env_value))

    settings = SimpleNamespace(
        models_dir=str(tmp_path / "repo-models"),
        quicktalk_asset_root=str(from_settings),
        quicktalk_model_root=str(tmp_path / "legacy-settings-root"),
    )

    # 1. The explicit asset-root setting wins over everything else.
    resolved = avatars._settings_quicktalk_model_root(settings)
    assert resolved == from_settings.resolve()

    # 2. With the setting blanked, the asset-root environment variable is used.
    settings.quicktalk_asset_root = ""
    resolved = avatars._settings_quicktalk_model_root(settings)
    assert resolved == from_env.resolve()

    # 3. Without any asset-root source, the legacy model-root setting applies.
    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT")
    resolved = avatars._settings_quicktalk_model_root(settings)
    assert resolved == (tmp_path / "legacy-settings-root").resolve()

    # 4. Finally the legacy model-root environment variable is consulted.
    settings.quicktalk_model_root = ""
    resolved = avatars._settings_quicktalk_model_root(settings)
    assert resolved == (tmp_path / "legacy-env-root").resolve()
def test_quicktalk_avatar_prewarm_uses_full_video_by_default(
tmp_path,
monkeypatch,
@@ -891,6 +921,91 @@ def test_quicktalk_avatar_prewarm_uses_local_adapter_when_backend_is_local(tmp_p
assert calls[1][1] == str(avatar)
def test_wav2lip_avatar_can_prewarm_quicktalk_with_asset_root_setting(tmp_path, monkeypatch):
    """Prewarm a wav2lip avatar for QuickTalk using ``quicktalk_asset_root``.

    The legacy model-root env var and ``models_dir`` point at deliberately
    wrong locations; the test passes only if the asset root handed to the
    prepare step is the one from ``settings.quicktalk_asset_root`` and the
    OmniRT HTTP path is never called.
    """
    expected_root = tmp_path / "quicktalk-assets"

    # Minimal on-disk wav2lip avatar: a reference image plus its manifest.
    avatar_dir = tmp_path / "wav-avatar"
    avatar_dir.mkdir()
    (avatar_dir / "reference.png").write_bytes(_png_bytes((16, 24)))
    manifest = {
        "id": "wav-avatar",
        "name": "Wav Avatar",
        "model_type": "wav2lip",
        "fps": 25,
        "sample_rate": 16000,
        "width": 16,
        "height": 24,
        "version": "1.0",
    }
    (avatar_dir / "manifest.json").write_text(json.dumps(manifest), encoding="utf-8")

    seen_asset_roots: list[Path] = []

    def fake_prepare_quicktalk_asset(**kwargs):
        # Record which asset root the route resolved instead of doing real work.
        seen_asset_roots.append(kwargs["rebuild"].asset_root)
        quicktalk_dir = avatar_dir / "quicktalk"
        return avatars.PreparedAssetResult(
            avatar_id="wav-avatar",
            status="generated",
            source_mode="image",
            template_path=quicktalk_dir / "template_16x24.mp4",
            cache_path=quicktalk_dir / "face_cache_v3_16x24.npz",
            frames=1,
        )

    class FakeAdapter:
        def load_model(self, device="cuda"):
            del device

        def load_avatar(self, avatar_path):
            return {"avatar_path": avatar_path}

        def warmup(self, avatar_state):
            del avatar_state

    async def fail_omnirt(settings, path, payload):
        del settings, path, payload
        raise AssertionError("local prewarm must not call OmniRT")

    def fake_cache_builder(settings):
        return SimpleNamespace(asset_root=avatars._settings_quicktalk_model_root(settings))

    for attr_name, replacement in (
        ("_prepare_quicktalk_asset", fake_prepare_quicktalk_asset),
        ("_quicktalk_cache_builder", fake_cache_builder),
        ("_quicktalk_cache_hit_result", lambda *args, **kwargs: None),
        ("_post_omnirt_json", fail_omnirt),
        ("resolve_model_backend", lambda model, settings: SimpleNamespace(backend="local")),
        ("get_adapter", lambda model: FakeAdapter()),
    ):
        monkeypatch.setattr(avatars, attr_name, replacement)
    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", str(tmp_path / "wrong-legacy-root"))

    app = FastAPI()
    app.state.settings = SimpleNamespace(
        avatars_dir=str(tmp_path),
        models_dir=str(tmp_path / "wrong-models-dir"),
        quicktalk_asset_root=str(expected_root),
        quicktalk_model_root="",
        quicktalk_device="cpu",
        device="cpu",
    )
    app.include_router(avatars.router)

    response = TestClient(app).post(
        "/avatars/wav-avatar/prewarm",
        json={"model": "quicktalk"},
    )

    assert response.status_code == 200
    payload = response.json()
    assert payload["status"] == "ready"
    assert payload["cache"]["model"] == "quicktalk"
    assert payload["runtime"]["type"] == "local_prewarm_result"
    assert seen_asset_roots == [expected_root.resolve()]
def test_video_avatar_exposes_preview_video(tmp_path):
base = tmp_path / "video-avatar"
base.mkdir()

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
from types import SimpleNamespace
from apps.api.routes.health import _runtime_status_payload
def test_health_reports_resolved_quicktalk_asset_root(tmp_path, monkeypatch) -> None:
    """The health payload reports the settings asset root, not the env values.

    Both the asset-root and the legacy model-root environment variables are
    set to other directories, so a payload built from the raw environment
    would not match the expected path.
    """
    expected_root = tmp_path / "settings-assets"
    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", str(tmp_path / "env-assets"))
    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", str(tmp_path / "legacy-root"))

    settings = SimpleNamespace(
        quicktalk_asset_root=str(expected_root),
        quicktalk_model_root="",
        models_dir=str(tmp_path / "models"),
        llm_api_key="",
        llm_provider="openai_compatible",
        llm_model="",
        default_model="quicktalk",
        quicktalk_backend="local",
        quicktalk_device="mps",
    )
    # Only ``request.app.state.settings`` is supplied by this stand-in request.
    app_state = SimpleNamespace(settings=settings)
    request = SimpleNamespace(app=SimpleNamespace(state=app_state))

    reported_root = _runtime_status_payload(request)["quicktalk_asset_root"]

    assert reported_root == str(expected_root.resolve())

View File

@@ -15,6 +15,16 @@ from opentalking.providers.synthesis.availability import (
)
def _write_quicktalk_local_assets(asset_root) -> None:
    """Create a placeholder QuickTalk checkpoint tree under ``asset_root``.

    Writes four small stub files beneath ``asset_root / "checkpoints"``:
    ``quicktalk.pth``, ``repair.npy``,
    ``chinese-hubert-large/pytorch_model.bin`` and
    ``auxiliary/models/buffalo_l/det_10g.onnx``.

    The helper is idempotent: parent directories are created with
    ``exist_ok=True`` and files are overwritten, so calling it again for the
    same root (or for a partially populated root) does not raise.

    Args:
        asset_root: Path-like object supporting ``/`` (e.g. ``pathlib.Path``)
            that will become the QuickTalk asset root.
    """
    checkpoints = asset_root / "checkpoints"
    placeholder_files = {
        checkpoints / "quicktalk.pth": b"pth",
        checkpoints / "repair.npy": b"repair",
        checkpoints / "chinese-hubert-large" / "pytorch_model.bin": b"hubert",
        checkpoints / "auxiliary" / "models" / "buffalo_l" / "det_10g.onnx": b"onnx",
    }
    for file_path, content in placeholder_files.items():
        file_path.parent.mkdir(parents=True, exist_ok=True)
        file_path.write_bytes(content)
def test_models_route_lists_all_models_with_connection_status_without_omnirt(monkeypatch) -> None:
monkeypatch.delenv("OPENTALKING_QUICKTALK_BACKEND", raising=False)
monkeypatch.delenv("OPENTALKING_CONFIG_FILE", raising=False)
@@ -98,6 +108,25 @@ def test_settings_loads_default_model_from_environment(monkeypatch, tmp_path) ->
assert settings.default_model == "quicktalk"
def test_settings_loads_quicktalk_local_fields_from_environment(monkeypatch, tmp_path) -> None:
    """``Settings`` exposes the QuickTalk local fields set through the environment.

    Config-file variables are cleared and the working directory is moved to
    ``tmp_path`` before ``Settings`` is built with ``_env_file=None``.
    """
    asset_root = str(tmp_path / "models" / "quicktalk")
    quicktalk_env = {
        "OPENTALKING_QUICKTALK_ASSET_ROOT": asset_root,
        "OPENTALKING_QUICKTALK_BACKEND": "local",
        "OPENTALKING_QUICKTALK_MODEL_BACKEND": "auto",
        "OPENTALKING_QUICKTALK_DEVICE": "mps",
        "OPENTALKING_QUICKTALK_SLICE_LEN": "12",
    }
    for env_name, env_value in quicktalk_env.items():
        monkeypatch.setenv(env_name, env_value)
    for config_var in ("OPENTALKING_CONFIG_FILE", "CONFIG_FILE"):
        monkeypatch.delenv(config_var, raising=False)
    monkeypatch.chdir(tmp_path)

    loaded = Settings(_env_file=None)

    assert loaded.quicktalk_asset_root == asset_root
    assert loaded.quicktalk_backend == "local"
    assert loaded.quicktalk_model_backend == "auto"
    assert loaded.quicktalk_device == "mps"
    # The slice length is compared against an int, not the raw env string.
    assert loaded.quicktalk_slice_len == 12
def test_settings_loads_default_model_from_yaml_model_section(monkeypatch, tmp_path) -> None:
config_file = tmp_path / "opentalking.yaml"
config_file.write_text(
@@ -169,7 +198,12 @@ def test_omnirt_endpoint_defaults_to_audio2video_routes() -> None:
assert resolve_synthesis_ws_url("flashtalk", settings) == "ws://127.0.0.1:9000/v1/audio2video/flashtalk"
async def test_omnirt_status_keeps_local_backend_local(monkeypatch) -> None:
async def test_omnirt_status_keeps_local_backend_local(monkeypatch, tmp_path) -> None:
monkeypatch.chdir(tmp_path)
monkeypatch.delenv("OPENTALKING_CONFIG_FILE", raising=False)
monkeypatch.delenv("CONFIG_FILE", raising=False)
monkeypatch.setenv("OPENTALKING_QUICKTALK_BACKEND", "omnirt")
clear_model_config_cache()
monkeypatch.setattr(
"opentalking.models.wav2lip.adapter.Wav2LipAdapter.runtime_available",
staticmethod(lambda: True),
@@ -200,6 +234,36 @@ async def test_omnirt_status_keeps_local_backend_local(monkeypatch) -> None:
assert statuses["quicktalk"].backend == "omnirt"
assert statuses["quicktalk"].connected is True
assert statuses["quicktalk"].reason == "omnirt"
clear_model_config_cache()
async def test_models_status_uses_settings_for_local_quicktalk_assets(
    tmp_path,
    monkeypatch,
) -> None:
    """Local QuickTalk reports as connected when assets come from settings only.

    Both asset-root environment variables are removed, so the placeholder
    asset tree can only be found through ``settings.quicktalk_asset_root``.
    """
    for root_var in ("OPENTALKING_QUICKTALK_ASSET_ROOT", "OPENTALKING_QUICKTALK_MODEL_ROOT"):
        monkeypatch.delenv(root_var, raising=False)
    monkeypatch.setenv("OPENTALKING_QUICKTALK_BACKEND", "local")

    quicktalk_root = tmp_path / "models" / "quicktalk"
    _write_quicktalk_local_assets(quicktalk_root)

    settings = SimpleNamespace(
        omnirt_endpoint="",
        flashtalk_ws_url="",
        flashhead_ws_url="",
        quicktalk_asset_root=str(quicktalk_root),
        quicktalk_model_root="",
        quicktalk_device="mps",
        quicktalk_hubert_device="",
        torch_device="auto",
        device="",
    )

    # Index the returned statuses by model id for direct lookup.
    status_by_id = {}
    for entry in await resolve_model_statuses(settings):
        status_by_id[entry.id] = entry
    quicktalk_status = status_by_id["quicktalk"]

    assert quicktalk_status.backend == "local"
    assert quicktalk_status.connected is True
    assert quicktalk_status.reason == "local_runtime"
async def test_omnirt_endpoint_only_affects_omnirt_backend(tmp_path, monkeypatch) -> None:

View File

@@ -75,7 +75,11 @@ def test_tts_openai_compatible_posts_audio_speech(monkeypatch: pytest.MonkeyPatc
monkeypatch.setenv("OPENTALKING_TTS_OPENAI_VOICE", "neutral-test")
monkeypatch.setenv("OPENTALKING_TTS_OPENAI_PROTOCOL", "audio_speech")
tts = build_tts_adapter(sample_rate=16000, chunk_ms=20.0)
tts = build_tts_adapter(
sample_rate=16000,
chunk_ms=20.0,
tts_provider="openai_compatible",
)
chunks = asyncio.run(_collect_tts_chunks(tts, "你好,开始测试。"))
assert chunks

View File

@@ -515,7 +515,8 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
)
parser.add_argument("--avatars-root", type=Path, required=True)
parser.add_argument("--avatar", action="append", default=[], help="Avatar id to process.")
parser.add_argument("--quicktalk-model-root", type=Path)
parser.add_argument("--quicktalk-asset-root", type=Path)
parser.add_argument("--quicktalk-model-root", type=Path, help=argparse.SUPPRESS)
parser.add_argument("--wav2lip-model-root", type=Path)
parser.add_argument("--wav2lip-face-det-device")
parser.add_argument("--wav2lip-max-reference-frames", type=int, default=125)
@@ -537,13 +538,14 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
def main(argv: Sequence[str] | None = None) -> int:
args = parse_args(argv)
avatars_root = args.avatars_root.expanduser().resolve()
if "quicktalk" in args.model and args.quicktalk_model_root is None:
raise SystemExit("--quicktalk-model-root is required when --model quicktalk is selected")
quicktalk_asset_root = args.quicktalk_asset_root or args.quicktalk_model_root
if "quicktalk" in args.model and quicktalk_asset_root is None:
raise SystemExit("--quicktalk-asset-root is required when --model quicktalk is selected")
rebuild = None
if "quicktalk" in args.model:
from opentalking.models.quicktalk.runtime_v2 import QuickTalkRebuild
quicktalk_root = args.quicktalk_model_root.expanduser().resolve()
quicktalk_root = quicktalk_asset_root.expanduser().resolve()
rebuild = QuickTalkRebuild(
asset_root=quicktalk_root,
device=args.device,

View File

@@ -107,11 +107,9 @@ def _adapter_device(model_type: str, default_device: str) -> str:
if model_type == "wav2lip":
return os.environ.get("OPENTALKING_WAV2LIP_DEVICE") or default_device
if model_type == "quicktalk":
return (
os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
or os.environ.get("OPENTALKING_TORCH_DEVICE")
or default_device
)
from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
return _configured_quicktalk_device(default_device)
return default_device

View File

@@ -30,7 +30,7 @@ The local adapter reads an asset root that contains `checkpoints/`:
```text
$OPENTALKING_QUICKTALK_ASSET_ROOT/
checkpoints/
quicktalk.pth or 256.onnx
quicktalk.pth
repair.npy
chinese-hubert-large/
pytorch_model.bin

View File

@@ -0,0 +1,210 @@
# QuickTalk on Apple Silicon
This page is for running QuickTalk locally on Apple Silicon macOS. It is intended for development, demos, and integration checks. For stable realtime 25fps output, use the Linux CUDA path in [QuickTalk Local Deployment](local.md) or run QuickTalk behind OmniRT.
## 1. Install Dependencies
```bash title="Terminal"
brew install python@3.11 node uv
# Optional. OpenTalking can fall back to imageio-ffmpeg when this is absent.
brew install ffmpeg
```
Clone OpenTalking and create the environment with the CPU/macOS extra:
```bash title="Terminal"
git clone https://github.com/OpenTalker/opentalking.git
cd opentalking
export UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
export PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
export UV_HTTP_TIMEOUT=300
export UV_LINK_MODE=copy
uv sync --extra dev --extra models --extra quicktalk-cpu --python 3.11
source .venv/bin/activate
```
Do not install `quicktalk-cuda` on Apple Silicon. `onnxruntime-gpu` does not provide a macOS arm64 wheel.
## 2. Download QuickTalk Assets
Download the QuickTalk weights and HuBERT files:
```bash title="Terminal"
mkdir -p models/quicktalk/checkpoints
hf download datascale-ai/quicktalk \
quicktalk.pth \
repair.npy \
chinese-hubert-large/config.json \
chinese-hubert-large/preprocessor_config.json \
chinese-hubert-large/pytorch_model.bin \
--local-dir models/quicktalk/checkpoints
```
Download InsightFace `buffalo_l` into the QuickTalk auxiliary directory:
```bash title="Terminal"
mkdir -p /tmp/opentalking-insightface \
models/quicktalk/checkpoints/auxiliary/models/buffalo_l
curl -L \
-o /tmp/opentalking-insightface/buffalo_l.zip \
https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip
unzip -q -o /tmp/opentalking-insightface/buffalo_l.zip \
-d /tmp/opentalking-insightface
rsync -a /tmp/opentalking-insightface/buffalo_l/ \
models/quicktalk/checkpoints/auxiliary/models/buffalo_l/
```
The final layout should be:
```text
models/quicktalk/
checkpoints/
quicktalk.pth
repair.npy
chinese-hubert-large/
config.json
preprocessor_config.json
pytorch_model.bin
auxiliary/models/buffalo_l/
*.onnx
```
Check the required files:
```bash title="Terminal"
stat models/quicktalk/checkpoints/quicktalk.pth
stat models/quicktalk/checkpoints/repair.npy
stat models/quicktalk/checkpoints/chinese-hubert-large/pytorch_model.bin
stat models/quicktalk/checkpoints/auxiliary/models/buffalo_l/det_10g.onnx
```
## 3. Configure `.env`
Create `.env` if it does not exist:
```bash title="Terminal"
cp .env.example .env
```
Set these values:
```env title=".env"
OPENTALKING_DEFAULT_MODEL=quicktalk
OPENTALKING_FFMPEG_BIN=
OPENTALKING_QUICKTALK_BACKEND=local
OPENTALKING_QUICKTALK_ASSET_ROOT=./models/quicktalk
OPENTALKING_QUICKTALK_MODEL_BACKEND=auto
OPENTALKING_QUICKTALK_WORKER_CACHE=1
# Optional. If unset, OpenTalking selects mps when PyTorch MPS is available,
# then falls back to cpu.
OPENTALKING_QUICKTALK_DEVICE=mps
# Apple Silicon default. Keep 12 so each generated chunk has enough audio budget.
OPENTALKING_QUICKTALK_SLICE_LEN=12
# Optional for long text. This lowers output cadence from model-native 25fps
# to 14fps so MPS generation can stay closer to playback.
OPENTALKING_QUICKTALK_FPS=14
```
Leaving `OPENTALKING_FFMPEG_BIN=` empty lets OpenTalking find system `ffmpeg` first and fall back to `imageio-ffmpeg`.
## 4. Check the Environment
```bash title="Terminal"
python - <<'PY'
from pathlib import Path
import torch
import onnxruntime as ort
from opentalking.models.quicktalk.runtime_v2 import ensure_ffmpeg
root = Path("models/quicktalk/checkpoints")
for path in [
root / "quicktalk.pth",
root / "repair.npy",
root / "chinese-hubert-large/pytorch_model.bin",
root / "auxiliary/models/buffalo_l/det_10g.onnx",
]:
print(path, path.exists())
print("mps:", torch.backends.mps.is_available())
print("onnxruntime providers:", ort.get_available_providers())
print("ffmpeg:", ensure_ffmpeg())
PY
```
Each file path printed by the script should be followed by `True`. `mps` should also be `True` on a healthy Apple Silicon PyTorch install, though OpenTalking can fall back to CPU when it is not.
## 5. Start OpenTalking
```bash title="Terminal"
bash scripts/start_unified.sh \
--backend local \
--model quicktalk \
--api-port 8210 \
--web-port 5280
```
Open `http://127.0.0.1:5280`, choose a front-facing avatar such as the built-in `singer`, and select `quicktalk`. The first run builds the avatar cache; later runs reuse it.
## 6. Verify the Realtime Digital Human Path
```bash title="Terminal"
curl -s http://127.0.0.1:8210/health | python -m json.tool
curl -s http://127.0.0.1:8210/models | python -m json.tool
```
The QuickTalk model should report `connected: true` with reason `local_runtime`.
Create a session and send a short sentence:
```bash title="Terminal"
curl -s -X POST http://127.0.0.1:8210/sessions \
-H 'Content-Type: application/json' \
-d '{"avatar_id":"singer","model":"quicktalk","tts_provider":"edge"}' \
| tee /tmp/opentalking-session.json | python -m json.tool
sid=$(python - <<'PY'
import json
print(json.load(open("/tmp/opentalking-session.json"))["session_id"])
PY
)
curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/start" \
-H 'Content-Type: application/json' \
-d '{}' | python -m json.tool
curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/speak" \
-H 'Content-Type: application/json' \
-d '{"text":"Please confirm in one sentence that QuickTalk is running locally on this Mac.","tts_provider":"edge"}' \
| python -m json.tool
```
When the session state returns from `speaking` to `ready`, and the WebUI shows generated audio and video frames for the selected avatar, the local realtime digital human path is working.
## Performance Notes
Apple Silicon can run the local path, but it is not the recommended realtime production target. If long text stalls, try:
```env title=".env"
OPENTALKING_QUICKTALK_SLICE_LEN=12
OPENTALKING_QUICKTALK_FPS=14
OPENTALKING_QUICKTALK_MAX_LONG_EDGE=720
```
This trades motion FPS or image size for smoother playback. Use Linux CUDA or OmniRT when stable 25fps realtime output matters.
## Troubleshooting
| Symptom | Fix |
| --- | --- |
| `onnxruntime-gpu` fails to install | Use `quicktalk-cpu`; do not install `quicktalk-cuda` on Apple Silicon. |
| `ffmpeg` is missing | Keep `OPENTALKING_FFMPEG_BIN=` empty, or run `brew install ffmpeg`. |
| MPS shows an SVD CPU fallback warning | This is a PyTorch MPS operator coverage limitation. It can affect speed but usually does not block execution. |
| First startup is slow | The first run loads HuBERT, QuickTalk, and the avatar face cache. Reusing the same avatar is faster. |

View File

@@ -17,7 +17,7 @@ uv sync --extra dev --extra models --extra quicktalk-cuda --python 3.11
source .venv/bin/activate
```
Prepare a QuickTalk local asset root that contains `checkpoints/quicktalk.pth` or `checkpoints/256.onnx`, `checkpoints/repair.npy`, HuBERT files, and InsightFace assets.
Prepare a QuickTalk local asset root that contains `checkpoints/quicktalk.pth`, `checkpoints/repair.npy`, HuBERT files, and InsightFace assets.
The avatar does not need to start as `model_type=quicktalk`. OpenTalking decouples avatar selection from model selection: if an avatar has `metadata.source_video`, `metadata.source_image`, `reference.png`, or `preview.png`, QuickTalk prewarm can generate the template video and face cache it needs. Dedicated QuickTalk avatars can still declare `metadata.quicktalk.template_video` explicitly.

View File

@@ -6,7 +6,7 @@ This page explains the recommended ways to run OpenTalking on different system e
| Platform | Recommended Use | Available Paths | Notes |
| --- | --- | --- | --- |
| macOS | Docs, frontend, API, Mock validation | `mock` | Good for quick trials, not recommended as a real model inference environment. |
| macOS | Docs, frontend, API, Mock validation; experimental QuickTalk local on Apple Silicon | `mock`, experimental `quicktalk` local | Good for quick trials. See [QuickTalk on Apple Silicon](../model-deployment/quicktalk/apple-silicon.md). Stable realtime output is still recommended on Linux GPU. |
| Linux + CUDA | Real model validation and deployment | `mock`, `quicktalk`, `wav2lip`, `musetalk`, `omnirt` | Primary recommended environment. |
| Linux + Ascend NPU | Private deployment and NPU evaluation | `mock`, selected OmniRT / FlashTalk paths | Requires CANN, driver, and `torch_npu`. |
@@ -21,9 +21,15 @@ brew install python@3.11 node ffmpeg
uv sync --extra dev --python 3.11
```
### Not suitable for real digital-human models
### Experimental QuickTalk local on Apple Silicon
QuickTalk, MuseTalk, FlashTalk, and similar models mainly target CUDA GPUs or dedicated inference services. Even if some Python dependencies can be installed on macOS, it is not recommended as the real video-generation path. Deploy models on a Linux GPU machine and connect OpenTalking to the remote inference service instead.
Apple Silicon can run QuickTalk local with `quicktalk-cpu` for development, demos, and integration checks. The full path is documented in [QuickTalk on Apple Silicon](../model-deployment/quicktalk/apple-silicon.md).
This path uses PyTorch MPS when available and falls back to CPU. It is not the recommended production realtime path; use Linux CUDA or OmniRT when stable 25fps output matters.
### Other real digital-human models
MuseTalk, FlashTalk, and similar production paths mainly target CUDA GPUs, Ascend NPUs, or dedicated inference services. Deploy those models on a Linux GPU/NPU machine and connect OpenTalking to the remote inference service.
### ffmpeg Installation

View File

@@ -24,7 +24,7 @@ streamlined first-run procedure, see the [Quickstart](quickstart.md).
| Platform | Synthesis backends | Notes |
|----------|-------------------|-------|
| macOS (Apple Silicon and Intel) | `mock` | Suitable for orchestration and frontend development. Real talking-head models are not supported on macOS. |
| macOS (Apple Silicon and Intel) | `mock`, experimental `quicktalk` local on Apple Silicon | Suitable for orchestration and frontend development. QuickTalk local can be tested on Apple Silicon with `quicktalk-cpu`; see [QuickTalk on Apple Silicon](../model-deployment/quicktalk/apple-silicon.md) for the full path. Realtime production paths still target Linux GPU/NPU or OmniRT. |
| Linux x86_64 + CUDA 12 | `mock`, `wav2lip`, `musetalk`, `flashtalk`, `flashhead`, `quicktalk` | Primary deployment target. |
| Linux aarch64 + Ascend 910B (CANN 8.0+) | `mock`, `wav2lip`, `flashtalk` | NPU production deployment path. |
| Windows | `mock` (WSL2 recommended) | Not part of the continuous integration matrix. |

View File

@@ -35,7 +35,7 @@ $OMNIRT_MODEL_ROOT/quicktalk/ # OmniRT 默认读取
```text
$OPENTALKING_QUICKTALK_ASSET_ROOT/ # local adapter 默认读取
checkpoints/
quicktalk.pth 或 256.onnx
quicktalk.pth
repair.npy
chinese-hubert-large/
pytorch_model.bin

View File

@@ -0,0 +1,210 @@
# Apple Silicon 上运行 QuickTalk
本页用于在 Apple Silicon macOS 上本地运行 QuickTalk。它适合开发、演示和集成验证；如果需要稳定 25fps 实时输出，仍建议使用 [QuickTalk Local 单机部署](local.md) 中的 Linux CUDA 路径，或把 QuickTalk 放到 OmniRT 后面运行。
## 1. 安装依赖
```bash title="终端"
brew install python@3.11 node uv
# 可选。不安装时 OpenTalking 可以回退到 imageio-ffmpeg。
brew install ffmpeg
```
拉取 OpenTalking，并使用 CPU/macOS extra 创建环境：
```bash title="终端"
git clone https://github.com/OpenTalker/opentalking.git
cd opentalking
export UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
export PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
export UV_HTTP_TIMEOUT=300
export UV_LINK_MODE=copy
uv sync --extra dev --extra models --extra quicktalk-cpu --python 3.11
source .venv/bin/activate
```
不要在 Apple Silicon 上安装 `quicktalk-cuda`。`onnxruntime-gpu` 没有 macOS arm64 wheel。
## 2. 下载 QuickTalk 资产
下载 QuickTalk 权重和 HuBERT 文件:
```bash title="终端"
mkdir -p models/quicktalk/checkpoints
hf download datascale-ai/quicktalk \
quicktalk.pth \
repair.npy \
chinese-hubert-large/config.json \
chinese-hubert-large/preprocessor_config.json \
chinese-hubert-large/pytorch_model.bin \
--local-dir models/quicktalk/checkpoints
```
下载 InsightFace `buffalo_l` 到 QuickTalk auxiliary 目录:
```bash title="终端"
mkdir -p /tmp/opentalking-insightface \
models/quicktalk/checkpoints/auxiliary/models/buffalo_l
curl -L \
-o /tmp/opentalking-insightface/buffalo_l.zip \
https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip
unzip -q -o /tmp/opentalking-insightface/buffalo_l.zip \
-d /tmp/opentalking-insightface
rsync -a /tmp/opentalking-insightface/buffalo_l/ \
models/quicktalk/checkpoints/auxiliary/models/buffalo_l/
```
最终目录应为:
```text
models/quicktalk/
checkpoints/
quicktalk.pth
repair.npy
chinese-hubert-large/
config.json
preprocessor_config.json
pytorch_model.bin
auxiliary/models/buffalo_l/
*.onnx
```
检查必需文件:
```bash title="终端"
stat models/quicktalk/checkpoints/quicktalk.pth
stat models/quicktalk/checkpoints/repair.npy
stat models/quicktalk/checkpoints/chinese-hubert-large/pytorch_model.bin
stat models/quicktalk/checkpoints/auxiliary/models/buffalo_l/det_10g.onnx
```
## 3. 配置 `.env`
如果还没有 `.env`,先创建:
```bash title="终端"
cp .env.example .env
```
设置这些值:
```env title=".env"
OPENTALKING_DEFAULT_MODEL=quicktalk
OPENTALKING_FFMPEG_BIN=
OPENTALKING_QUICKTALK_BACKEND=local
OPENTALKING_QUICKTALK_ASSET_ROOT=./models/quicktalk
OPENTALKING_QUICKTALK_MODEL_BACKEND=auto
OPENTALKING_QUICKTALK_WORKER_CACHE=1
# 可选。不设置时 OpenTalking 会在 PyTorch MPS 可用时选择 mps
# 否则回退 cpu。
OPENTALKING_QUICKTALK_DEVICE=mps
# Apple Silicon 默认值。保持 12，让每个生成 chunk 有足够音频预算。
OPENTALKING_QUICKTALK_SLICE_LEN=12
# 长文本可选。把输出从模型原生 25fps 降到 14fps
# 让 MPS 生成速度更接近播放速度。
OPENTALKING_QUICKTALK_FPS=14
```
`OPENTALKING_FFMPEG_BIN=` 保持为空时，OpenTalking 会先找系统 `ffmpeg`，找不到再回退到 `imageio-ffmpeg`。
## 4. 检查本地环境
```bash title="终端"
python - <<'PY'
from pathlib import Path
import torch
import onnxruntime as ort
from opentalking.models.quicktalk.runtime_v2 import ensure_ffmpeg
root = Path("models/quicktalk/checkpoints")
for path in [
root / "quicktalk.pth",
root / "repair.npy",
root / "chinese-hubert-large/pytorch_model.bin",
root / "auxiliary/models/buffalo_l/det_10g.onnx",
]:
print(path, path.exists())
print("mps:", torch.backends.mps.is_available())
print("onnxruntime providers:", ort.get_available_providers())
print("ffmpeg:", ensure_ffmpeg())
PY
```
每个文件路径都应该输出 `True`。健康的 Apple Silicon PyTorch 环境里 `mps` 应该是 `True`；如果不可用，OpenTalking 可以回退到 CPU。
## 5. 启动 OpenTalking
```bash title="终端"
bash scripts/start_unified.sh \
--backend local \
--model quicktalk \
--api-port 8210 \
--web-port 5280
```
打开 `http://127.0.0.1:5280`，选择正脸清晰的 avatar，例如内置 `singer`，模型选择 `quicktalk`。首次运行会构建 avatar cache，后续可复用。
## 6. 验证实时数字人链路
```bash title="终端"
curl -s http://127.0.0.1:8210/health | python -m json.tool
curl -s http://127.0.0.1:8210/models | python -m json.tool
```
QuickTalk 模型应返回 `connected: true`,原因是 `local_runtime`。
创建会话并发送一句短文本:
```bash title="终端"
curl -s -X POST http://127.0.0.1:8210/sessions \
-H 'Content-Type: application/json' \
-d '{"avatar_id":"singer","model":"quicktalk","tts_provider":"edge"}' \
| tee /tmp/opentalking-session.json | python -m json.tool
sid=$(python - <<'PY'
import json
print(json.load(open("/tmp/opentalking-session.json"))["session_id"])
PY
)
curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/start" \
-H 'Content-Type: application/json' \
-d '{}' | python -m json.tool
curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/speak" \
-H 'Content-Type: application/json' \
-d '{"text":"请用一句话确认 QuickTalk 已在 Mac 本地运行。","tts_provider":"edge"}' \
| python -m json.tool
```
当 session 状态从 `speaking` 回到 `ready`,且 WebUI 中能看到所选 avatar 生成音频和视频帧,就表示本地实时数字人链路已经跑通。
## 性能说明
Apple Silicon 可以跑通本地链路,但不是推荐的实时生产目标。如果长文本卡顿,优先尝试:
```env title=".env"
OPENTALKING_QUICKTALK_SLICE_LEN=12
OPENTALKING_QUICKTALK_FPS=14
OPENTALKING_QUICKTALK_MAX_LONG_EDGE=720
```
这会用动作帧率或画面尺寸换取更顺滑的播放。需要稳定 25fps 实时输出时,请使用 Linux CUDA 或 OmniRT。
## 常见问题
| 现象 | 处理 |
| --- | --- |
| `onnxruntime-gpu` 安装失败 | Apple Silicon 使用 `quicktalk-cpu`,不要安装 `quicktalk-cuda`。 |
| `ffmpeg` 找不到 | `.env` 中保持 `OPENTALKING_FFMPEG_BIN=`,或运行 `brew install ffmpeg`。 |
| MPS 出现 SVD CPU fallback 警告 | 属于 PyTorch MPS 的算子覆盖限制,可能影响速度,但通常不阻塞运行。 |
| 首次启动很慢 | 首次会加载 HuBERT、QuickTalk 和 avatar face cache；同一 avatar 后续会更快。 |

View File

@@ -26,7 +26,7 @@ local adapter 的资产根必须包含 `checkpoints/` 目录。推荐放在仓
```text
models/quicktalk/
checkpoints/
quicktalk.pth 或 256.onnx
quicktalk.pth
repair.npy
chinese-hubert-large/
pytorch_model.bin
@@ -93,7 +93,7 @@ cd "$OPENTALKING_HOME"
opentalking-prepare-cache \
--model quicktalk \
--avatars-root examples/avatars \
--quicktalk-model-root models/quicktalk \
--quicktalk-asset-root models/quicktalk \
--device cuda:0 \
--model-backend pth \
--verify

View File

@@ -10,7 +10,7 @@ QuickTalk 适合快速实时口播、低延迟验证和从图片快速生成数
- 推荐 NVIDIA GPU。
- QuickTalk 本地资产目录需要包含 `checkpoints/`
- 至少需要 `quicktalk.pth``256.onnx``repair.npy``chinese-hubert-large/``auxiliary/``auxiliary_min/`
- 至少需要 `quicktalk.pth`、`repair.npy`、`chinese-hubert-large/`、`auxiliary/`、`auxiliary_min/`。
- Avatar 需要 `quicktalk.template_video` 或可由上传图片生成模板视频。
## 准备权重
@@ -130,7 +130,7 @@ uv run opentalking-quicktalk-bench \
### 提示资产不完整
检查 `checkpoints/quicktalk.pth``checkpoints/256.onnx``repair.npy``chinese-hubert-large/``auxiliary/` 是否存在。
检查 `checkpoints/quicktalk.pth`、`repair.npy`、`chinese-hubert-large/`、`auxiliary/` 是否存在。
### 首次创建会话很慢

View File

@@ -8,7 +8,7 @@
| 平台 | 推荐用途 | 可用路径 | 说明 |
| --- | --- | --- | --- |
| macOS | 文档、前端、API、Mock 验证 | `mock` | 适合快速体验,不建议作为真实模型推理环境。 |
| macOS | 文档、前端、API、Mock 验证；Apple Silicon 上实验性 QuickTalk local | `mock`、实验性 `quicktalk` local | 适合快速体验；见 [Apple Silicon 上运行 QuickTalk](../model-deployment/quicktalk/apple-silicon.md)。稳定实时输出仍推荐 Linux GPU。 |
| Linux + CUDA | 真实模型验证与部署 | `mock`、`quicktalk`、`wav2lip`、`musetalk`、`omnirt` | 主要推荐环境。 |
| Linux + Ascend NPU | 私有化和 NPU 评估 | `mock`、部分 OmniRT / FlashTalk 路线 | 依赖 CANN、驱动和 `torch_npu` |
@@ -24,11 +24,18 @@ brew install python@3.11 node ffmpeg
uv sync --extra dev --python 3.11
```
### 不适合真实数字人模型
### Apple Silicon 上的实验性 QuickTalk local
QuickTalk、MuseTalk、FlashTalk 等模型主要面向 CUDA GPU 或专用推理服务。macOS 上即使可以安装
部分 Python 依赖,也不建议作为真实视频生成路径;更推荐把模型部署到 Linux GPU 机器,
再通过 OpenTalking 连接远端推理服务。
Apple Silicon 可以用 `quicktalk-cpu` 跑 QuickTalk local，适合开发、演示和集成验证。完整步骤见
[Apple Silicon 上运行 QuickTalk](../model-deployment/quicktalk/apple-silicon.md)。
这条路径会在 PyTorch MPS 可用时使用 MPS，否则回退 CPU。它不是推荐的生产实时路径；需要稳定
25fps 输出时，请使用 Linux CUDA 或 OmniRT。
### 其他真实数字人模型
MuseTalk、FlashTalk 等生产路径主要面向 CUDA GPU、昇腾 NPU 或专用推理服务。更推荐把这些模型
部署到 Linux GPU/NPU 机器,再通过 OpenTalking 连接远端推理服务。
### ffmpeg 安装

View File

@@ -21,7 +21,7 @@ OpenTalking 提供两种安装方式。选择哪一种取决于两个问题:
| 平台 | 合成后端 | 说明 |
|------|---------|------|
| macOSApple Silicon 与 Intel | `mock` | 适用于编排与前端开发macOS 不支持真实 talking-head 模型。 |
| macOS（Apple Silicon 与 Intel） | `mock`、Apple Silicon 实验性 `quicktalk` local | 适用于编排与前端开发；QuickTalk local 可用 `quicktalk-cpu` 在 Apple Silicon 上验证，完整步骤见 [Apple Silicon 上运行 QuickTalk](../model-deployment/quicktalk/apple-silicon.md)，生产实时路径仍以 Linux GPU/NPU 或 OmniRT 为主。 |
| Linux x86_64 + CUDA 12 | `mock`、`wav2lip`、`musetalk`、`flashtalk`、`flashhead`、`quicktalk` | 主要部署目标。 |
| Linux aarch64 + 昇腾 910B（CANN 8.0+） | `mock`、`wav2lip`、`flashtalk` | NPU 生产部署路径。 |
| Windows | `mock`（建议 WSL2） | 不在持续集成矩阵中。 |

View File

@@ -252,6 +252,7 @@ nav:
- QuickTalk:
- Overview: model-deployment/quicktalk.md
- Local: model-deployment/quicktalk/local.md
- Apple Silicon: model-deployment/quicktalk/apple-silicon.md
- OmniRT: model-deployment/quicktalk/omnirt.md
- Wav2Lip:
- Overview: model-deployment/wav2lip.md
@@ -329,6 +330,7 @@ nav:
- QuickTalk:
- Overview: model-deployment/quicktalk.md
- Local: model-deployment/quicktalk/local.md
- Apple Silicon: model-deployment/quicktalk/apple-silicon.md
- OmniRT: model-deployment/quicktalk/omnirt.md
- Wav2Lip:
- Overview: model-deployment/wav2lip.md

View File

@@ -367,6 +367,15 @@ class Settings(BaseSettings):
flashhead_frame_num: int = 29
flashhead_chunk_samples: int = 17920
quicktalk_asset_root: str = ""
quicktalk_model_root: str = ""
quicktalk_backend: str = ""
quicktalk_model_backend: str = "auto"
quicktalk_device: str = ""
quicktalk_hubert_device: str = ""
quicktalk_worker_cache: bool = True
quicktalk_slice_len: int = 0
llm_provider: str = "openai_compatible"
llm_base_url: str = ""
llm_api_key: str = ""

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import logging
import os
import platform
import threading
from collections import OrderedDict
from dataclasses import dataclass
@@ -14,6 +15,7 @@ from opentalking.avatar.loader import load_avatar_bundle
from opentalking.core.interfaces.avatar_asset import AvatarManifest
from opentalking.core.types.frames import AudioChunk, VideoFrameData
from opentalking.media.frame_avatar import numpy_bgr_to_videoframe
from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
from opentalking.models.registry import register_model
if TYPE_CHECKING: # pragma: no cover — avoids importing torch/onnx at module load
@@ -26,6 +28,8 @@ log = logging.getLogger(__name__)
class QuickTalkFeatures:
reps: list[np.ndarray]
audio_feature_seconds: float
render_reps: list[np.ndarray] | None = None
output_fps: float | None = None
@dataclass
@@ -87,6 +91,39 @@ def _env_value(name: str, default: str = "") -> str:
return os.environ.get(name, "").strip() or default
def _default_quicktalk_device() -> str:
if platform.system() == "Darwin" and platform.machine().lower() in {"arm64", "aarch64"}:
try:
import torch
mps = getattr(getattr(torch, "backends", None), "mps", None)
if mps is not None and bool(mps.is_available()):
return "mps"
except Exception:
pass
return "cpu"
return "cuda:0"
def _first_configured_device(*values: str | None) -> str:
for value in values:
device = (value or "").strip()
if device and device.lower() != "auto":
return device
return ""
def _configured_quicktalk_device(*extra_values: str | None) -> str:
    """Resolve the QuickTalk device from the environment, then *extra_values*.

    ``OPENTALKING_QUICKTALK_DEVICE`` is consulted first, then
    ``OPENTALKING_TORCH_DEVICE``, then each extra value in order. When none
    of them yields a usable device the platform default is returned.
    """
    explicit = _first_configured_device(
        _env_value("OPENTALKING_QUICKTALK_DEVICE"),
        _env_value("OPENTALKING_TORCH_DEVICE"),
        *extra_values,
    )
    if explicit:
        return explicit
    return _default_quicktalk_device()
def _positive_int_env(name: str, default: int) -> int:
try:
value = int(_env_value(name, str(default)))
@@ -95,6 +132,39 @@ def _positive_int_env(name: str, default: int) -> int:
return max(1, value)
def _optional_positive_int_env(name: str) -> int | None:
    """Read env var *name* as a strictly positive integer.

    Returns ``None`` when the variable is unset or empty, is not an integer,
    or is zero or negative.
    """
    text = _env_value(name)
    if text:
        try:
            parsed = int(text)
        except ValueError:
            parsed = 0
        if parsed > 0:
            return parsed
    return None
def _downsample_sequence(items: list[np.ndarray], target_count: int) -> list[np.ndarray]:
if target_count >= len(items):
return items
if target_count <= 1:
return [items[0]]
last = len(items) - 1
return [items[int(round(i * last / float(target_count - 1)))] for i in range(target_count)]
def _quicktalk_render_plan(
    reps: list[np.ndarray],
    *,
    worker_fps: float,
) -> tuple[list[np.ndarray], float]:
    """Decide which feature frames to render and at what output frame rate.

    ``OPENTALKING_QUICKTALK_FPS`` may request a lower frame rate than the
    worker's. In that case *reps* is thinned proportionally and the
    requested rate is returned. Otherwise *reps* is returned unchanged
    together with *worker_fps*.
    """
    requested_fps = _optional_positive_int_env("OPENTALKING_QUICKTALK_FPS")
    keep_native = requested_fps is None or requested_fps >= worker_fps or not reps
    if keep_native:
        return reps, worker_fps

    scaled_count = float(len(reps)) * float(requested_fps) / worker_fps
    wanted = max(1, int(round(scaled_count)))
    return _downsample_sequence(list(reps), wanted), float(requested_fps)
def _close_worker(worker: Any) -> None:
close = getattr(worker, "close", None)
if callable(close):
@@ -171,18 +241,22 @@ def _quicktalk_template_from_bundle(bundle_path: Path) -> Path | None:
return None
def _optional_env_path(name: str) -> Path | None:
raw = _env_value(name)
if not raw:
def _quicktalk_settings() -> Any | None:
try:
from opentalking.core.config import get_settings
return get_settings()
except Exception:
return None
return Path(raw).expanduser().resolve()
def _quicktalk_asset_root_env() -> Path | None:
return (
_optional_env_path("OPENTALKING_QUICKTALK_ASSET_ROOT")
or _optional_env_path("OPENTALKING_QUICKTALK_MODEL_ROOT")
or _optional_env_path("OMNIRT_QUICKTALK_MODEL_ROOT")
return resolve_quicktalk_asset_root(None, include_default=False)
def _quicktalk_asset_root_config(settings: Any | None = None) -> Path | None:
return resolve_quicktalk_asset_root(
settings if settings is not None else _quicktalk_settings()
)
@@ -284,7 +358,7 @@ def _validate_asset_root(asset_root: Path) -> None:
formatted = "\n - ".join(str(path) for path in missing)
raise FileNotFoundError(
"QuickTalk local assets are incomplete. "
"OPENTALKING_QUICKTALK_ASSET_ROOT or OPENTALKING_QUICKTALK_MODEL_ROOT must point to a QuickTalk local "
"OPENTALKING_QUICKTALK_ASSET_ROOT must point to a QuickTalk local "
"asset directory containing checkpoints/quicktalk.pth or checkpoints/256.onnx, checkpoints/repair.npy, "
"checkpoints/chinese-hubert-large/ and checkpoints/auxiliary/.\n"
f"Current asset root: {asset_root}\n"
@@ -311,11 +385,21 @@ class QuickTalkAdapter:
model_type = "quicktalk"
def __init__(self) -> None:
self._device = os.environ.get("OPENTALKING_TORCH_DEVICE", "cuda:0")
settings = _quicktalk_settings()
self._device = _configured_quicktalk_device(
getattr(settings, "quicktalk_device", None) if settings is not None else None,
getattr(settings, "torch_device", None) if settings is not None else None,
getattr(settings, "device", None) if settings is not None else None,
)
# 多卡部署:让 HuBERT 跑在另一张卡,避免与 ONNX 在同一 GPU default
# stream 上排队。空字符串表示与主 device 同卡(默认行为)。
self._hubert_device = (
_env_value("OPENTALKING_QUICKTALK_HUBERT_DEVICE") or None
or (
str(getattr(settings, "quicktalk_hubert_device", "") or "").strip()
if settings is not None
else None
)
)
self._asset_root = _quicktalk_asset_root_env()
self._output_transform = _env_value(
@@ -328,7 +412,12 @@ class QuickTalkAdapter:
self._neck_fade_start = float(_env_value("OPENTALKING_QUICKTALK_NECK_FADE_START", "0.72"))
self._neck_fade_end = float(_env_value("OPENTALKING_QUICKTALK_NECK_FADE_END", "0.88"))
self._max_template_seconds_env = _env_value("OPENTALKING_QUICKTALK_MAX_TEMPLATE_SECONDS")
self._model_backend = _env_value("OPENTALKING_QUICKTALK_MODEL_BACKEND", "auto")
self._model_backend = _env_value(
"OPENTALKING_QUICKTALK_MODEL_BACKEND",
str(getattr(settings, "quicktalk_model_backend", "") or "").strip()
if settings is not None
else "auto",
)
# Idle frame selection. The template video typically contains the source
# speaker talking, so cycling all frames during idle makes the avatar
# appear to keep speaking. We restrict idle to a configurable still
@@ -369,14 +458,25 @@ class QuickTalkAdapter:
return None
@staticmethod
def runtime_available() -> bool:
def runtime_available(settings: Any | None = None) -> bool:
try:
asset_root = _quicktalk_asset_root_env()
asset_root = _quicktalk_asset_root_config(settings)
if asset_root is None:
return False
_validate_asset_root(_normalize_asset_root(asset_root))
device = _env_value("OPENTALKING_QUICKTALK_DEVICE") or _env_value("OPENTALKING_TORCH_DEVICE", "cuda:0")
hubert_device = _env_value("OPENTALKING_QUICKTALK_HUBERT_DEVICE")
device = _configured_quicktalk_device(
getattr(settings, "quicktalk_device", None) if settings is not None else None,
getattr(settings, "torch_device", None) if settings is not None else None,
getattr(settings, "device", None) if settings is not None else None,
)
hubert_device = (
_env_value("OPENTALKING_QUICKTALK_HUBERT_DEVICE")
or (
str(getattr(settings, "quicktalk_hubert_device", "") or "").strip()
if settings is not None
else ""
)
)
return _explicit_cuda_available(device) and _explicit_cuda_available(hubert_device)
except Exception:
return False
@@ -404,14 +504,23 @@ class QuickTalkAdapter:
def load_avatar(self, avatar_path: str) -> QuickTalkState:
bundle = load_avatar_bundle(Path(avatar_path), strict=False)
metadata = bundle.manifest.metadata or {}
asset_root = self._asset_root if self._asset_root is not None else _path_from_env_or_metadata(
"OPENTALKING_QUICKTALK_ASSET_ROOT",
metadata,
"asset_root",
"quicktalk_asset_root",
base_dir=bundle.path,
sections=("quicktalk",),
)
asset_root = self._asset_root
if asset_root is None:
try:
asset_root = _path_from_env_or_metadata(
"OPENTALKING_QUICKTALK_ASSET_ROOT",
metadata,
"asset_root",
"quicktalk_asset_root",
base_dir=bundle.path,
sections=("quicktalk",),
)
except ValueError:
asset_root = _quicktalk_asset_root_config()
if asset_root is None:
raise
if asset_root is None:
raise ValueError("Missing OPENTALKING_QUICKTALK_ASSET_ROOT or QuickTalk settings asset root")
asset_root = _normalize_asset_root(asset_root)
_validate_asset_root(asset_root)
prepared_template, face_cache_file = _prepared_quicktalk_template_and_cache(
@@ -561,11 +670,24 @@ class QuickTalkAdapter:
np.asarray(audio_chunk.data, dtype=np.int16).reshape(-1),
int(audio_chunk.sample_rate),
)
return QuickTalkFeatures(reps=reps, audio_feature_seconds=feature_seconds)
render_reps, output_fps = _quicktalk_render_plan(
reps,
worker_fps=float(getattr(avatar_state.worker, "fps", 25) or 25),
)
avatar_state.fps = output_fps
return QuickTalkFeatures(
reps=reps,
audio_feature_seconds=feature_seconds,
render_reps=render_reps,
output_fps=output_fps,
)
def infer(self, features: QuickTalkFeatures, avatar_state: QuickTalkState) -> Iterator[np.ndarray]:
render_reps = features.render_reps if features.render_reps is not None else features.reps
if features.output_fps is not None:
avatar_state.fps = features.output_fps
return avatar_state.worker.generate_frames_from_reps(
features.reps, state=avatar_state.session_state
render_reps, state=avatar_state.session_state
)
def compose_frame(
@@ -593,11 +715,27 @@ class QuickTalkAdapter:
np.asarray(audio_chunk.data, dtype=np.int16).reshape(-1),
int(audio_chunk.sample_rate),
)
features = QuickTalkFeatures(reps=reps, audio_feature_seconds=feature_seconds)
render_reps, output_fps = _quicktalk_render_plan(
reps,
worker_fps=float(getattr(avatar_state.worker, "fps", 25) or 25),
)
features = QuickTalkFeatures(
reps=reps,
audio_feature_seconds=feature_seconds,
render_reps=render_reps,
output_fps=output_fps,
)
frames = []
for prediction in avatar_state.worker.generate_frames_from_reps(
reps, state=avatar_state.session_state
):
frames.append(self.compose_frame(avatar_state, avatar_state.frame_index, prediction))
avatar_state.frame_index += 1
previous_fps = getattr(avatar_state, "fps", None)
avatar_state.fps = output_fps
try:
predictions = avatar_state.worker.generate_frames_from_reps(
render_reps, state=avatar_state.session_state
)
for prediction in predictions:
frames.append(self.compose_frame(avatar_state, avatar_state.frame_index, prediction))
avatar_state.frame_index += 1
finally:
if previous_fps is not None:
avatar_state.fps = previous_fps
return features, frames

View File

@@ -0,0 +1,131 @@
from __future__ import annotations
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any
log = logging.getLogger(__name__)
# Immutable record describing one possible location of the QuickTalk asset root.
@dataclass(frozen=True)
class QuickTalkRootCandidate:
# Label of the settings field or environment variable that supplied the path.
source: str
# Candidate directory for the QuickTalk assets.
path: Path
# True when the candidate came from a legacy name kept for existing installations.
deprecated: bool = False
# True when the candidate is the settings.models_dir fallback rather than an explicit root.
default: bool = False
def _path_from_raw(raw: object) -> Path | None:
value = str(raw or "").strip()
if not value:
return None
return Path(value).expanduser().resolve()
def _settings_path(settings: Any | None, name: str) -> Path | None:
    """Read attribute *name* from *settings* as a resolved path.

    Returns ``None`` when *settings* is ``None`` or the attribute is missing
    or blank.
    """
    return None if settings is None else _path_from_raw(getattr(settings, name, ""))
def _env_path(name: str) -> Path | None:
    """Read environment variable *name* as a resolved path, or ``None`` if unset or blank."""
    raw_value = os.environ.get(name, "")
    return _path_from_raw(raw_value)
def quicktalk_asset_root_candidates(
settings: Any | None = None,
*,
include_legacy: bool = True,
include_default: bool = True,
) -> list[QuickTalkRootCandidate]:
"""Return QuickTalk asset-root candidates in the supported priority order.

New deployments should set only ``OPENTALKING_QUICKTALK_ASSET_ROOT`` or the
corresponding settings field. The other names are kept only so existing
installations do not break during upgrades.

Candidates are appended in this order, each only when its value is set and
non-blank: ``settings.quicktalk_asset_root``, the
``OPENTALKING_QUICKTALK_ASSET_ROOT`` environment variable, the legacy names
(marked ``deprecated=True``), and finally ``settings.models_dir/quicktalk``
(marked ``default=True``).

Args:
    settings: Optional settings object; attributes are read with ``getattr``.
    include_legacy: Whether the deprecated names are considered.
    include_default: Whether the ``models_dir`` fallback is considered; it
        also requires *settings* to be provided.

Returns:
    The candidates found, highest priority first; possibly empty.
"""
candidates: list[QuickTalkRootCandidate] = []
# 1. Preferred explicit root from the settings object.
settings_asset_root = _settings_path(settings, "quicktalk_asset_root")
if settings_asset_root is not None:
candidates.append(
QuickTalkRootCandidate("settings.quicktalk_asset_root", settings_asset_root)
)
# 2. Preferred explicit root from the environment.
env_asset_root = _env_path("OPENTALKING_QUICKTALK_ASSET_ROOT")
if env_asset_root is not None:
candidates.append(
QuickTalkRootCandidate("OPENTALKING_QUICKTALK_ASSET_ROOT", env_asset_root)
)
# 3. Legacy names, all tagged deprecated=True.
# NOTE(review): indentation is not visible in this view; presumably the env-var loop and
# the OMNIRT_MODEL_ROOT lookup below are also nested under `if include_legacy:` — confirm.
if include_legacy:
legacy_settings_root = _settings_path(settings, "quicktalk_model_root")
if legacy_settings_root is not None:
candidates.append(
QuickTalkRootCandidate(
"settings.quicktalk_model_root",
legacy_settings_root,
deprecated=True,
)
)
for name in ("OPENTALKING_QUICKTALK_MODEL_ROOT", "OMNIRT_QUICKTALK_MODEL_ROOT"):
path = _env_path(name)
if path is not None:
candidates.append(QuickTalkRootCandidate(name, path, deprecated=True))
# OMNIRT_MODEL_ROOT names a parent directory; the QuickTalk assets are taken from its
# "quicktalk" subdirectory.
omnirt_model_root = _env_path("OMNIRT_MODEL_ROOT")
if omnirt_model_root is not None:
candidates.append(
QuickTalkRootCandidate(
"OMNIRT_MODEL_ROOT/quicktalk",
(omnirt_model_root / "quicktalk").resolve(),
deprecated=True,
)
)
# 4. Fallback derived from settings.models_dir, tagged default=True.
if include_default and settings is not None:
models_dir = _path_from_raw(getattr(settings, "models_dir", ""))
if models_dir is not None:
candidates.append(
QuickTalkRootCandidate(
"settings.models_dir/quicktalk",
(models_dir / "quicktalk").resolve(),
default=True,
)
)
return candidates
def resolve_quicktalk_asset_root(
    settings: Any | None = None,
    *,
    include_legacy: bool = True,
    include_default: bool = True,
) -> Path | None:
    """Return the highest-priority QuickTalk asset root, or ``None`` if there is none.

    Logs a warning when several explicit candidates point at different
    directories; the first candidate still wins.
    """
    ranked = quicktalk_asset_root_candidates(
        settings,
        include_legacy=include_legacy,
        include_default=include_default,
    )
    if ranked:
        _warn_conflicting_explicit_roots(ranked)
        return ranked[0].path
    return None
def _warn_conflicting_explicit_roots(candidates: list[QuickTalkRootCandidate]) -> None:
explicit = [candidate for candidate in candidates if not candidate.default]
unique_paths = {candidate.path for candidate in explicit}
if len(unique_paths) <= 1:
return
formatted = ", ".join(f"{candidate.source}={candidate.path}" for candidate in explicit)
log.warning(
"Found conflicting QuickTalk asset roots; using %s=%s. Conflicting roots: %s",
candidates[0].source,
candidates[0].path,
formatted,
)

View File

@@ -47,10 +47,22 @@ def run_cmd(cmd: Sequence[str]) -> None:
def ensure_ffmpeg() -> str:
configured = os.environ.get("OPENTALKING_FFMPEG_BIN", "").strip()
if configured:
return configured
ffmpeg = shutil.which("ffmpeg")
if not ffmpeg:
raise RuntimeError("ffmpeg not found in PATH")
return ffmpeg
if ffmpeg:
return ffmpeg
try:
import imageio_ffmpeg
return str(imageio_ffmpeg.get_ffmpeg_exe())
except Exception as exc:
raise RuntimeError(
"ffmpeg not found. Install ffmpeg or install imageio-ffmpeg; "
"on macOS, `uv sync --extra models --extra quicktalk-cpu --python 3.11` "
"includes the fallback binary."
) from exc
def maybe_mkdir(path: Path) -> None:
@@ -142,14 +154,7 @@ class QuickTalkModelBackend(Protocol):
class OnnxQuickTalkModel:
def __init__(self, onnx_path: Path, device: torch.device) -> None:
if device.type == "cuda":
device_id = device.index if device.index is not None else 0
providers = [
("CUDAExecutionProvider", {"device_id": device_id}),
"CPUExecutionProvider",
]
else:
providers = ["CPUExecutionProvider"]
providers = _onnx_providers_for_device(device)
self.session = ort.InferenceSession(str(onnx_path), providers=providers)
self.input_names = [x.name for x in self.session.get_inputs()]
@@ -172,6 +177,20 @@ class OnnxQuickTalkModel:
return cast(np.ndarray, g), cast(np.ndarray, hn_out), cast(np.ndarray, cn_out)
def _onnx_providers_for_device(device: torch.device) -> list[str | tuple[str, dict[str, int]]]:
    """Build the ONNX Runtime provider list for *device*.

    Accelerated providers are offered only when ONNX Runtime reports them as
    available: CUDA for ``cuda`` devices (bound to the device index, 0 when
    unspecified) and CoreML for ``mps`` devices. ``CPUExecutionProvider`` is
    always last.
    """
    installed = set(ort.get_available_providers())
    kind = device.type

    if kind == "cuda":
        gpu_index = 0 if device.index is None else device.index
        if "CUDAExecutionProvider" not in installed:
            return ["CPUExecutionProvider"]
        return [
            ("CUDAExecutionProvider", {"device_id": gpu_index}),
            "CPUExecutionProvider",
        ]
    if kind == "mps" and "CoreMLExecutionProvider" in installed:
        return ["CoreMLExecutionProvider", "CPUExecutionProvider"]
    return ["CPUExecutionProvider"]
class TorchQuickTalkModel:
input_names = ["input_1", "input_2", "input_3", "input_4"]

View File

@@ -29,6 +29,38 @@ def make_audio_chunk(audio_pcm: np.ndarray, *, sample_rate: int = 16000) -> Audi
return AudioChunk(data=pcm, sample_rate=int(sample_rate), duration_ms=duration_ms)
def _positive_int_env(*names: str) -> int | None:
for name in names:
raw = os.environ.get(name, "").strip()
if not raw:
continue
try:
value = int(raw)
except ValueError:
logger.warning("Ignoring invalid positive integer env %s=%r", name, raw)
continue
if value > 0:
return value
logger.warning("Ignoring non-positive integer env %s=%r", name, raw)
return None
def _quicktalk_slice_len_for_device(device: str) -> int:
    """Return the QuickTalk slice length, in frames, for *device*.

    ``OPENTALKING_QUICKTALK_SLICE_LEN`` (then
    ``OPENTALKING_QUICKTALK_CHUNK_FRAMES``) overrides everything. Without an
    override, devices whose name starts with ``mps`` use 12 and all others
    use 28.
    """
    override = _positive_int_env(
        "OPENTALKING_QUICKTALK_SLICE_LEN",
        "OPENTALKING_QUICKTALK_CHUNK_FRAMES",
    )
    if override is not None:
        return override

    on_mps = str(device or "").strip().lower().startswith("mps")
    return 12 if on_mps else 28
def _quicktalk_fps() -> int:
    """Return the QuickTalk frame rate: ``OPENTALKING_QUICKTALK_FPS`` if valid, else 25."""
    configured = _positive_int_env("OPENTALKING_QUICKTALK_FPS")
    return configured if configured else 25
@runtime_checkable
class Audio2VideoClient(Protocol):
"""Common realtime audio-to-video client contract for local and OmniRT backends."""
@@ -359,9 +391,8 @@ class LocalAudio2VideoClient:
frames = getattr(state, "frames", None)
self.frame_num = len(frames) if frames is not None else 1
if self._is_quicktalk_adapter():
self.fps = 25
if self.slice_len <= 0:
self.slice_len = 28
self.fps = _quicktalk_fps()
self.slice_len = _quicktalk_slice_len_for_device(self.device)
self.audio_chunk_samples = max(
1,
int(round(float(self.sample_rate) * float(self.slice_len) / max(1, self.fps))),

View File

@@ -87,7 +87,7 @@ def _explicit_env_enabled(name: str) -> bool:
return raw is not None and raw.strip().lower() in {"1", "true", "yes", "on"}
def _local_adapter_available(model: str) -> bool:
def _local_adapter_available(model: str, settings=None) -> bool:
try:
adapter = get_adapter(model)
except Exception:
@@ -95,7 +95,12 @@ def _local_adapter_available(model: str) -> bool:
runtime_available = getattr(adapter, "runtime_available", None)
if callable(runtime_available):
try:
return bool(runtime_available())
return bool(runtime_available(settings=settings))
except TypeError:
try:
return bool(runtime_available())
except Exception:
return False
except Exception:
return False
return True
@@ -131,7 +136,7 @@ async def resolve_model_statuses(settings) -> list[ModelStatus]:
connected = True
reason = "local_self_test"
elif resolved.backend == "local":
connected = _local_adapter_available(model)
connected = _local_adapter_available(model, settings=settings)
reason = "local_runtime" if connected else "local_adapter_missing"
elif resolved.backend == "omnirt":
if has_omnirt:

View File

@@ -45,7 +45,9 @@ def direct_ws_url(model: str, settings: Any) -> str:
def resolve_model_backend(model: str, settings: Any) -> ModelBackend:
model = model.strip().lower()
backend = get_model_backend(model)
backend = str(getattr(settings, f"{model}_backend", "") or "").strip().lower()
if backend not in {"mock", "local", "omnirt", "direct_ws"}:
backend = get_model_backend(model)
if backend == "direct_ws":
return ModelBackend(model=model, backend=backend, ws_url=direct_ws_url(model, settings))
return ModelBackend(model=model, backend=backend)

View File

@@ -90,15 +90,15 @@ def _log_task_exception(task: asyncio.Task, sid: str) -> None:
def _local_runner_device(model: str, settings: Any, default_device: str) -> str:
model = model.strip().lower()
if model == "quicktalk":
return str(
os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
or getattr(settings, "quicktalk_device", "")
or os.environ.get("OPENTALKING_TORCH_DEVICE")
or getattr(settings, "torch_device", "")
or os.environ.get("OPENTALKING_DEVICE")
or getattr(settings, "device", "")
or os.environ.get("DEVICE")
or default_device
from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
return _configured_quicktalk_device(
getattr(settings, "quicktalk_device", ""),
os.environ.get("OPENTALKING_DEVICE"),
os.environ.get("DEVICE"),
getattr(settings, "torch_device", ""),
getattr(settings, "device", ""),
default_device,
)
if model == "wav2lip":
return str(

View File

@@ -97,9 +97,11 @@ local-qwen3-tts-service = [
"uvicorn[standard]>=0.27",
]
quicktalk-cpu = [
"imageio-ffmpeg>=0.5",
"onnxruntime>=1.24.3",
]
quicktalk-cuda = [
"imageio-ffmpeg>=0.5",
"onnxruntime-gpu>=1.24.0",
]
local-cosyvoice-service = [

View File

@@ -134,3 +134,55 @@ quickstart_describe_port() {
return 1
}
# Print the ffmpeg executable to use, trying in order:
#   1. $OPENTALKING_FFMPEG_BIN when it is non-empty,
#   2. the `ffmpeg` found on PATH,
#   3. the binary reported by the imageio_ffmpeg Python package.
# Returns 0 for cases 1 and 2; in case 3 the exit status is that of the Python process.
quickstart_resolve_ffmpeg() {
# Interpreter for the imageio_ffmpeg fallback: the repo virtualenv when $repo_root is set.
local py_bin="${repo_root:-}/.venv/bin/python"
if [[ -n "${OPENTALKING_FFMPEG_BIN:-}" ]]; then
printf '%s\n' "$OPENTALKING_FFMPEG_BIN"
return 0
fi
if command -v ffmpeg >/dev/null 2>&1; then
command -v ffmpeg
return 0
fi
# No usable virtualenv interpreter: fall back to whatever python3 is on PATH.
if [[ ! -x "$py_bin" ]]; then
py_bin="python3"
fi
"$py_bin" - <<'PY'
import imageio_ffmpeg
print(imageio_ffmpeg.get_ffmpeg_exe())
PY
}
# Start a command in its own session, detached from the terminal, and print its PID.
#   $1    log file receiving the command's stdout and stderr
#   $2..  command and arguments to run
# Uses `setsid` when available; otherwise python3 starts the process with
# start_new_session=True.
# NOTE: the setsid branch truncates the log file (`>`), while the Python branch opens it
# in append mode ("ab").
quickstart_detach() {
local log_file="$1"
shift
if command -v setsid >/dev/null 2>&1; then
setsid "$@" >"$log_file" 2>&1 < /dev/null &
printf '%s\n' "$!"
return 0
fi
# Fallback for systems without setsid: the script receives the log file as argv[1] and
# the command as argv[2:].
python3 - "$log_file" "$@" <<'PY'
import subprocess
import sys
log_file = sys.argv[1]
argv = sys.argv[2:]
with open(log_file, "ab", buffering=0) as log:
process = subprocess.Popen(
argv,
stdin=subprocess.DEVNULL,
stdout=log,
stderr=subprocess.STDOUT,
close_fds=True,
start_new_session=True,
)
print(process.pid)
PY
}

View File

@@ -96,8 +96,7 @@ echo " api: http://127.0.0.1:$backend_port"
(
cd "$web_dir"
export VITE_BACKEND_PORT="$backend_port"
setsid ./node_modules/.bin/vite --host "$web_host" --port "$web_port" >"$log_file" 2>&1 < /dev/null &
echo "$!" >"$pid_file"
quickstart_detach "$log_file" ./node_modules/.bin/vite --host "$web_host" --port "$web_port" >"$pid_file"
)
pid="$(cat "$pid_file" 2>/dev/null || true)"

View File

@@ -139,14 +139,13 @@ fi
export OPENTALKING_FLASHTALK_TTS_TRAILING_SILENCE_MS="${OPENTALKING_FLASHTALK_TTS_TRAILING_SILENCE_MS:-320}"
# 其它运行时参数
export OPENTALKING_FFMPEG_BIN="${OPENTALKING_FFMPEG_BIN:-ffmpeg}"
export OPENTALKING_FFMPEG_BIN="$(quickstart_resolve_ffmpeg)"
export OPENTALKING_TTS_STREAMING_DECODE="${OPENTALKING_TTS_STREAMING_DECODE:-1}"
export OPENTALKING_TTS_SAMPLE_RATE="${OPENTALKING_TTS_SAMPLE_RATE:-16000}"
export OMNIRT_AUDIO2VIDEO_PATH_TEMPLATE="${OMNIRT_AUDIO2VIDEO_PATH_TEMPLATE:-/v1/audio2video/{model}}"
export FLASHTALK_PREBUFFER_CHUNKS="${FLASHTALK_PREBUFFER_CHUNKS:-2}"
setsid opentalking-unified >"$log_file" 2>&1 < /dev/null &
echo "$!" >"$pid_file"
quickstart_detach "$log_file" opentalking-unified >"$pid_file"
)
pid="$(cat "$pid_file" 2>/dev/null || true)"

View File

@@ -189,6 +189,31 @@ if [[ "$backend" == "local" && "$model" == "musetalk" ]]; then
bash "$quickstart_dir/prepare_local_musetalk.sh"
fi
if [[ "$backend" == "local" && "$model" == "quicktalk" ]]; then
export OMNIRT_ENDPOINT=""
export OPENTALKING_OMNIRT_ENDPOINT=""
if [[ "$(uname -s)" == "Darwin" && -z "${OPENTALKING_QUICKTALK_DEVICE:-}" && -z "${OPENTALKING_TORCH_DEVICE:-}" ]]; then
quicktalk_mac_device="$("$script_dir/../.venv/bin/python" - <<'PY' 2>/dev/null || true
import platform
import sys
if sys.platform == 'darwin' and platform.machine().lower() in {'arm64', 'aarch64'}:
try:
import torch
print('mps' if torch.backends.mps.is_available() else 'cpu')
except Exception:
print('cpu')
PY
)"
quicktalk_mac_device="${quicktalk_mac_device:-cpu}"
export OPENTALKING_QUICKTALK_DEVICE="$quicktalk_mac_device"
export OPENTALKING_TORCH_DEVICE="$quicktalk_mac_device"
echo "Apple Silicon QuickTalk local defaults: OPENTALKING_QUICKTALK_DEVICE=$quicktalk_mac_device"
echo "Install macOS QuickTalk dependencies with: uv sync --extra dev --extra models --extra quicktalk-cpu --python 3.11"
fi
fi
bash "$quickstart_dir/start_opentalking.sh" "${start_args[@]}"
bash "$quickstart_dir/start_frontend.sh" "${web_args[@]}"

View File

@@ -321,3 +321,61 @@ async def test_local_quicktalk_uses_omnirt_chunk_defaults(tmp_path: Path) -> Non
assert init["slice_len"] == 28
assert init["chunk_samples"] == 17920
assert client.audio_chunk_samples == 17920
@pytest.mark.asyncio
async def test_local_quicktalk_uses_smaller_chunks_on_mps_by_default(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the default chunking reported by an MPS-backed local client.

    With no chunk-size or fps overrides in the environment, ``init_session``
    is expected to report 25 fps, a 12-frame slice and 7680-sample chunks.
    """
    # Clear every override that feeds the asserted values. The fps variable is
    # included because the fps-override test in this module shows it changes
    # both ``fps`` and ``chunk_samples``; leaving it set in a developer's shell
    # would make this test fail for reasons unrelated to the code under test.
    for env_name in (
        "OPENTALKING_QUICKTALK_SLICE_LEN",
        "OPENTALKING_QUICKTALK_CHUNK_FRAMES",
        "OPENTALKING_QUICKTALK_FPS",
    ):
        monkeypatch.delenv(env_name, raising=False)

    adapter = FakeQuickTalkLocalAdapter()
    client = LocalAudio2VideoClient(adapter, device="mps")
    avatar = tmp_path / "avatar"
    avatar.mkdir()

    init = await client.init_session(avatar_path=avatar)

    assert init["fps"] == 25
    assert init["slice_len"] == 12
    assert init["chunk_samples"] == 7680
    assert client.audio_chunk_samples == 7680
@pytest.mark.asyncio
async def test_local_quicktalk_slice_len_env_overrides_mps_default(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that OPENTALKING_QUICKTALK_SLICE_LEN wins over the MPS default.

    Setting the slice length to 16 should be reflected in ``slice_len`` and
    yield 10240-sample chunks.
    """
    monkeypatch.setenv("OPENTALKING_QUICKTALK_SLICE_LEN", "16")
    # Isolate the test from other chunking overrides in the caller's shell.
    # The fps-override test in this module shows the fps variable changes
    # ``chunk_samples``; the chunk-frames variable is cleared to match the
    # sibling MPS tests (its precedence relative to SLICE_LEN is not visible
    # here, so clearing it is the safe choice).
    for env_name in (
        "OPENTALKING_QUICKTALK_CHUNK_FRAMES",
        "OPENTALKING_QUICKTALK_FPS",
    ):
        monkeypatch.delenv(env_name, raising=False)

    adapter = FakeQuickTalkLocalAdapter()
    client = LocalAudio2VideoClient(adapter, device="mps")
    avatar = tmp_path / "avatar"
    avatar.mkdir()

    init = await client.init_session(avatar_path=avatar)

    assert init["slice_len"] == 16
    assert init["chunk_samples"] == 10240
@pytest.mark.asyncio
async def test_local_quicktalk_fps_env_can_lower_mps_playback_rate(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that OPENTALKING_QUICKTALK_FPS lowers the reported frame rate.

    With fps forced to 14 and no slice overrides, the MPS client should keep
    the 12-frame slice while reporting 14 fps and 13714-sample chunks.
    """
    monkeypatch.setenv("OPENTALKING_QUICKTALK_FPS", "14")
    for unset_name in (
        "OPENTALKING_QUICKTALK_SLICE_LEN",
        "OPENTALKING_QUICKTALK_CHUNK_FRAMES",
    ):
        monkeypatch.delenv(unset_name, raising=False)

    mps_client = LocalAudio2VideoClient(FakeQuickTalkLocalAdapter(), device="mps")
    avatar_dir = tmp_path / "avatar"
    avatar_dir.mkdir()

    session_info = await mps_client.init_session(avatar_path=avatar_dir)

    expected_fields = {"fps": 14, "slice_len": 12, "chunk_samples": 13714}
    for field, expected_value in expected_fields.items():
        assert session_info[field] == expected_value
    assert mps_client.audio_chunk_samples == 13714

View File

@@ -1221,6 +1221,7 @@ def test_quicktalk_cuda_extra_declares_gpu_onnxruntime():
assert "quicktalk-cuda" in pyproject
assert "onnxruntime>=1.24.3" in pyproject
assert "onnxruntime-gpu>=1.24.0" in pyproject
assert "imageio-ffmpeg>=0.5" in pyproject
base_deps = pyproject.split("dependencies = [", 1)[1].split("]", 1)[0]
models_extra = pyproject.split("models = [", 1)[1].split("]", 1)[0]
@@ -1238,6 +1239,50 @@ def test_onnxruntime_extras_declare_uv_conflicts():
assert '{ extra = "demo" }' in pyproject
def test_quicktalk_onnx_provider_prefers_coreml_for_mps(monkeypatch):
    """An MPS torch device should map to CoreML first, then CPU."""
    from opentalking.models.quicktalk import runtime_v2

    def fake_available_providers():
        # Return a fresh list on every call, like the original stub.
        return ["CoreMLExecutionProvider", "CPUExecutionProvider"]

    monkeypatch.setattr(runtime_v2.ort, "get_available_providers", fake_available_providers)

    mps_device = runtime_v2.torch.device("mps")
    chosen = runtime_v2._onnx_providers_for_device(mps_device)

    assert chosen == [
        "CoreMLExecutionProvider",
        "CPUExecutionProvider",
    ]
def test_quicktalk_onnx_provider_keeps_cuda_for_cuda_device(monkeypatch):
    """A ``cuda:2`` device should select the CUDA provider with device_id 2."""
    from opentalking.models.quicktalk import runtime_v2

    def fake_available_providers():
        # Return a fresh list on every call, like the original stub.
        return ["CUDAExecutionProvider", "CPUExecutionProvider"]

    monkeypatch.setattr(runtime_v2.ort, "get_available_providers", fake_available_providers)

    cuda_device = runtime_v2.torch.device("cuda:2")
    chosen = runtime_v2._onnx_providers_for_device(cuda_device)

    assert chosen == [
        ("CUDAExecutionProvider", {"device_id": 2}),
        "CPUExecutionProvider",
    ]
def test_quicktalk_runtime_uses_imageio_ffmpeg_fallback(monkeypatch):
    """``ensure_ffmpeg`` should return the imageio-ffmpeg binary as a fallback.

    The env override is removed and ``shutil.which`` is stubbed to find
    nothing, so the only remaining source is the fake ``imageio_ffmpeg``.
    """
    import types

    from opentalking.models.quicktalk import runtime_v2

    def which_finds_nothing(_):
        return None

    monkeypatch.delenv("OPENTALKING_FFMPEG_BIN", raising=False)
    monkeypatch.setattr(runtime_v2.shutil, "which", which_finds_nothing)

    imageio_stub = types.SimpleNamespace()
    imageio_stub.get_ffmpeg_exe = lambda: "/tmp/ffmpeg-imageio"
    monkeypatch.setitem(runtime_v2.sys.modules, "imageio_ffmpeg", imageio_stub)

    resolved = runtime_v2.ensure_ffmpeg()
    assert resolved == "/tmp/ffmpeg-imageio"
def test_download_script_excludes_experimental_model_candidates():
from scripts import download_local_audio_models as downloader

View File

@@ -69,7 +69,7 @@ def test_quickstart_source_env_keeps_new_env_file_assignments(tmp_path: Path) ->
pytest.skip("bash is not available")
env_file = tmp_path / "quickstart.env"
env_file.write_text(
"OPENTALKING_QUICKTALK_MODEL_ROOT=/models/quicktalk\n"
"OPENTALKING_QUICKTALK_ASSET_ROOT=/models/quicktalk\n"
"OPENTALKING_WAV2LIP_DEVICE=cuda:6\n",
encoding="utf-8",
)
@@ -77,18 +77,52 @@ def test_quickstart_source_env_keeps_new_env_file_assignments(tmp_path: Path) ->
script = f"""
set -euo pipefail
export OPENTALKING_TORCH_DEVICE=cuda:6
unset OPENTALKING_QUICKTALK_MODEL_ROOT
unset OPENTALKING_QUICKTALK_ASSET_ROOT
unset OPENTALKING_WAV2LIP_DEVICE
source scripts/quickstart/_helpers.sh
quickstart_source_env {env_file}
bash -c 'test "$OPENTALKING_TORCH_DEVICE" = cuda:6'
bash -c 'test "$OPENTALKING_QUICKTALK_MODEL_ROOT" = /models/quicktalk'
bash -c 'test "$OPENTALKING_QUICKTALK_ASSET_ROOT" = /models/quicktalk'
bash -c 'test "$OPENTALKING_WAV2LIP_DEVICE" = cuda:6'
"""
subprocess.run(["bash", "-lc", script], cwd=REPO_ROOT, check=True)
def test_start_unified_sets_apple_silicon_quicktalk_defaults() -> None:
    """Check that start_unified.sh contains the QuickTalk macOS default logic."""
    script_text = (REPO_ROOT / "scripts/start_unified.sh").read_text(encoding="utf-8")
    required_snippets = (
        'if [[ "$backend" == "local" && "$model" == "quicktalk" ]]',
        "quicktalk-cpu",
        "OPENTALKING_QUICKTALK_DEVICE",
        "sys.platform == 'darwin'",
    )
    for snippet in required_snippets:
        assert snippet in script_text
@pytest.mark.parametrize(
    "relpath",
    [
        "scripts/quickstart/start_opentalking.sh",
        "scripts/quickstart/start_frontend.sh",
    ],
)
def test_quickstart_process_launch_does_not_require_setsid_on_macos(relpath: str) -> None:
    """Launch scripts must go through ``quickstart_detach``.

    The helpers file is checked for both a ``setsid`` availability probe and
    a ``start_new_session=True`` alternative.
    """
    launcher_text = (REPO_ROOT / relpath).read_text(encoding="utf-8")
    helpers_text = (REPO_ROOT / "scripts/quickstart/_helpers.sh").read_text(encoding="utf-8")

    assert "quickstart_detach" in launcher_text
    for needle in ("command -v setsid", "start_new_session=True"):
        assert needle in helpers_text
def test_start_opentalking_resolves_ffmpeg_fallback() -> None:
    """The start script must resolve ffmpeg via the helper, not a fixed default."""
    quickstart_dir = REPO_ROOT / "scripts/quickstart"
    launcher_text = (quickstart_dir / "start_opentalking.sh").read_text(encoding="utf-8")
    helpers_text = (quickstart_dir / "_helpers.sh").read_text(encoding="utf-8")

    hardcoded_default = 'OPENTALKING_FFMPEG_BIN="${OPENTALKING_FFMPEG_BIN:-ffmpeg}"'

    assert "quickstart_resolve_ffmpeg" in launcher_text
    assert hardcoded_default not in launcher_text
    assert "imageio_ffmpeg.get_ffmpeg_exe()" in helpers_text
def test_quickstart_source_ascend_env_tolerates_unset_ld_library_path(tmp_path: Path) -> None:
if shutil.which("bash") is None:
pytest.skip("bash is not available")

View File

@@ -8,8 +8,12 @@ from pathlib import Path
import numpy as np
import pytest
from opentalking.core.types.frames import VideoFrameData
from opentalking.models.quicktalk.adapter import QuickTalkAdapter
from opentalking.core.types.frames import AudioChunk, VideoFrameData
from opentalking.models.quicktalk.adapter import (
QuickTalkAdapter,
_configured_quicktalk_device,
_default_quicktalk_device,
)
def _write_quicktalk_local_assets(asset_root: Path) -> None:
@@ -56,6 +60,47 @@ def test_quicktalk_runtime_available_rejects_unavailable_explicit_cuda(
assert QuickTalkAdapter.runtime_available() is False
def test_quicktalk_default_device_prefers_mps_on_apple_silicon(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """On Darwin/arm64 with MPS available, the default device should be ``mps``."""
    # Minimal stand-in for ``torch`` exposing only ``backends.mps.is_available``.
    mps_stub = types.SimpleNamespace(is_available=lambda: True)
    torch_stub = types.SimpleNamespace(backends=types.SimpleNamespace(mps=mps_stub))
    monkeypatch.setitem(sys.modules, "torch", torch_stub)

    for target, reported in (("platform.system", "Darwin"), ("platform.machine", "arm64")):
        monkeypatch.setattr(target, lambda reported=reported: reported)

    assert _default_quicktalk_device() == "mps"
def test_quicktalk_default_device_falls_back_to_cpu_on_apple_silicon_without_mps(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """On Darwin/arm64 without MPS, the default device should be ``cpu``."""
    # Minimal stand-in for ``torch`` whose MPS backend reports unavailable.
    mps_stub = types.SimpleNamespace(is_available=lambda: False)
    torch_stub = types.SimpleNamespace(backends=types.SimpleNamespace(mps=mps_stub))
    monkeypatch.setitem(sys.modules, "torch", torch_stub)

    for target, reported in (("platform.system", "Darwin"), ("platform.machine", "arm64")):
        monkeypatch.setattr(target, lambda reported=reported: reported)

    assert _default_quicktalk_device() == "cpu"
def test_quicktalk_configured_device_preserves_explicit_generic_device(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An ``auto`` first argument should yield the explicit second device."""
    for env_name in ("OPENTALKING_QUICKTALK_DEVICE", "OPENTALKING_TORCH_DEVICE"):
        monkeypatch.delenv(env_name, raising=False)

    resolved_device = _configured_quicktalk_device("auto", "cuda:3")

    assert resolved_device == "cuda:3"
def test_quicktalk_adapter_treats_empty_asset_root_env_as_unset(
monkeypatch: pytest.MonkeyPatch,
) -> None:
@@ -127,6 +172,94 @@ def test_quicktalk_adapter_falls_back_to_model_root_env(
assert captured["template_video"] == template.resolve()
def test_quicktalk_adapter_falls_back_to_settings_asset_root(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that the adapter takes its asset root and devices from settings.

    The asset-root environment variables are blanked, ``get_settings`` is
    replaced with a stub, and the runtime module is swapped for a fake worker
    that records its constructor arguments. Loading the avatar should then
    pass the settings' asset root, template video, device, HuBERT device and
    model backend to the worker.
    """
    asset_root = tmp_path / "models" / "quicktalk"
    _write_quicktalk_pth_assets(asset_root)

    # Build a minimal avatar directory: a template video plus a manifest.
    avatar_dir = tmp_path / "avatars" / "anchor"
    quicktalk_dir = avatar_dir / "quicktalk"
    quicktalk_dir.mkdir(parents=True)
    template = quicktalk_dir / "template_512x512.mp4"
    template.write_bytes(b"video")
    (avatar_dir / "manifest.json").write_text(
        json.dumps(
            {
                "id": "anchor",
                "model_type": "quicktalk",
                "fps": 25,
                "sample_rate": 16000,
                "width": 512,
                "height": 512,
                "version": "1.0",
            }
        ),
        encoding="utf-8",
    )

    captured: dict[str, Path | str | None] = {}

    class FakeWorker:
        """Stand-in worker that records the keyword arguments it receives."""

        fps = 25

        def __init__(
            self,
            *,
            asset_root: Path,
            template_video: Path,
            device: str,
            hubert_device: str | None,
            model_backend: str,
            **_: object,
        ) -> None:
            captured["asset_root"] = asset_root
            captured["template_video"] = template_video
            captured["device"] = device
            captured["hubert_device"] = hubert_device
            captured["model_backend"] = model_backend

        def make_state(self) -> object:
            return object()

    fake_runtime = types.ModuleType("opentalking.models.quicktalk.runtime")
    fake_runtime.RealtimeV3Worker = FakeWorker
    monkeypatch.setitem(sys.modules, "opentalking.models.quicktalk.runtime", fake_runtime)

    # Blank the OpenTalking roots (present but empty) and drop the OmniRT one.
    # A preceding delenv for the two blanked variables would be dead code,
    # because setenv overwrites the value and monkeypatch restores the
    # original on teardown either way.
    monkeypatch.delenv("OMNIRT_QUICKTALK_MODEL_ROOT", raising=False)
    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", "")
    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", "")

    from opentalking.core.config import get_settings

    # ``get_settings`` here is the real cached function; clear it before the
    # stub is installed and again afterwards so no stale settings leak.
    get_settings.cache_clear()
    monkeypatch.setattr(
        "opentalking.core.config.get_settings",
        lambda: types.SimpleNamespace(
            quicktalk_asset_root=str(asset_root),
            quicktalk_model_root="",
            quicktalk_device="mps",
            quicktalk_hubert_device="cpu",
            quicktalk_model_backend="onnx",
            torch_device="",
            device="",
        ),
    )
    try:
        adapter = QuickTalkAdapter()
        adapter.load_avatar(str(avatar_dir))
    finally:
        get_settings.cache_clear()

    assert captured["asset_root"] == asset_root.resolve()
    assert captured["template_video"] == template.resolve()
    assert captured["device"] == "mps"
    assert captured["hubert_device"] == "cpu"
    assert captured["model_backend"] == "onnx"
def test_quicktalk_adapter_accepts_avatar_with_quicktalk_metadata(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
@@ -556,6 +689,94 @@ def test_quicktalk_adapter_warmup_runs_silence_and_restores_stream_state() -> No
assert state.session_state == {"existing": True}
def test_quicktalk_adapter_can_downsample_generated_frames_for_mac(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check rep downsampling when OPENTALKING_QUICKTALK_FPS is 14.

    The fake worker reports 25 fps and produces 21 reps for the chunk; the
    adapter is expected to keep all 21 reps in the features but render only
    12 frames, from the specific rep indices asserted below.
    """
    monkeypatch.setenv("OPENTALKING_QUICKTALK_FPS", "14")
    adapter = QuickTalkAdapter()
    rendered_rep_ids: list[int] = []

    class FakeWorker:
        """Worker stub whose reps carry their own index as the single value."""

        fps = 25

        def prepare_pcm_features(self, pcm, sample_rate):
            reps = [np.full((1, 1), idx, dtype=np.float32) for idx in range(21)]
            return reps, 0.1

        def generate_frames_from_reps(self, reps, state=None):
            del state
            for rep in reps:
                # Record which rep was actually rendered before yielding.
                rendered_rep_ids.append(int(rep[0, 0]))
                yield np.zeros((4, 4, 3), dtype=np.uint8)

    state = types.SimpleNamespace(
        worker=FakeWorker(),
        fps=25,
        frame_index=0,
        session_state=None,
    )
    chunk = AudioChunk(
        data=np.zeros(13714, dtype=np.int16),
        sample_rate=16000,
        duration_ms=857.125,
    )

    features, frames = adapter.render_audio_chunk(
        state,  # type: ignore[arg-type]
        chunk,
    )

    assert len(features.reps) == 21
    assert len(frames) == 12
    assert state.frame_index == 12
    assert rendered_rep_ids == [0, 2, 4, 5, 7, 9, 11, 13, 15, 16, 18, 20]
def test_quicktalk_adapter_downsamples_through_live_render_pipeline(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check 14 fps downsampling when rendering via ``render_audio_chunk_sync``.

    The fake worker yields 21 reps at 25 fps; the pipeline should return 12
    frames, advance the frame index to 12, and timestamp the second frame at
    one 14 fps frame interval.
    """
    from opentalking.pipeline.speak.render_pipeline import render_audio_chunk_sync

    monkeypatch.setenv("OPENTALKING_QUICKTALK_FPS", "14")
    adapter = QuickTalkAdapter()
    rendered_rep_ids: list[int] = []

    class FakeWorker:
        """Worker stub whose reps carry their own index as the single value."""

        fps = 25

        def prepare_pcm_features(self, pcm, sample_rate):
            reps = [np.full((1, 1), idx, dtype=np.float32) for idx in range(21)]
            return reps, 0.1

        def generate_frames_from_reps(self, reps, state=None):
            del state
            for rep in reps:
                # Record which rep was actually rendered before yielding.
                rendered_rep_ids.append(int(rep[0, 0]))
                yield np.zeros((4, 4, 3), dtype=np.uint8)

    state = types.SimpleNamespace(
        worker=FakeWorker(),
        fps=25,
        frame_index=0,
        extra={},
        session_state=None,
    )
    chunk = AudioChunk(
        data=np.zeros(13714, dtype=np.int16),
        sample_rate=16000,
        duration_ms=857.125,
    )

    next_frame_idx, frames = render_audio_chunk_sync(
        adapter,
        state,
        chunk,
        frame_index_start=0,
        speech_frame_index_start=0,
    )

    frame_interval_ms = 1000.0 / 14.0
    assert next_frame_idx == 12
    assert len(frames) == 12
    assert frames[1].timestamp_ms == pytest.approx(frame_interval_ms)
    assert rendered_rep_ids == [0, 2, 4, 5, 7, 9, 11, 13, 15, 16, 18, 20]
def test_quicktalk_adapter_evicts_old_worker_cache_entries(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,

View File

@@ -0,0 +1,81 @@
from __future__ import annotations
from types import SimpleNamespace
import pytest
from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
def test_resolve_quicktalk_asset_root_uses_one_public_root_before_legacy(
    tmp_path,
    monkeypatch,
) -> None:
    """Walk the asset-root precedence chain from first choice to last.

    Every source starts populated; each step removes the current winner and
    asserts that the next source in line is returned.
    """
    from_settings_asset = tmp_path / "settings-assets"
    from_env_asset = tmp_path / "env-assets"
    from_settings_legacy = tmp_path / "settings-legacy"
    from_env_legacy = tmp_path / "env-legacy"
    from_omnirt = tmp_path / "omnirt-legacy"
    from_shared_omnirt = tmp_path / "shared-omnirt"

    initial_env = {
        "OPENTALKING_QUICKTALK_ASSET_ROOT": from_env_asset,
        "OPENTALKING_QUICKTALK_MODEL_ROOT": from_env_legacy,
        "OMNIRT_QUICKTALK_MODEL_ROOT": from_omnirt,
        "OMNIRT_MODEL_ROOT": from_shared_omnirt,
    }
    for env_name, env_path in initial_env.items():
        monkeypatch.setenv(env_name, str(env_path))

    settings = SimpleNamespace(
        quicktalk_asset_root=str(from_settings_asset),
        quicktalk_model_root=str(from_settings_legacy),
        models_dir=str(tmp_path / "repo-models"),
    )

    def current_root():
        return resolve_quicktalk_asset_root(settings)

    # 1. Settings asset root.
    assert current_root() == from_settings_asset.resolve()

    # 2. Asset-root environment variable.
    settings.quicktalk_asset_root = ""
    assert current_root() == from_env_asset.resolve()

    # 3. Legacy settings model root.
    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT")
    assert current_root() == from_settings_legacy.resolve()

    # 4. Legacy model-root environment variable.
    settings.quicktalk_model_root = ""
    assert current_root() == from_env_legacy.resolve()

    # 5. OmniRT QuickTalk-specific root.
    monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT")
    assert current_root() == from_omnirt.resolve()

    # 6. Shared OmniRT root with a "quicktalk" subdirectory.
    monkeypatch.delenv("OMNIRT_QUICKTALK_MODEL_ROOT")
    assert current_root() == (
        from_shared_omnirt / "quicktalk"
    ).resolve()

    # 7. "quicktalk" under the settings' models_dir.
    monkeypatch.delenv("OMNIRT_MODEL_ROOT")
    assert current_root() == (
        tmp_path / "repo-models" / "quicktalk"
    ).resolve()
def test_resolve_quicktalk_asset_root_can_skip_default_fallback(tmp_path, monkeypatch) -> None:
    """With no explicit roots and ``include_default=False``, expect ``None``."""
    root_env_names = (
        "OPENTALKING_QUICKTALK_ASSET_ROOT",
        "OPENTALKING_QUICKTALK_MODEL_ROOT",
        "OMNIRT_QUICKTALK_MODEL_ROOT",
        "OMNIRT_MODEL_ROOT",
    )
    for env_name in root_env_names:
        monkeypatch.delenv(env_name, raising=False)

    settings = SimpleNamespace(models_dir=str(tmp_path / "repo-models"))
    resolved = resolve_quicktalk_asset_root(settings, include_default=False)

    assert resolved is None
def test_resolve_quicktalk_asset_root_warns_on_conflicting_explicit_roots(
    tmp_path,
    monkeypatch,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Differing explicit roots should resolve to settings and log a conflict."""
    settings_root = tmp_path / "settings-assets"
    env_roots = {
        "OPENTALKING_QUICKTALK_ASSET_ROOT": tmp_path / "env-assets",
        "OPENTALKING_QUICKTALK_MODEL_ROOT": tmp_path / "legacy-env",
    }
    for env_name, env_path in env_roots.items():
        monkeypatch.setenv(env_name, str(env_path))

    settings = SimpleNamespace(
        quicktalk_asset_root=str(settings_root),
        quicktalk_model_root="",
        models_dir=str(tmp_path / "repo-models"),
    )

    resolved = resolve_quicktalk_asset_root(settings)

    assert resolved == settings_root.resolve()
    assert "conflicting QuickTalk asset roots" in caplog.text