fix quicktalk local assets and support QuickTalk on Apple Silicon (#98)

2026-07-03 15:22:34 +08:00 · 2026-06-12 16:41:50 +08:00
parent b6ffab2bb4
commit 5cdcd8dd3d
40 changed files with 1640 additions and 129 deletions
--- a/.env.example
+++ b/.env.example
@@ -20,7 +20,8 @@ OPENTALKING_EXPORTS_DIR=./data/exports
 # WebUI 默认展示的数字人模型；CLI --model 会覆盖该值。
 OPENTALKING_DEFAULT_MODEL=mock
 OPENTALKING_TORCH_DEVICE=auto
-OPENTALKING_FFMPEG_BIN=ffmpeg
+# Leave empty to auto-detect system ffmpeg, then fall back to imageio-ffmpeg.
+OPENTALKING_FFMPEG_BIN=
 OPENTALKING_DEFAULT_FPS=25
 OPENTALKING_TTS_SAMPLE_RATE=16000
 OPENTALKING_TTS_STREAMING_DECODE=1
@@ -204,11 +205,15 @@ OPENTALKING_TTS_EDGE_VOICE=zh-CN-XiaoxiaoNeural
 # OPENTALKING_DEFAULT_MODEL=quicktalk
 # OPENTALKING_QUICKTALK_BACKEND=local
 # OPENTALKING_QUICKTALK_ASSET_ROOT=./models/quicktalk
-# OPENTALKING_QUICKTALK_MODEL_ROOT=./models/quicktalk
 # OPENTALKING_QUICKTALK_MODEL_BACKEND=auto
 # OPENTALKING_QUICKTALK_DEVICE=cuda:0
 # OPENTALKING_QUICKTALK_HUBERT_DEVICE=
 # OPENTALKING_QUICKTALK_WORKER_CACHE=1
+# OPENTALKING_QUICKTALK_SLICE_LEN=28
+# OPENTALKING_QUICKTALK_SLICE_LEN defaults to 12 on Apple Silicon/MPS for lower per-chunk latency; Linux CUDA keeps 28.
+# Set OPENTALKING_QUICKTALK_FPS on Apple Silicon when smooth long-text playback matters more than 25fps motion.
+# Linux CUDA keeps the model-native 25fps when OPENTALKING_QUICKTALK_FPS is unset.
+# OPENTALKING_QUICKTALK_FPS=14
 # OPENTALKING_QUICKTALK_RENDER_CHUNK_MS=500
 # OPENTALKING_QUICKTALK_PREFETCH=1
 # OPENTALKING_PREWARM_AVATARS=
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,7 @@ data/
 # Runtime output & debug artifacts
 output/
 outputs/
+run/
 .run-video-clone/

 # Frontend build artifacts (regenerate with npm run build)
--- a/apps/api/routes/avatars.py
+++ b/apps/api/routes/avatars.py
@@ -22,6 +22,7 @@ from PIL import Image
 from opentalking.avatar import mouth_metadata
 from opentalking.avatar.loader import load_avatar_bundle
 from opentalking.avatar.validator import list_avatar_dirs
+from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
 from opentalking.models.registry import get_adapter
 from opentalking.providers.synthesis.backends import resolve_model_backend
 from opentalking.providers.synthesis.omnirt import auth_headers
@@ -333,17 +334,10 @@ async def _post_omnirt_json(settings: Any, path: str, payload: dict[str, Any]) -


 def _settings_quicktalk_model_root(settings: Any) -> Path:
-    raw = (
-        getattr(settings, "quicktalk_model_root", "")
-        or os.environ.get("OPENTALKING_QUICKTALK_MODEL_ROOT", "")
-        or os.environ.get("OMNIRT_QUICKTALK_MODEL_ROOT", "")
-    )
-    if raw:
-        return Path(str(raw)).expanduser().resolve()
-    omnirt_model_root = os.environ.get("OMNIRT_MODEL_ROOT", "").strip()
-    if omnirt_model_root:
-        return (Path(omnirt_model_root).expanduser().resolve() / "quicktalk").resolve()
-    return (Path(getattr(settings, "models_dir", "./models")) / "quicktalk").expanduser().resolve()
+    resolved = resolve_quicktalk_asset_root(settings)
+    if resolved is not None:
+        return resolved
+    return (Path("./models") / "quicktalk").expanduser().resolve()


 def _settings_int(settings: Any, name: str, env_name: str, default: int) -> int:
@@ -381,15 +375,16 @@ def _settings_optional_float(


 def _quicktalk_rebuild(settings: Any):
+    from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
    from opentalking.models.quicktalk.runtime_v2 import QuickTalkRebuild

    return QuickTalkRebuild(
        asset_root=_settings_quicktalk_model_root(settings),
-        device=str(
+        device=_configured_quicktalk_device(
            getattr(settings, "quicktalk_device", None)
-            or os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
-            or os.environ.get("OMNIRT_QUICKTALK_DEVICE")
-            or "cuda:0"
+            or os.environ.get("OMNIRT_QUICKTALK_DEVICE"),
+            getattr(settings, "torch_device", ""),
+            getattr(settings, "device", ""),
        ),
        hubert_device=(
            getattr(settings, "quicktalk_hubert_device", None)
@@ -409,19 +404,20 @@ def _quicktalk_rebuild(settings: Any):
 class _QuickTalkCacheBuilder:
    def __init__(self, settings: Any) -> None:
        import torch
+        from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
        from opentalking.models.quicktalk.runtime_v2 import ImageProcessor

        asset_root = _settings_quicktalk_model_root(settings)
        checkpoints = asset_root / "checkpoints"
        aux_min = checkpoints / "auxiliary_min"
        aux_root = aux_min if aux_min.exists() else (checkpoints / "auxiliary")
-        device = str(
+        device = _configured_quicktalk_device(
            getattr(settings, "quicktalk_face_cache_device", None)
            or getattr(settings, "quicktalk_device", None)
            or os.environ.get("OPENTALKING_QUICKTALK_FACE_CACHE_DEVICE")
-            or os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
-            or os.environ.get("OMNIRT_QUICKTALK_DEVICE")
-            or "cuda:0"
+            or os.environ.get("OMNIRT_QUICKTALK_DEVICE"),
+            getattr(settings, "torch_device", ""),
+            getattr(settings, "device", ""),
        )
        torch_device = torch.device(device)
        dtype = torch.float32
@@ -568,15 +564,14 @@ def _local_adapter_device(model: str, settings: Any) -> str:
            or "cuda"
        )
    if model == "quicktalk":
-        return str(
-            getattr(settings, "quicktalk_device", "")
-            or os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
-            or os.environ.get("OPENTALKING_TORCH_DEVICE")
-            or getattr(settings, "torch_device", "")
-            or getattr(settings, "device", "")
-            or os.environ.get("OPENTALKING_DEVICE")
-            or os.environ.get("DEVICE")
-            or "cuda:0"
+        from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
+
+        return _configured_quicktalk_device(
+            getattr(settings, "quicktalk_device", ""),
+            os.environ.get("OPENTALKING_DEVICE"),
+            os.environ.get("DEVICE"),
+            getattr(settings, "torch_device", ""),
+            getattr(settings, "device", ""),
        )
    return str(
        getattr(settings, "device", "")
--- a/apps/api/routes/health.py
+++ b/apps/api/routes/health.py
@@ -6,6 +6,7 @@ from typing import Any
 from fastapi import APIRouter, Request

 from opentalking.core.queue_status import get_flashtalk_queue_status
+from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
 from opentalking.providers.stt.factory import stt_enabled_providers, stt_provider_config, stt_status
 from opentalking.providers.tts.factory import tts_enabled_providers, tts_provider_config, tts_status

@@ -66,8 +67,14 @@ def _runtime_status_payload(request: Request) -> dict[str, Any]:
        getattr(settings, "llm_api_key", "") or ""
    ).strip()
    ignored_legacy_env = [name for name in _IGNORED_LEGACY_ENV if os.environ.get(name)]
-    quicktalk_backend = os.environ.get("OPENTALKING_QUICKTALK_BACKEND", "").strip()
-    quicktalk_device = os.environ.get("OPENTALKING_TORCH_DEVICE", "").strip()
+    quicktalk_backend = os.environ.get("OPENTALKING_QUICKTALK_BACKEND", "").strip() or str(
+        getattr(settings, "quicktalk_backend", "") or ""
+    ).strip()
+    quicktalk_device = os.environ.get("OPENTALKING_QUICKTALK_DEVICE", "").strip() or str(
+        getattr(settings, "quicktalk_device", "") or ""
+    ).strip()
+    quicktalk_asset_root_path = resolve_quicktalk_asset_root(settings)
+    quicktalk_asset_root = str(quicktalk_asset_root_path) if quicktalk_asset_root_path else ""
    return {
        "status": "ok",
        "llm_provider": os.environ.get("OPENTALKING_LLM_PROVIDER", "").strip()
@@ -97,7 +104,7 @@ def _runtime_status_payload(request: Request) -> dict[str, Any]:
        "default_model": str(getattr(settings, "default_model", "") or ""),
        "quicktalk_backend": quicktalk_backend,
        "quicktalk_device": quicktalk_device,
-        "quicktalk_asset_root": os.environ.get("OPENTALKING_QUICKTALK_ASSET_ROOT", "").strip(),
+        "quicktalk_asset_root": quicktalk_asset_root,
        "ignored_legacy_env": ignored_legacy_env,
    }

--- a/apps/api/tests/test_custom_avatars.py
+++ b/apps/api/tests/test_custom_avatars.py
@@ -75,6 +75,7 @@ def test_create_custom_avatar_adds_listed_asset_with_preview(tmp_path):


 def test_quicktalk_model_root_falls_back_to_omnirt_model_root(tmp_path, monkeypatch):
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT", raising=False)
    monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT", raising=False)
    monkeypatch.delenv("OMNIRT_QUICKTALK_MODEL_ROOT", raising=False)
    monkeypatch.setenv("OMNIRT_MODEL_ROOT", str(tmp_path / "shared-models"))
@@ -86,6 +87,35 @@ def test_quicktalk_model_root_falls_back_to_omnirt_model_root(tmp_path, monkeypa
    ).resolve()


+def test_quicktalk_model_root_prefers_asset_root_setting_and_env(tmp_path, monkeypatch):
+    env_asset_root = tmp_path / "env-quicktalk"
+    setting_asset_root = tmp_path / "settings-quicktalk"
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", str(env_asset_root))
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", str(tmp_path / "legacy-env-root"))
+    monkeypatch.setenv("OMNIRT_QUICKTALK_MODEL_ROOT", str(tmp_path / "omnirt-env-root"))
+    monkeypatch.setenv("OMNIRT_MODEL_ROOT", str(tmp_path / "shared-models"))
+
+    settings = SimpleNamespace(
+        models_dir=str(tmp_path / "repo-models"),
+        quicktalk_asset_root=str(setting_asset_root),
+        quicktalk_model_root=str(tmp_path / "legacy-settings-root"),
+    )
+
+    assert avatars._settings_quicktalk_model_root(settings) == setting_asset_root.resolve()
+
+    settings.quicktalk_asset_root = ""
+    assert avatars._settings_quicktalk_model_root(settings) == env_asset_root.resolve()
+
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT")
+    assert avatars._settings_quicktalk_model_root(settings) == (
+        tmp_path / "legacy-settings-root"
+    ).resolve()
+    settings.quicktalk_model_root = ""
+    assert avatars._settings_quicktalk_model_root(settings) == (
+        tmp_path / "legacy-env-root"
+    ).resolve()
+
+
 def test_quicktalk_avatar_prewarm_uses_full_video_by_default(
    tmp_path,
    monkeypatch,
@@ -891,6 +921,91 @@ def test_quicktalk_avatar_prewarm_uses_local_adapter_when_backend_is_local(tmp_p
    assert calls[1][1] == str(avatar)


+def test_wav2lip_avatar_can_prewarm_quicktalk_with_asset_root_setting(tmp_path, monkeypatch):
+    quicktalk_asset_root = tmp_path / "quicktalk-assets"
+    avatar = tmp_path / "wav-avatar"
+    avatar.mkdir()
+    (avatar / "reference.png").write_bytes(_png_bytes((16, 24)))
+    (avatar / "manifest.json").write_text(
+        json.dumps(
+            {
+                "id": "wav-avatar",
+                "name": "Wav Avatar",
+                "model_type": "wav2lip",
+                "fps": 25,
+                "sample_rate": 16000,
+                "width": 16,
+                "height": 24,
+                "version": "1.0",
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    prepared_asset_roots: list[Path] = []
+
+    def fake_prepare_quicktalk_asset(**kwargs):
+        rebuild = kwargs["rebuild"]
+        prepared_asset_roots.append(rebuild.asset_root)
+        return avatars.PreparedAssetResult(
+            avatar_id="wav-avatar",
+            status="generated",
+            source_mode="image",
+            template_path=avatar / "quicktalk" / "template_16x24.mp4",
+            cache_path=avatar / "quicktalk" / "face_cache_v3_16x24.npz",
+            frames=1,
+        )
+
+    class FakeAdapter:
+        def load_model(self, device="cuda"):
+            del device
+
+        def load_avatar(self, avatar_path):
+            return {"avatar_path": avatar_path}
+
+        def warmup(self, avatar_state):
+            del avatar_state
+
+    async def fail_omnirt(settings, path, payload):
+        del settings, path, payload
+        raise AssertionError("local prewarm must not call OmniRT")
+
+    monkeypatch.setattr(avatars, "_prepare_quicktalk_asset", fake_prepare_quicktalk_asset)
+    monkeypatch.setattr(
+        avatars,
+        "_quicktalk_cache_builder",
+        lambda settings: SimpleNamespace(
+            asset_root=avatars._settings_quicktalk_model_root(settings)
+        ),
+    )
+    monkeypatch.setattr(avatars, "_quicktalk_cache_hit_result", lambda *args, **kwargs: None)
+    monkeypatch.setattr(avatars, "_post_omnirt_json", fail_omnirt)
+    monkeypatch.setattr(avatars, "resolve_model_backend", lambda model, settings: SimpleNamespace(backend="local"))
+    monkeypatch.setattr(avatars, "get_adapter", lambda model: FakeAdapter())
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", str(tmp_path / "wrong-legacy-root"))
+
+    app = FastAPI()
+    app.state.settings = SimpleNamespace(
+        avatars_dir=str(tmp_path),
+        models_dir=str(tmp_path / "wrong-models-dir"),
+        quicktalk_asset_root=str(quicktalk_asset_root),
+        quicktalk_model_root="",
+        quicktalk_device="cpu",
+        device="cpu",
+    )
+    app.include_router(avatars.router)
+    client = TestClient(app)
+
+    response = client.post("/avatars/wav-avatar/prewarm", json={"model": "quicktalk"})
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["status"] == "ready"
+    assert payload["cache"]["model"] == "quicktalk"
+    assert payload["runtime"]["type"] == "local_prewarm_result"
+    assert prepared_asset_roots == [quicktalk_asset_root.resolve()]
+
+
 def test_video_avatar_exposes_preview_video(tmp_path):
    base = tmp_path / "video-avatar"
    base.mkdir()
--- a/apps/api/tests/test_health.py
+++ b/apps/api/tests/test_health.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+from apps.api.routes.health import _runtime_status_payload
+
+
+def test_health_reports_resolved_quicktalk_asset_root(tmp_path, monkeypatch) -> None:
+    settings_asset_root = tmp_path / "settings-assets"
+    env_asset_root = tmp_path / "env-assets"
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", str(env_asset_root))
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", str(tmp_path / "legacy-root"))
+
+    request = SimpleNamespace(
+        app=SimpleNamespace(
+            state=SimpleNamespace(
+                settings=SimpleNamespace(
+                    quicktalk_asset_root=str(settings_asset_root),
+                    quicktalk_model_root="",
+                    models_dir=str(tmp_path / "models"),
+                    llm_api_key="",
+                    llm_provider="openai_compatible",
+                    llm_model="",
+                    default_model="quicktalk",
+                    quicktalk_backend="local",
+                    quicktalk_device="mps",
+                )
+            )
+        )
+    )
+
+    payload = _runtime_status_payload(request)
+
+    assert payload["quicktalk_asset_root"] == str(settings_asset_root.resolve())
--- a/apps/api/tests/test_models.py
+++ b/apps/api/tests/test_models.py
@@ -15,6 +15,16 @@ from opentalking.providers.synthesis.availability import (
 )


+def _write_quicktalk_local_assets(asset_root) -> None:
+    checkpoints = asset_root / "checkpoints"
+    (checkpoints / "chinese-hubert-large").mkdir(parents=True)
+    (checkpoints / "auxiliary" / "models" / "buffalo_l").mkdir(parents=True)
+    (checkpoints / "quicktalk.pth").write_bytes(b"pth")
+    (checkpoints / "repair.npy").write_bytes(b"repair")
+    (checkpoints / "chinese-hubert-large" / "pytorch_model.bin").write_bytes(b"hubert")
+    (checkpoints / "auxiliary" / "models" / "buffalo_l" / "det_10g.onnx").write_bytes(b"onnx")
+
+
 def test_models_route_lists_all_models_with_connection_status_without_omnirt(monkeypatch) -> None:
    monkeypatch.delenv("OPENTALKING_QUICKTALK_BACKEND", raising=False)
    monkeypatch.delenv("OPENTALKING_CONFIG_FILE", raising=False)
@@ -98,6 +108,25 @@ def test_settings_loads_default_model_from_environment(monkeypatch, tmp_path) ->
    assert settings.default_model == "quicktalk"


+def test_settings_loads_quicktalk_local_fields_from_environment(monkeypatch, tmp_path) -> None:
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", str(tmp_path / "models" / "quicktalk"))
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_BACKEND", "local")
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_BACKEND", "auto")
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_DEVICE", "mps")
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_SLICE_LEN", "12")
+    monkeypatch.delenv("OPENTALKING_CONFIG_FILE", raising=False)
+    monkeypatch.delenv("CONFIG_FILE", raising=False)
+    monkeypatch.chdir(tmp_path)
+
+    settings = Settings(_env_file=None)
+
+    assert settings.quicktalk_asset_root == str(tmp_path / "models" / "quicktalk")
+    assert settings.quicktalk_backend == "local"
+    assert settings.quicktalk_model_backend == "auto"
+    assert settings.quicktalk_device == "mps"
+    assert settings.quicktalk_slice_len == 12
+
+
 def test_settings_loads_default_model_from_yaml_model_section(monkeypatch, tmp_path) -> None:
    config_file = tmp_path / "opentalking.yaml"
    config_file.write_text(
@@ -169,7 +198,12 @@ def test_omnirt_endpoint_defaults_to_audio2video_routes() -> None:
    assert resolve_synthesis_ws_url("flashtalk", settings) == "ws://127.0.0.1:9000/v1/audio2video/flashtalk"


-async def test_omnirt_status_keeps_local_backend_local(monkeypatch) -> None:
+async def test_omnirt_status_keeps_local_backend_local(monkeypatch, tmp_path) -> None:
+    monkeypatch.chdir(tmp_path)
+    monkeypatch.delenv("OPENTALKING_CONFIG_FILE", raising=False)
+    monkeypatch.delenv("CONFIG_FILE", raising=False)
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_BACKEND", "omnirt")
+    clear_model_config_cache()
    monkeypatch.setattr(
        "opentalking.models.wav2lip.adapter.Wav2LipAdapter.runtime_available",
        staticmethod(lambda: True),
@@ -200,6 +234,36 @@ async def test_omnirt_status_keeps_local_backend_local(monkeypatch) -> None:
    assert statuses["quicktalk"].backend == "omnirt"
    assert statuses["quicktalk"].connected is True
    assert statuses["quicktalk"].reason == "omnirt"
+    clear_model_config_cache()
+
+
+async def test_models_status_uses_settings_for_local_quicktalk_assets(
+    tmp_path,
+    monkeypatch,
+) -> None:
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT", raising=False)
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT", raising=False)
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_BACKEND", "local")
+    asset_root = tmp_path / "models" / "quicktalk"
+    _write_quicktalk_local_assets(asset_root)
+
+    settings = SimpleNamespace(
+        omnirt_endpoint="",
+        flashtalk_ws_url="",
+        flashhead_ws_url="",
+        quicktalk_asset_root=str(asset_root),
+        quicktalk_model_root="",
+        quicktalk_device="mps",
+        quicktalk_hubert_device="",
+        torch_device="auto",
+        device="",
+    )
+
+    statuses = {status.id: status for status in await resolve_model_statuses(settings)}
+
+    assert statuses["quicktalk"].backend == "local"
+    assert statuses["quicktalk"].connected is True
+    assert statuses["quicktalk"].reason == "local_runtime"


 async def test_omnirt_endpoint_only_affects_omnirt_backend(tmp_path, monkeypatch) -> None:
--- a/apps/api/tests/test_openai_compatible_audio.py
+++ b/apps/api/tests/test_openai_compatible_audio.py
@@ -75,7 +75,11 @@ def test_tts_openai_compatible_posts_audio_speech(monkeypatch: pytest.MonkeyPatc
    monkeypatch.setenv("OPENTALKING_TTS_OPENAI_VOICE", "neutral-test")
    monkeypatch.setenv("OPENTALKING_TTS_OPENAI_PROTOCOL", "audio_speech")

-    tts = build_tts_adapter(sample_rate=16000, chunk_ms=20.0)
+    tts = build_tts_adapter(
+        sample_rate=16000,
+        chunk_ms=20.0,
+        tts_provider="openai_compatible",
+    )
    chunks = asyncio.run(_collect_tts_chunks(tts, "你好，开始测试。"))

    assert chunks
--- a/apps/cli/prepare_cache.py
+++ b/apps/cli/prepare_cache.py
@@ -515,7 +515,8 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    )
    parser.add_argument("--avatars-root", type=Path, required=True)
    parser.add_argument("--avatar", action="append", default=[], help="Avatar id to process.")
-    parser.add_argument("--quicktalk-model-root", type=Path)
+    parser.add_argument("--quicktalk-asset-root", type=Path)
+    parser.add_argument("--quicktalk-model-root", type=Path, help=argparse.SUPPRESS)
    parser.add_argument("--wav2lip-model-root", type=Path)
    parser.add_argument("--wav2lip-face-det-device")
    parser.add_argument("--wav2lip-max-reference-frames", type=int, default=125)
@@ -537,13 +538,14 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
 def main(argv: Sequence[str] | None = None) -> int:
    args = parse_args(argv)
    avatars_root = args.avatars_root.expanduser().resolve()
-    if "quicktalk" in args.model and args.quicktalk_model_root is None:
-        raise SystemExit("--quicktalk-model-root is required when --model quicktalk is selected")
+    quicktalk_asset_root = args.quicktalk_asset_root or args.quicktalk_model_root
+    if "quicktalk" in args.model and quicktalk_asset_root is None:
+        raise SystemExit("--quicktalk-asset-root is required when --model quicktalk is selected")
    rebuild = None
    if "quicktalk" in args.model:
        from opentalking.models.quicktalk.runtime_v2 import QuickTalkRebuild

-        quicktalk_root = args.quicktalk_model_root.expanduser().resolve()
+        quicktalk_root = quicktalk_asset_root.expanduser().resolve()
        rebuild = QuickTalkRebuild(
            asset_root=quicktalk_root,
            device=args.device,
--- a/apps/unified/main.py
+++ b/apps/unified/main.py
@@ -107,11 +107,9 @@ def _adapter_device(model_type: str, default_device: str) -> str:
    if model_type == "wav2lip":
        return os.environ.get("OPENTALKING_WAV2LIP_DEVICE") or default_device
    if model_type == "quicktalk":
-        return (
-            os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
-            or os.environ.get("OPENTALKING_TORCH_DEVICE")
-            or default_device
-        )
+        from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
+
+        return _configured_quicktalk_device(default_device)
    return default_device


--- a/docs/en/model-deployment/quicktalk.md
+++ b/docs/en/model-deployment/quicktalk.md
@@ -30,7 +30,7 @@ The local adapter reads an asset root that contains `checkpoints/`:
 ```text
 $OPENTALKING_QUICKTALK_ASSET_ROOT/
  checkpoints/
-    quicktalk.pth or 256.onnx
+    quicktalk.pth
    repair.npy
    chinese-hubert-large/
      pytorch_model.bin
--- a/docs/en/model-deployment/quicktalk/apple-silicon.md
+++ b/docs/en/model-deployment/quicktalk/apple-silicon.md
@@ -0,0 +1,210 @@
+# QuickTalk on Apple Silicon
+
+This page is for running QuickTalk locally on Apple Silicon macOS. It is intended for development, demos, and integration checks. For stable realtime 25fps output, use the Linux CUDA path in [QuickTalk Local Deployment](local.md) or run QuickTalk behind OmniRT.
+
+## 1. Install Dependencies
+
+```bash title="Terminal"
+brew install python@3.11 node uv
+
+# Optional. OpenTalking can fall back to imageio-ffmpeg when this is absent.
+brew install ffmpeg
+```
+
+Clone OpenTalking and create the environment with the CPU/macOS extra:
+
+```bash title="Terminal"
+git clone https://github.com/OpenTalker/opentalking.git
+cd opentalking
+
+export UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
+export PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
+export UV_HTTP_TIMEOUT=300
+export UV_LINK_MODE=copy
+
+uv sync --extra dev --extra models --extra quicktalk-cpu --python 3.11
+source .venv/bin/activate
+```
+
+Do not install `quicktalk-cuda` on Apple Silicon. `onnxruntime-gpu` does not provide a macOS arm64 wheel.
+
+## 2. Download QuickTalk Assets
+
+Download the QuickTalk weights and HuBERT files:
+
+```bash title="Terminal"
+mkdir -p models/quicktalk/checkpoints
+
+hf download datascale-ai/quicktalk \
+  quicktalk.pth \
+  repair.npy \
+  chinese-hubert-large/config.json \
+  chinese-hubert-large/preprocessor_config.json \
+  chinese-hubert-large/pytorch_model.bin \
+  --local-dir models/quicktalk/checkpoints
+```
+
+Download InsightFace `buffalo_l` into the QuickTalk auxiliary directory:
+
+```bash title="Terminal"
+mkdir -p /tmp/opentalking-insightface \
+  models/quicktalk/checkpoints/auxiliary/models/buffalo_l
+
+curl -L \
+  -o /tmp/opentalking-insightface/buffalo_l.zip \
+  https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip
+
+unzip -q -o /tmp/opentalking-insightface/buffalo_l.zip \
+  -d /tmp/opentalking-insightface
+rsync -a /tmp/opentalking-insightface/buffalo_l/ \
+  models/quicktalk/checkpoints/auxiliary/models/buffalo_l/
+```
+
+The final layout should be:
+
+```text
+models/quicktalk/
+  checkpoints/
+    quicktalk.pth
+    repair.npy
+    chinese-hubert-large/
+      config.json
+      preprocessor_config.json
+      pytorch_model.bin
+    auxiliary/models/buffalo_l/
+      *.onnx
+```
+
+Check the required files:
+
+```bash title="Terminal"
+stat models/quicktalk/checkpoints/quicktalk.pth
+stat models/quicktalk/checkpoints/repair.npy
+stat models/quicktalk/checkpoints/chinese-hubert-large/pytorch_model.bin
+stat models/quicktalk/checkpoints/auxiliary/models/buffalo_l/det_10g.onnx
+```
+
+## 3. Configure `.env`
+
+Create `.env` if it does not exist:
+
+```bash title="Terminal"
+cp .env.example .env
+```
+
+Set these values:
+
+```env title=".env"
+OPENTALKING_DEFAULT_MODEL=quicktalk
+OPENTALKING_FFMPEG_BIN=
+OPENTALKING_QUICKTALK_BACKEND=local
+OPENTALKING_QUICKTALK_ASSET_ROOT=./models/quicktalk
+OPENTALKING_QUICKTALK_MODEL_BACKEND=auto
+OPENTALKING_QUICKTALK_WORKER_CACHE=1
+
+# Optional. If unset, OpenTalking selects mps when PyTorch MPS is available,
+# then falls back to cpu.
+OPENTALKING_QUICKTALK_DEVICE=mps
+
+# Apple Silicon default. Keep 12 so each generated chunk has enough audio budget.
+OPENTALKING_QUICKTALK_SLICE_LEN=12
+
+# Optional for long text. This lowers output cadence from model-native 25fps
+# to 14fps so MPS generation can stay closer to playback.
+OPENTALKING_QUICKTALK_FPS=14
+```
+
+Leaving `OPENTALKING_FFMPEG_BIN=` empty lets OpenTalking find system `ffmpeg` first and fall back to `imageio-ffmpeg`.
+
+## 4. Check the Environment
+
+```bash title="Terminal"
+python - <<'PY'
+from pathlib import Path
+import torch
+import onnxruntime as ort
+from opentalking.models.quicktalk.runtime_v2 import ensure_ffmpeg
+
+root = Path("models/quicktalk/checkpoints")
+for path in [
+    root / "quicktalk.pth",
+    root / "repair.npy",
+    root / "chinese-hubert-large/pytorch_model.bin",
+    root / "auxiliary/models/buffalo_l/det_10g.onnx",
+]:
+    print(path, path.exists())
+print("mps:", torch.backends.mps.is_available())
+print("onnxruntime providers:", ort.get_available_providers())
+print("ffmpeg:", ensure_ffmpeg())
+PY
+```
+
+Every file path should be printed with `True` next to it. `mps` should be `True` on a healthy Apple Silicon PyTorch install, though OpenTalking can fall back to CPU.
+
+## 5. Start OpenTalking
+
+```bash title="Terminal"
+bash scripts/start_unified.sh \
+  --backend local \
+  --model quicktalk \
+  --api-port 8210 \
+  --web-port 5280
+```
+
+Open `http://127.0.0.1:5280`, choose a front-facing avatar such as the built-in `singer`, and select `quicktalk`. The first run builds the avatar cache; later runs reuse it.
+
+## 6. Verify the Realtime Digital Human Path
+
+```bash title="Terminal"
+curl -s http://127.0.0.1:8210/health | python -m json.tool
+curl -s http://127.0.0.1:8210/models | python -m json.tool
+```
+
+The QuickTalk model should report `connected: true` with reason `local_runtime`.
+
+Create a session and send a short sentence:
+
+```bash title="Terminal"
+curl -s -X POST http://127.0.0.1:8210/sessions \
+  -H 'Content-Type: application/json' \
+  -d '{"avatar_id":"singer","model":"quicktalk","tts_provider":"edge"}' \
+  | tee /tmp/opentalking-session.json | python -m json.tool
+
+sid=$(python - <<'PY'
+import json
+print(json.load(open("/tmp/opentalking-session.json"))["session_id"])
+PY
+)
+
+curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/start" \
+  -H 'Content-Type: application/json' \
+  -d '{}' | python -m json.tool
+
+curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/speak" \
+  -H 'Content-Type: application/json' \
+  -d '{"text":"Please confirm in one sentence that QuickTalk is running locally on this Mac.","tts_provider":"edge"}' \
+  | python -m json.tool
+```
+
+When the session state returns from `speaking` to `ready`, and the WebUI shows generated audio and video frames for the selected avatar, the local realtime digital human path is working.
+
+## Performance Notes
+
+Apple Silicon can run the local path, but it is not the recommended realtime production target. If long text stalls, try:
+
+```env title=".env"
+OPENTALKING_QUICKTALK_SLICE_LEN=12
+OPENTALKING_QUICKTALK_FPS=14
+OPENTALKING_QUICKTALK_MAX_LONG_EDGE=720
+```
+
+This trades motion FPS or image size for smoother playback. Use Linux CUDA or OmniRT when stable 25fps realtime output matters.
+
+## Troubleshooting
+
+| Symptom | Fix |
+| --- | --- |
+| `onnxruntime-gpu` fails to install | Use `quicktalk-cpu`; do not install `quicktalk-cuda` on Apple Silicon. |
+| `ffmpeg` is missing | Keep `OPENTALKING_FFMPEG_BIN=` empty, or run `brew install ffmpeg`. |
+| MPS shows an SVD CPU fallback warning | This is a PyTorch MPS operator coverage limitation. It can affect speed but usually does not block execution. |
+| First startup is slow | The first run loads HuBERT, QuickTalk, and the avatar face cache. Reusing the same avatar is faster. |
--- a/docs/en/model-deployment/quicktalk/local.md
+++ b/docs/en/model-deployment/quicktalk/local.md
@@ -17,7 +17,7 @@ uv sync --extra dev --extra models --extra quicktalk-cuda --python 3.11
 source .venv/bin/activate
 ```

-Prepare a QuickTalk local asset root that contains `checkpoints/quicktalk.pth` or `checkpoints/256.onnx`, `checkpoints/repair.npy`, HuBERT files, and InsightFace assets.
+Prepare a QuickTalk local asset root that contains `checkpoints/quicktalk.pth`, `checkpoints/repair.npy`, HuBERT files, and InsightFace assets.

 The avatar does not need to start as `model_type=quicktalk`. OpenTalking decouples avatar selection from model selection: if an avatar has `metadata.source_video`, `metadata.source_image`, `reference.png`, or `preview.png`, QuickTalk prewarm can generate the template video and face cache it needs. Dedicated QuickTalk avatars can still declare `metadata.quicktalk.template_video` explicitly.

--- a/docs/en/quick-start/platform-notes.md
+++ b/docs/en/quick-start/platform-notes.md
@@ -6,7 +6,7 @@ This page explains the recommended ways to run OpenTalking on different system e

 | Platform | Recommended Use | Available Paths | Notes |
 | --- | --- | --- | --- |
-| macOS | Docs, frontend, API, Mock validation | `mock` | Good for quick trials, not recommended as a real model inference environment. |
+| macOS | Docs, frontend, API, Mock validation; experimental QuickTalk local on Apple Silicon | `mock`, experimental `quicktalk` local | Good for quick trials. See [QuickTalk on Apple Silicon](../model-deployment/quicktalk/apple-silicon.md). Stable realtime output is still recommended on Linux GPU. |
 | Linux + CUDA | Real model validation and deployment | `mock`, `quicktalk`, `wav2lip`, `musetalk`, `omnirt` | Primary recommended environment. |
 | Linux + Ascend NPU | Private deployment and NPU evaluation | `mock`, selected OmniRT / FlashTalk paths | Requires CANN, driver, and `torch_npu`. |

@@ -21,9 +21,15 @@ brew install python@3.11 node ffmpeg
 uv sync --extra dev --python 3.11
 ```

-### Not suitable for real digital-human models
+### Experimental QuickTalk local on Apple Silicon

-QuickTalk, MuseTalk, FlashTalk, and similar models mainly target CUDA GPUs or dedicated inference services. Even if some Python dependencies can be installed on macOS, it is not recommended as the real video-generation path. Deploy models on a Linux GPU machine and connect OpenTalking to the remote inference service instead.
+Apple Silicon can run QuickTalk local with the `quicktalk-cpu` dependency extra for development, demos, and integration checks. The full setup steps are documented in [QuickTalk on Apple Silicon](../model-deployment/quicktalk/apple-silicon.md).
+
+This path uses PyTorch MPS when available and falls back to CPU. It is not the recommended production realtime path; use Linux CUDA or OmniRT when stable 25fps output matters.
+
+### Other real digital-human models
+
+MuseTalk, FlashTalk, and similar production paths mainly target CUDA GPUs, Ascend NPUs, or dedicated inference services. Deploy those models on a Linux GPU/NPU machine and connect OpenTalking to the remote inference service.

 ### ffmpeg Installation

--- a/docs/en/tutorials/installation.md
+++ b/docs/en/tutorials/installation.md
@@ -24,7 +24,7 @@ streamlined first-run procedure, see the [Quickstart](quickstart.md).

 | Platform | Synthesis backends | Notes |
 |----------|-------------------|-------|
-| macOS (Apple Silicon and Intel) | `mock` | Suitable for orchestration and frontend development. Real talking-head models are not supported on macOS. |
+| macOS (Apple Silicon and Intel) | `mock`, experimental `quicktalk` local on Apple Silicon | Suitable for orchestration and frontend development. QuickTalk local can be tested on Apple Silicon with `quicktalk-cpu`; see [QuickTalk on Apple Silicon](../model-deployment/quicktalk/apple-silicon.md) for the full path. Realtime production paths still target Linux GPU/NPU or OmniRT. |
 | Linux x86_64 + CUDA 12 | `mock`, `wav2lip`, `musetalk`, `flashtalk`, `flashhead`, `quicktalk` | Primary deployment target. |
 | Linux aarch64 + Ascend 910B (CANN 8.0+) | `mock`, `wav2lip`, `flashtalk` | NPU production deployment path. |
 | Windows | `mock` (WSL2 recommended) | Not part of the continuous integration matrix. |
--- a/docs/zh/model-deployment/quicktalk.md
+++ b/docs/zh/model-deployment/quicktalk.md
@@ -35,7 +35,7 @@ $OMNIRT_MODEL_ROOT/quicktalk/          # OmniRT 默认读取
 ```text
 $OPENTALKING_QUICKTALK_ASSET_ROOT/    # local adapter 默认读取
  checkpoints/
-    quicktalk.pth 或 256.onnx
+    quicktalk.pth
    repair.npy
    chinese-hubert-large/
      pytorch_model.bin
--- a/docs/zh/model-deployment/quicktalk/apple-silicon.md
+++ b/docs/zh/model-deployment/quicktalk/apple-silicon.md
@@ -0,0 +1,210 @@
+# Apple Silicon 上运行 QuickTalk
+
+本页介绍如何在 Apple Silicon macOS 上本地运行 QuickTalk。该路径适合开发、演示和集成验证；如果需要稳定 25fps 实时输出，仍建议使用 [QuickTalk Local 单机部署](local.md) 中的 Linux CUDA 路径，或把 QuickTalk 放到 OmniRT 后面运行。
+
+## 1. 安装依赖
+
+```bash title="终端"
+brew install python@3.11 node uv
+
+# 可选。不安装时 OpenTalking 可以回退到 imageio-ffmpeg。
+brew install ffmpeg
+```
+
+拉取 OpenTalking，并使用 CPU/macOS extra 创建环境：
+
+```bash title="终端"
+git clone https://github.com/OpenTalker/opentalking.git
+cd opentalking
+
+export UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
+export PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
+export UV_HTTP_TIMEOUT=300
+export UV_LINK_MODE=copy
+
+uv sync --extra dev --extra models --extra quicktalk-cpu --python 3.11
+source .venv/bin/activate
+```
+
+不要在 Apple Silicon 上安装 `quicktalk-cuda`。`onnxruntime-gpu` 没有 macOS arm64 wheel。
+
+## 2. 下载 QuickTalk 资产
+
+下载 QuickTalk 权重和 HuBERT 文件：
+
+```bash title="终端"
+mkdir -p models/quicktalk/checkpoints
+
+hf download datascale-ai/quicktalk \
+  quicktalk.pth \
+  repair.npy \
+  chinese-hubert-large/config.json \
+  chinese-hubert-large/preprocessor_config.json \
+  chinese-hubert-large/pytorch_model.bin \
+  --local-dir models/quicktalk/checkpoints
+```
+
+下载 InsightFace `buffalo_l` 到 QuickTalk auxiliary 目录：
+
+```bash title="终端"
+mkdir -p /tmp/opentalking-insightface \
+  models/quicktalk/checkpoints/auxiliary/models/buffalo_l
+
+curl -L \
+  -o /tmp/opentalking-insightface/buffalo_l.zip \
+  https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip
+
+unzip -q -o /tmp/opentalking-insightface/buffalo_l.zip \
+  -d /tmp/opentalking-insightface
+rsync -a /tmp/opentalking-insightface/buffalo_l/ \
+  models/quicktalk/checkpoints/auxiliary/models/buffalo_l/
+```
+
+最终目录应为：
+
+```text
+models/quicktalk/
+  checkpoints/
+    quicktalk.pth
+    repair.npy
+    chinese-hubert-large/
+      config.json
+      preprocessor_config.json
+      pytorch_model.bin
+    auxiliary/models/buffalo_l/
+      *.onnx
+```
+
+检查必需文件：
+
+```bash title="终端"
+stat models/quicktalk/checkpoints/quicktalk.pth
+stat models/quicktalk/checkpoints/repair.npy
+stat models/quicktalk/checkpoints/chinese-hubert-large/pytorch_model.bin
+stat models/quicktalk/checkpoints/auxiliary/models/buffalo_l/det_10g.onnx
+```
+
+## 3. 配置 `.env`
+
+如果还没有 `.env`，先创建：
+
+```bash title="终端"
+cp .env.example .env
+```
+
+设置这些值：
+
+```env title=".env"
+OPENTALKING_DEFAULT_MODEL=quicktalk
+OPENTALKING_FFMPEG_BIN=
+OPENTALKING_QUICKTALK_BACKEND=local
+OPENTALKING_QUICKTALK_ASSET_ROOT=./models/quicktalk
+OPENTALKING_QUICKTALK_MODEL_BACKEND=auto
+OPENTALKING_QUICKTALK_WORKER_CACHE=1
+
+# 可选。不设置时 OpenTalking 会在 PyTorch MPS 可用时选择 mps，
+# 否则回退 cpu。
+OPENTALKING_QUICKTALK_DEVICE=mps
+
+# Apple Silicon 默认值。保持 12，让每个生成 chunk 有足够音频预算。
+OPENTALKING_QUICKTALK_SLICE_LEN=12
+
+# 长文本可选。把输出从模型原生 25fps 降到 14fps，
+# 让 MPS 生成速度更接近播放速度。
+OPENTALKING_QUICKTALK_FPS=14
+```
+
+`OPENTALKING_FFMPEG_BIN=` 保持为空时，OpenTalking 会先找系统 `ffmpeg`，找不到再回退到 `imageio-ffmpeg`。
+
+## 4. 检查本地环境
+
+```bash title="终端"
+python - <<'PY'
+from pathlib import Path
+import torch
+import onnxruntime as ort
+from opentalking.models.quicktalk.runtime_v2 import ensure_ffmpeg
+
+root = Path("models/quicktalk/checkpoints")
+for path in [
+    root / "quicktalk.pth",
+    root / "repair.npy",
+    root / "chinese-hubert-large/pytorch_model.bin",
+    root / "auxiliary/models/buffalo_l/det_10g.onnx",
+]:
+    print(path, path.exists())
+print("mps:", torch.backends.mps.is_available())
+print("onnxruntime providers:", ort.get_available_providers())
+print("ffmpeg:", ensure_ffmpeg())
+PY
+```
+
+每个文件路径都应该输出 `True`。健康的 Apple Silicon PyTorch 环境里 `mps` 应该是 `True`；如果不可用，OpenTalking 可以回退到 CPU。
+
+## 5. 启动 OpenTalking
+
+```bash title="终端"
+bash scripts/start_unified.sh \
+  --backend local \
+  --model quicktalk \
+  --api-port 8210 \
+  --web-port 5280
+```
+
+打开 `http://127.0.0.1:5280`，选择正脸清晰的 avatar，例如内置 `singer`，模型选择 `quicktalk`。首次运行会构建 avatar cache，后续可复用。
+
+## 6. 验证实时数字人链路
+
+```bash title="终端"
+curl -s http://127.0.0.1:8210/health | python -m json.tool
+curl -s http://127.0.0.1:8210/models | python -m json.tool
+```
+
+QuickTalk 模型应返回 `connected: true`，原因是 `local_runtime`。
+
+创建会话并发送一句短文本：
+
+```bash title="终端"
+curl -s -X POST http://127.0.0.1:8210/sessions \
+  -H 'Content-Type: application/json' \
+  -d '{"avatar_id":"singer","model":"quicktalk","tts_provider":"edge"}' \
+  | tee /tmp/opentalking-session.json | python -m json.tool
+
+sid=$(python - <<'PY'
+import json
+print(json.load(open("/tmp/opentalking-session.json"))["session_id"])
+PY
+)
+
+curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/start" \
+  -H 'Content-Type: application/json' \
+  -d '{}' | python -m json.tool
+
+curl -s -X POST "http://127.0.0.1:8210/sessions/$sid/speak" \
+  -H 'Content-Type: application/json' \
+  -d '{"text":"请用一句话确认 QuickTalk 已在 Mac 本地运行。","tts_provider":"edge"}' \
+  | python -m json.tool
+```
+
+当 session 状态从 `speaking` 回到 `ready`，且在 WebUI 中能听到所选 avatar 的语音并看到生成的视频帧，就表示本地实时数字人链路已经跑通。
+
+## 性能说明
+
+Apple Silicon 可以跑通本地链路，但不是推荐的实时生产目标。如果长文本卡顿，优先尝试：
+
+```env title=".env"
+OPENTALKING_QUICKTALK_SLICE_LEN=12
+OPENTALKING_QUICKTALK_FPS=14
+OPENTALKING_QUICKTALK_MAX_LONG_EDGE=720
+```
+
+这会用动作帧率或画面尺寸换取更顺滑的播放。需要稳定 25fps 实时输出时，请使用 Linux CUDA 或 OmniRT。
+
+## 常见问题
+
+| 现象 | 处理 |
+| --- | --- |
+| `onnxruntime-gpu` 安装失败 | Apple Silicon 使用 `quicktalk-cpu`，不要安装 `quicktalk-cuda`。 |
+| `ffmpeg` 找不到 | `.env` 中保持 `OPENTALKING_FFMPEG_BIN=`，或运行 `brew install ffmpeg`。 |
+| MPS 出现 SVD CPU fallback 警告 | 属于 PyTorch MPS 的算子覆盖限制，可能影响速度，但通常不阻塞运行。 |
+| 首次启动很慢 | 首次会加载 HuBERT、QuickTalk 和 avatar face cache；同一 avatar 后续会更快。 |
--- a/docs/zh/model-deployment/quicktalk/local.md
+++ b/docs/zh/model-deployment/quicktalk/local.md
@@ -26,7 +26,7 @@ local adapter 的资产根必须包含 `checkpoints/` 目录。推荐放在仓
 ```text
 models/quicktalk/
  checkpoints/
-    quicktalk.pth 或 256.onnx
+    quicktalk.pth
    repair.npy
    chinese-hubert-large/
      pytorch_model.bin
@@ -93,7 +93,7 @@ cd "$OPENTALKING_HOME"
 opentalking-prepare-cache \
  --model quicktalk \
  --avatars-root examples/avatars \
-  --quicktalk-model-root models/quicktalk \
+  --quicktalk-asset-root models/quicktalk \
  --device cuda:0 \
  --model-backend pth \
  --verify
--- a/docs/zh/model-support/models/quicktalk.md
+++ b/docs/zh/model-support/models/quicktalk.md
@@ -10,7 +10,7 @@ QuickTalk 适合快速实时口播、低延迟验证和从图片快速生成数

 - 推荐 NVIDIA GPU。
 - QuickTalk 本地资产目录需要包含 `checkpoints/`。
- 至少需要 `quicktalk.pth` 或 `256.onnx`、`repair.npy`、`chinese-hubert-large/` 和 `auxiliary/` 或 `auxiliary_min/`。
+- 至少需要 `quicktalk.pth`、`repair.npy`、`chinese-hubert-large/` 和 `auxiliary/` 或 `auxiliary_min/`。
 - Avatar 需要 `quicktalk.template_video` 或可由上传图片生成模板视频。

 ## 准备权重
@@ -130,7 +130,7 @@ uv run opentalking-quicktalk-bench \

 ### 提示资产不完整

-检查 `checkpoints/quicktalk.pth` 或 `checkpoints/256.onnx`、`repair.npy`、`chinese-hubert-large/`、`auxiliary/` 是否存在。
+检查 `checkpoints/quicktalk.pth`、`repair.npy`、`chinese-hubert-large/`、`auxiliary/` 是否存在。

 ### 首次创建会话很慢

--- a/docs/zh/quick-start/platform-notes.md
+++ b/docs/zh/quick-start/platform-notes.md
@@ -8,7 +8,7 @@

 | 平台 | 推荐用途 | 可用路径 | 说明 |
 | --- | --- | --- | --- |
-| macOS | 文档、前端、API、Mock 验证 | `mock` | 适合快速体验，不建议作为真实模型推理环境。 |
+| macOS | 文档、前端、API、Mock 验证；Apple Silicon 上实验性 QuickTalk local | `mock`、实验性 `quicktalk` local | 适合快速体验；见 [Apple Silicon 上运行 QuickTalk](../model-deployment/quicktalk/apple-silicon.md)。稳定实时输出仍推荐 Linux GPU。 |
 | Linux + CUDA | 真实模型验证与部署 | `mock`、`quicktalk`、`wav2lip`、`musetalk`、`omnirt` | 主要推荐环境。 |
 | Linux + Ascend NPU | 私有化和 NPU 评估 | `mock`、部分 OmniRT / FlashTalk 路线 | 依赖 CANN、驱动和 `torch_npu` |

@@ -24,11 +24,18 @@ brew install python@3.11 node ffmpeg
 uv sync --extra dev --python 3.11
 ```

-### 不适合真实数字人模型
+### Apple Silicon 上的实验性 QuickTalk local

-QuickTalk、MuseTalk、FlashTalk 等模型主要面向 CUDA GPU 或专用推理服务。macOS 上即使可以安装
-部分 Python 依赖，也不建议作为真实视频生成路径；更推荐把模型部署到 Linux GPU 机器，
-再通过 OpenTalking 连接远端推理服务。
+Apple Silicon 可以用 `quicktalk-cpu` 跑 QuickTalk local，适合开发、演示和集成验证。完整步骤见
+[Apple Silicon 上运行 QuickTalk](../model-deployment/quicktalk/apple-silicon.md)。
+
+这条路径会在 PyTorch MPS 可用时使用 MPS，否则回退 CPU。它不是推荐的生产实时路径；需要稳定
+25fps 输出时，请使用 Linux CUDA 或 OmniRT。
+
+### 其他真实数字人模型
+
+MuseTalk、FlashTalk 等生产路径主要面向 CUDA GPU、昇腾 NPU 或专用推理服务。更推荐把这些模型
+部署到 Linux GPU/NPU 机器，再通过 OpenTalking 连接远端推理服务。

 ### ffmpeg 安装

--- a/docs/zh/tutorials/installation.md
+++ b/docs/zh/tutorials/installation.md
@@ -21,7 +21,7 @@ OpenTalking 提供两种安装方式。选择哪一种取决于两个问题：

 | 平台 | 合成后端 | 说明 |
 |------|---------|------|
-| macOS（Apple Silicon 与 Intel） | `mock` | 适用于编排与前端开发；macOS 不支持真实 talking-head 模型。 |
+| macOS（Apple Silicon 与 Intel） | `mock`、Apple Silicon 实验性 `quicktalk` local | 适用于编排与前端开发；QuickTalk local 可用 `quicktalk-cpu` 在 Apple Silicon 上验证，完整步骤见 [Apple Silicon 上运行 QuickTalk](../model-deployment/quicktalk/apple-silicon.md)，生产实时路径仍以 Linux GPU/NPU 或 OmniRT 为主。 |
 | Linux x86_64 + CUDA 12 | `mock`、`wav2lip`、`musetalk`、`flashtalk`、`flashhead`、`quicktalk` | 主要部署目标。 |
 | Linux aarch64 + 昇腾 910B（CANN 8.0+） | `mock`、`wav2lip`、`flashtalk` | NPU 生产部署路径。 |
 | Windows | `mock`（建议 WSL2） | 不在持续集成矩阵中。 |
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -252,6 +252,7 @@ nav:
              - QuickTalk:
                  - Overview: model-deployment/quicktalk.md
                  - Local: model-deployment/quicktalk/local.md
+                  - Apple Silicon: model-deployment/quicktalk/apple-silicon.md
                  - OmniRT: model-deployment/quicktalk/omnirt.md
              - Wav2Lip:
                  - Overview: model-deployment/wav2lip.md
@@ -329,6 +330,7 @@ nav:
          - QuickTalk:
              - Overview: model-deployment/quicktalk.md
              - Local: model-deployment/quicktalk/local.md
+              - Apple Silicon: model-deployment/quicktalk/apple-silicon.md
              - OmniRT: model-deployment/quicktalk/omnirt.md
          - Wav2Lip:
              - Overview: model-deployment/wav2lip.md
--- a/opentalking/core/config.py
+++ b/opentalking/core/config.py
@@ -367,6 +367,15 @@ class Settings(BaseSettings):
    flashhead_frame_num: int = 29
    flashhead_chunk_samples: int = 17920

+    quicktalk_asset_root: str = ""
+    quicktalk_model_root: str = ""
+    quicktalk_backend: str = ""
+    quicktalk_model_backend: str = "auto"
+    quicktalk_device: str = ""
+    quicktalk_hubert_device: str = ""
+    quicktalk_worker_cache: bool = True
+    quicktalk_slice_len: int = 0
+
    llm_provider: str = "openai_compatible"
    llm_base_url: str = ""
    llm_api_key: str = ""
--- a/opentalking/models/quicktalk/adapter.py
+++ b/opentalking/models/quicktalk/adapter.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import logging
 import os
+import platform
 import threading
 from collections import OrderedDict
 from dataclasses import dataclass
@@ -14,6 +15,7 @@ from opentalking.avatar.loader import load_avatar_bundle
 from opentalking.core.interfaces.avatar_asset import AvatarManifest
 from opentalking.core.types.frames import AudioChunk, VideoFrameData
 from opentalking.media.frame_avatar import numpy_bgr_to_videoframe
+from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
 from opentalking.models.registry import register_model

 if TYPE_CHECKING:  # pragma: no cover — avoids importing torch/onnx at module load
@@ -26,6 +28,8 @@ log = logging.getLogger(__name__)
 class QuickTalkFeatures:
    reps: list[np.ndarray]
    audio_feature_seconds: float
+    render_reps: list[np.ndarray] | None = None
+    output_fps: float | None = None


@dataclass
@@ -87,6 +91,39 @@ def _env_value(name: str, default: str = "") -> str:
    return os.environ.get(name, "").strip() or default


+def _default_quicktalk_device() -> str:
+    if platform.system() == "Darwin" and platform.machine().lower() in {"arm64", "aarch64"}:
+        try:
+            import torch
+
+            mps = getattr(getattr(torch, "backends", None), "mps", None)
+            if mps is not None and bool(mps.is_available()):
+                return "mps"
+        except Exception:
+            pass
+        return "cpu"
+    return "cuda:0"
+
+
+def _first_configured_device(*values: str | None) -> str:
+    for value in values:
+        device = (value or "").strip()
+        if device and device.lower() != "auto":
+            return device
+    return ""
+
+
+def _configured_quicktalk_device(*extra_values: str | None) -> str:
+    return (
+        _first_configured_device(
+            _env_value("OPENTALKING_QUICKTALK_DEVICE"),
+            _env_value("OPENTALKING_TORCH_DEVICE"),
+            *extra_values,
+        )
+        or _default_quicktalk_device()
+    )
+
+
 def _positive_int_env(name: str, default: int) -> int:
    try:
        value = int(_env_value(name, str(default)))
@@ -95,6 +132,39 @@ def _positive_int_env(name: str, default: int) -> int:
    return max(1, value)


+def _optional_positive_int_env(name: str) -> int | None:
+    raw = _env_value(name)
+    if not raw:
+        return None
+    try:
+        value = int(raw)
+    except ValueError:
+        return None
+    return value if value > 0 else None
+
+
+def _downsample_sequence(items: list[np.ndarray], target_count: int) -> list[np.ndarray]:
+    if target_count >= len(items):
+        return items
+    if target_count <= 1:
+        return [items[0]]
+    last = len(items) - 1
+    return [items[int(round(i * last / float(target_count - 1)))] for i in range(target_count)]
+
+
+def _quicktalk_render_plan(
+    reps: list[np.ndarray],
+    *,
+    worker_fps: float,
+) -> tuple[list[np.ndarray], float]:
+    target_fps = _optional_positive_int_env("OPENTALKING_QUICKTALK_FPS")
+    output_fps = worker_fps
+    if target_fps is None or target_fps >= worker_fps or not reps:
+        return reps, output_fps
+    target_count = max(1, int(round(float(len(reps)) * float(target_fps) / worker_fps)))
+    return _downsample_sequence(list(reps), target_count), float(target_fps)
+
+
 def _close_worker(worker: Any) -> None:
    close = getattr(worker, "close", None)
    if callable(close):
@@ -171,18 +241,22 @@ def _quicktalk_template_from_bundle(bundle_path: Path) -> Path | None:
    return None


-def _optional_env_path(name: str) -> Path | None:
-    raw = _env_value(name)
-    if not raw:
+def _quicktalk_settings() -> Any | None:
+    try:
+        from opentalking.core.config import get_settings
+
+        return get_settings()
+    except Exception:
        return None
-    return Path(raw).expanduser().resolve()


 def _quicktalk_asset_root_env() -> Path | None:
-    return (
-        _optional_env_path("OPENTALKING_QUICKTALK_ASSET_ROOT")
-        or _optional_env_path("OPENTALKING_QUICKTALK_MODEL_ROOT")
-        or _optional_env_path("OMNIRT_QUICKTALK_MODEL_ROOT")
+    return resolve_quicktalk_asset_root(None, include_default=False)
+
+
+def _quicktalk_asset_root_config(settings: Any | None = None) -> Path | None:
+    return resolve_quicktalk_asset_root(
+        settings if settings is not None else _quicktalk_settings()
    )


@@ -284,7 +358,7 @@ def _validate_asset_root(asset_root: Path) -> None:
        formatted = "\n  - ".join(str(path) for path in missing)
        raise FileNotFoundError(
            "QuickTalk local assets are incomplete. "
-            "OPENTALKING_QUICKTALK_ASSET_ROOT or OPENTALKING_QUICKTALK_MODEL_ROOT must point to a QuickTalk local "
+            "OPENTALKING_QUICKTALK_ASSET_ROOT must point to a QuickTalk local "
            "asset directory containing checkpoints/quicktalk.pth or checkpoints/256.onnx, checkpoints/repair.npy, "
            "checkpoints/chinese-hubert-large/ and checkpoints/auxiliary/.\n"
            f"Current asset root: {asset_root}\n"
@@ -311,11 +385,21 @@ class QuickTalkAdapter:
    model_type = "quicktalk"

    def __init__(self) -> None:
-        self._device = os.environ.get("OPENTALKING_TORCH_DEVICE", "cuda:0")
+        settings = _quicktalk_settings()
+        self._device = _configured_quicktalk_device(
+            getattr(settings, "quicktalk_device", None) if settings is not None else None,
+            getattr(settings, "torch_device", None) if settings is not None else None,
+            getattr(settings, "device", None) if settings is not None else None,
+        )
        # 多卡部署：让 HuBERT 跑在另一张卡，避免与 ONNX 在同一 GPU default
        # stream 上排队。空字符串表示与主 device 同卡（默认行为）。
        self._hubert_device = (
            _env_value("OPENTALKING_QUICKTALK_HUBERT_DEVICE") or None
+            or (
+                str(getattr(settings, "quicktalk_hubert_device", "") or "").strip()
+                if settings is not None
+                else None
+            )
        )
        self._asset_root = _quicktalk_asset_root_env()
        self._output_transform = _env_value(
@@ -328,7 +412,12 @@ class QuickTalkAdapter:
        self._neck_fade_start = float(_env_value("OPENTALKING_QUICKTALK_NECK_FADE_START", "0.72"))
        self._neck_fade_end = float(_env_value("OPENTALKING_QUICKTALK_NECK_FADE_END", "0.88"))
        self._max_template_seconds_env = _env_value("OPENTALKING_QUICKTALK_MAX_TEMPLATE_SECONDS")
-        self._model_backend = _env_value("OPENTALKING_QUICKTALK_MODEL_BACKEND", "auto")
+        self._model_backend = _env_value(
+            "OPENTALKING_QUICKTALK_MODEL_BACKEND",
+            str(getattr(settings, "quicktalk_model_backend", "") or "").strip()
+            if settings is not None
+            else "auto",
+        )
        # Idle frame selection. The template video typically contains the source
        # speaker talking, so cycling all frames during idle makes the avatar
        # appear to keep speaking. We restrict idle to a configurable still
@@ -369,14 +458,25 @@ class QuickTalkAdapter:
        return None

    @staticmethod
-    def runtime_available() -> bool:
+    def runtime_available(settings: Any | None = None) -> bool:
        try:
-            asset_root = _quicktalk_asset_root_env()
+            asset_root = _quicktalk_asset_root_config(settings)
            if asset_root is None:
                return False
            _validate_asset_root(_normalize_asset_root(asset_root))
-            device = _env_value("OPENTALKING_QUICKTALK_DEVICE") or _env_value("OPENTALKING_TORCH_DEVICE", "cuda:0")
-            hubert_device = _env_value("OPENTALKING_QUICKTALK_HUBERT_DEVICE")
+            device = _configured_quicktalk_device(
+                getattr(settings, "quicktalk_device", None) if settings is not None else None,
+                getattr(settings, "torch_device", None) if settings is not None else None,
+                getattr(settings, "device", None) if settings is not None else None,
+            )
+            hubert_device = (
+                _env_value("OPENTALKING_QUICKTALK_HUBERT_DEVICE")
+                or (
+                    str(getattr(settings, "quicktalk_hubert_device", "") or "").strip()
+                    if settings is not None
+                    else ""
+                )
+            )
            return _explicit_cuda_available(device) and _explicit_cuda_available(hubert_device)
        except Exception:
            return False
@@ -404,14 +504,23 @@ class QuickTalkAdapter:
    def load_avatar(self, avatar_path: str) -> QuickTalkState:
        bundle = load_avatar_bundle(Path(avatar_path), strict=False)
        metadata = bundle.manifest.metadata or {}
-        asset_root = self._asset_root if self._asset_root is not None else _path_from_env_or_metadata(
-            "OPENTALKING_QUICKTALK_ASSET_ROOT",
-            metadata,
-            "asset_root",
-            "quicktalk_asset_root",
-            base_dir=bundle.path,
-            sections=("quicktalk",),
-        )
+        asset_root = self._asset_root
+        if asset_root is None:
+            try:
+                asset_root = _path_from_env_or_metadata(
+                    "OPENTALKING_QUICKTALK_ASSET_ROOT",
+                    metadata,
+                    "asset_root",
+                    "quicktalk_asset_root",
+                    base_dir=bundle.path,
+                    sections=("quicktalk",),
+                )
+            except ValueError:
+                asset_root = _quicktalk_asset_root_config()
+                if asset_root is None:
+                    raise
+        if asset_root is None:
+            raise ValueError("Missing OPENTALKING_QUICKTALK_ASSET_ROOT or QuickTalk settings asset root")
        asset_root = _normalize_asset_root(asset_root)
        _validate_asset_root(asset_root)
        prepared_template, face_cache_file = _prepared_quicktalk_template_and_cache(
@@ -561,11 +670,24 @@ class QuickTalkAdapter:
            np.asarray(audio_chunk.data, dtype=np.int16).reshape(-1),
            int(audio_chunk.sample_rate),
        )
-        return QuickTalkFeatures(reps=reps, audio_feature_seconds=feature_seconds)
+        render_reps, output_fps = _quicktalk_render_plan(
+            reps,
+            worker_fps=float(getattr(avatar_state.worker, "fps", 25) or 25),
+        )
+        avatar_state.fps = output_fps
+        return QuickTalkFeatures(
+            reps=reps,
+            audio_feature_seconds=feature_seconds,
+            render_reps=render_reps,
+            output_fps=output_fps,
+        )

    def infer(self, features: QuickTalkFeatures, avatar_state: QuickTalkState) -> Iterator[np.ndarray]:
+        render_reps = features.render_reps if features.render_reps is not None else features.reps
+        if features.output_fps is not None:
+            avatar_state.fps = features.output_fps
        return avatar_state.worker.generate_frames_from_reps(
-            features.reps, state=avatar_state.session_state
+            render_reps, state=avatar_state.session_state
        )

    def compose_frame(
@@ -593,11 +715,27 @@ class QuickTalkAdapter:
            np.asarray(audio_chunk.data, dtype=np.int16).reshape(-1),
            int(audio_chunk.sample_rate),
        )
-        features = QuickTalkFeatures(reps=reps, audio_feature_seconds=feature_seconds)
+        render_reps, output_fps = _quicktalk_render_plan(
+            reps,
+            worker_fps=float(getattr(avatar_state.worker, "fps", 25) or 25),
+        )
+        features = QuickTalkFeatures(
+            reps=reps,
+            audio_feature_seconds=feature_seconds,
+            render_reps=render_reps,
+            output_fps=output_fps,
+        )
        frames = []
-        for prediction in avatar_state.worker.generate_frames_from_reps(
-            reps, state=avatar_state.session_state
-        ):
-            frames.append(self.compose_frame(avatar_state, avatar_state.frame_index, prediction))
-            avatar_state.frame_index += 1
+        previous_fps = getattr(avatar_state, "fps", None)
+        avatar_state.fps = output_fps
+        try:
+            predictions = avatar_state.worker.generate_frames_from_reps(
+                render_reps, state=avatar_state.session_state
+            )
+            for prediction in predictions:
+                frames.append(self.compose_frame(avatar_state, avatar_state.frame_index, prediction))
+                avatar_state.frame_index += 1
+        finally:
+            if previous_fps is not None:
+                avatar_state.fps = previous_fps
        return features, frames
--- a/opentalking/models/quicktalk/paths.py
+++ b/opentalking/models/quicktalk/paths.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+log = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class QuickTalkRootCandidate:
+    source: str
+    path: Path
+    deprecated: bool = False
+    default: bool = False
+
+
+def _path_from_raw(raw: object) -> Path | None:
+    value = str(raw or "").strip()
+    if not value:
+        return None
+    return Path(value).expanduser().resolve()
+
+
+def _settings_path(settings: Any | None, name: str) -> Path | None:
+    if settings is None:
+        return None
+    return _path_from_raw(getattr(settings, name, ""))
+
+
+def _env_path(name: str) -> Path | None:
+    return _path_from_raw(os.environ.get(name, ""))
+
+
+def quicktalk_asset_root_candidates(
+    settings: Any | None = None,
+    *,
+    include_legacy: bool = True,
+    include_default: bool = True,
+) -> list[QuickTalkRootCandidate]:
+    """Return QuickTalk asset-root candidates in the supported priority order.
+
+    New deployments should set only ``OPENTALKING_QUICKTALK_ASSET_ROOT`` or the
+    corresponding settings field. The other names are kept only so existing
+    installations do not break during upgrades.
+    """
+
+    candidates: list[QuickTalkRootCandidate] = []
+    settings_asset_root = _settings_path(settings, "quicktalk_asset_root")
+    if settings_asset_root is not None:
+        candidates.append(
+            QuickTalkRootCandidate("settings.quicktalk_asset_root", settings_asset_root)
+        )
+
+    env_asset_root = _env_path("OPENTALKING_QUICKTALK_ASSET_ROOT")
+    if env_asset_root is not None:
+        candidates.append(
+            QuickTalkRootCandidate("OPENTALKING_QUICKTALK_ASSET_ROOT", env_asset_root)
+        )
+
+    if include_legacy:
+        legacy_settings_root = _settings_path(settings, "quicktalk_model_root")
+        if legacy_settings_root is not None:
+            candidates.append(
+                QuickTalkRootCandidate(
+                    "settings.quicktalk_model_root",
+                    legacy_settings_root,
+                    deprecated=True,
+                )
+            )
+
+        for name in ("OPENTALKING_QUICKTALK_MODEL_ROOT", "OMNIRT_QUICKTALK_MODEL_ROOT"):
+            path = _env_path(name)
+            if path is not None:
+                candidates.append(QuickTalkRootCandidate(name, path, deprecated=True))
+
+        omnirt_model_root = _env_path("OMNIRT_MODEL_ROOT")
+        if omnirt_model_root is not None:
+            candidates.append(
+                QuickTalkRootCandidate(
+                    "OMNIRT_MODEL_ROOT/quicktalk",
+                    (omnirt_model_root / "quicktalk").resolve(),
+                    deprecated=True,
+                )
+            )
+
+    if include_default and settings is not None:
+        models_dir = _path_from_raw(getattr(settings, "models_dir", ""))
+        if models_dir is not None:
+            candidates.append(
+                QuickTalkRootCandidate(
+                    "settings.models_dir/quicktalk",
+                    (models_dir / "quicktalk").resolve(),
+                    default=True,
+                )
+            )
+
+    return candidates
+
+
+def resolve_quicktalk_asset_root(
+    settings: Any | None = None,
+    *,
+    include_legacy: bool = True,
+    include_default: bool = True,
+) -> Path | None:
+    candidates = quicktalk_asset_root_candidates(
+        settings,
+        include_legacy=include_legacy,
+        include_default=include_default,
+    )
+    if not candidates:
+        return None
+    _warn_conflicting_explicit_roots(candidates)
+    return candidates[0].path
+
+
+def _warn_conflicting_explicit_roots(candidates: list[QuickTalkRootCandidate]) -> None:
+    explicit = [candidate for candidate in candidates if not candidate.default]
+    unique_paths = {candidate.path for candidate in explicit}
+    if len(unique_paths) <= 1:
+        return
+    formatted = ", ".join(f"{candidate.source}={candidate.path}" for candidate in explicit)
+    log.warning(
+        "Found conflicting QuickTalk asset roots; using %s=%s. Conflicting roots: %s",
+        candidates[0].source,
+        candidates[0].path,
+        formatted,
+    )
--- a/opentalking/models/quicktalk/runtime_v2.py
+++ b/opentalking/models/quicktalk/runtime_v2.py
@@ -47,10 +47,22 @@ def run_cmd(cmd: Sequence[str]) -> None:


 def ensure_ffmpeg() -> str:
+    configured = os.environ.get("OPENTALKING_FFMPEG_BIN", "").strip()
+    if configured:
+        return configured
    ffmpeg = shutil.which("ffmpeg")
-    if not ffmpeg:
-        raise RuntimeError("ffmpeg not found in PATH")
-    return ffmpeg
+    if ffmpeg:
+        return ffmpeg
+    try:
+        import imageio_ffmpeg
+
+        return str(imageio_ffmpeg.get_ffmpeg_exe())
+    except Exception as exc:
+        raise RuntimeError(
+            "ffmpeg not found. Install ffmpeg or install imageio-ffmpeg; "
+            "on macOS, `uv sync --extra models --extra quicktalk-cpu --python 3.11` "
+            "includes the fallback binary."
+        ) from exc


 def maybe_mkdir(path: Path) -> None:
@@ -142,14 +154,7 @@ class QuickTalkModelBackend(Protocol):

 class OnnxQuickTalkModel:
    def __init__(self, onnx_path: Path, device: torch.device) -> None:
-        if device.type == "cuda":
-            device_id = device.index if device.index is not None else 0
-            providers = [
-                ("CUDAExecutionProvider", {"device_id": device_id}),
-                "CPUExecutionProvider",
-            ]
-        else:
-            providers = ["CPUExecutionProvider"]
+        providers = _onnx_providers_for_device(device)
        self.session = ort.InferenceSession(str(onnx_path), providers=providers)
        self.input_names = [x.name for x in self.session.get_inputs()]

@@ -172,6 +177,20 @@ class OnnxQuickTalkModel:
        return cast(np.ndarray, g), cast(np.ndarray, hn_out), cast(np.ndarray, cn_out)


+def _onnx_providers_for_device(device: torch.device) -> list[str | tuple[str, dict[str, int]]]:
+    """Choose ONNX Runtime providers for *device*; CPU is always the last fallback."""
+    installed = frozenset(ort.get_available_providers())
+    chain: list[str | tuple[str, dict[str, int]]] = []
+    if device.type == "cuda" and "CUDAExecutionProvider" in installed:
+        # A bare "cuda" device carries no index; treat it as GPU 0.
+        gpu_index = 0 if device.index is None else device.index
+        chain.append(("CUDAExecutionProvider", {"device_id": gpu_index}))
+    elif device.type == "mps" and "CoreMLExecutionProvider" in installed:
+        chain.append("CoreMLExecutionProvider")
+    chain.append("CPUExecutionProvider")
+    return chain
+
+
 class TorchQuickTalkModel:
    input_names = ["input_1", "input_2", "input_3", "input_4"]

--- a/opentalking/providers/synthesis/audio2video_client.py
+++ b/opentalking/providers/synthesis/audio2video_client.py
@@ -29,6 +29,38 @@ def make_audio_chunk(audio_pcm: np.ndarray, *, sample_rate: int = 16000) -> Audi
    return AudioChunk(data=pcm, sample_rate=int(sample_rate), duration_ms=duration_ms)


+def _positive_int_env(*names: str) -> int | None:
+    """Return the first positive integer found in the env vars *names*, else ``None``."""
+    for name in names:
+        if not (raw := os.environ.get(name, "").strip()):
+            continue  # unset or blank: try the next name
+        try:
+            parsed = int(raw)
+        except ValueError:
+            logger.warning("Ignoring invalid positive integer env %s=%r", name, raw)
+        else:
+            if parsed > 0:
+                return parsed
+            logger.warning("Ignoring non-positive integer env %s=%r", name, raw)
+    return None
+
+
+def _quicktalk_slice_len_for_device(device: str) -> int:
+    """Frames per QuickTalk chunk: env override first, else 12 on MPS and 28 elsewhere."""
+    override = _positive_int_env(
+        "OPENTALKING_QUICKTALK_SLICE_LEN",
+        "OPENTALKING_QUICKTALK_CHUNK_FRAMES",
+    )
+    if override is None:
+        on_mps = str(device or "").strip().lower().startswith("mps")
+        return 12 if on_mps else 28
+    return override
+
+
+def _quicktalk_fps() -> int:
+    return _positive_int_env("OPENTALKING_QUICKTALK_FPS") or 25
+
+
@runtime_checkable
 class Audio2VideoClient(Protocol):
    """Common realtime audio-to-video client contract for local and OmniRT backends."""
@@ -359,9 +391,8 @@ class LocalAudio2VideoClient:
            frames = getattr(state, "frames", None)
            self.frame_num = len(frames) if frames is not None else 1
        if self._is_quicktalk_adapter():
-            self.fps = 25
-            if self.slice_len <= 0:
-                self.slice_len = 28
+            self.fps = _quicktalk_fps()
+            self.slice_len = _quicktalk_slice_len_for_device(self.device)
            self.audio_chunk_samples = max(
                1,
                int(round(float(self.sample_rate) * float(self.slice_len) / max(1, self.fps))),
--- a/opentalking/providers/synthesis/availability.py
+++ b/opentalking/providers/synthesis/availability.py
@@ -87,7 +87,7 @@ def _explicit_env_enabled(name: str) -> bool:
    return raw is not None and raw.strip().lower() in {"1", "true", "yes", "on"}


-def _local_adapter_available(model: str) -> bool:
+def _local_adapter_available(model: str, settings=None) -> bool:
    try:
        adapter = get_adapter(model)
    except Exception:
@@ -95,7 +95,12 @@ def _local_adapter_available(model: str) -> bool:
    runtime_available = getattr(adapter, "runtime_available", None)
    if callable(runtime_available):
        try:
-            return bool(runtime_available())
+            return bool(runtime_available(settings=settings))
+        except TypeError:
+            try:
+                return bool(runtime_available())
+            except Exception:
+                return False
        except Exception:
            return False
    return True
@@ -131,7 +136,7 @@ async def resolve_model_statuses(settings) -> list[ModelStatus]:
            connected = True
            reason = "local_self_test"
        elif resolved.backend == "local":
-            connected = _local_adapter_available(model)
+            connected = _local_adapter_available(model, settings=settings)
            reason = "local_runtime" if connected else "local_adapter_missing"
        elif resolved.backend == "omnirt":
            if has_omnirt:
--- a/opentalking/providers/synthesis/backends.py
+++ b/opentalking/providers/synthesis/backends.py
@@ -45,7 +45,9 @@ def direct_ws_url(model: str, settings: Any) -> str:

 def resolve_model_backend(model: str, settings: Any) -> ModelBackend:
    model = model.strip().lower()
-    backend = get_model_backend(model)
+    backend = str(getattr(settings, f"{model}_backend", "") or "").strip().lower()
+    if backend not in {"mock", "local", "omnirt", "direct_ws"}:
+        backend = get_model_backend(model)
    if backend == "direct_ws":
        return ModelBackend(model=model, backend=backend, ws_url=direct_ws_url(model, settings))
    return ModelBackend(model=model, backend=backend)
--- a/opentalking/runtime/task_consumer.py
+++ b/opentalking/runtime/task_consumer.py
@@ -90,15 +90,15 @@ def _log_task_exception(task: asyncio.Task, sid: str) -> None:
 def _local_runner_device(model: str, settings: Any, default_device: str) -> str:
    model = model.strip().lower()
    if model == "quicktalk":
-        return str(
-            os.environ.get("OPENTALKING_QUICKTALK_DEVICE")
-            or getattr(settings, "quicktalk_device", "")
-            or os.environ.get("OPENTALKING_TORCH_DEVICE")
-            or getattr(settings, "torch_device", "")
-            or os.environ.get("OPENTALKING_DEVICE")
-            or getattr(settings, "device", "")
-            or os.environ.get("DEVICE")
-            or default_device
+        from opentalking.models.quicktalk.adapter import _configured_quicktalk_device
+
+        return _configured_quicktalk_device(
+            getattr(settings, "quicktalk_device", ""),
+            os.environ.get("OPENTALKING_DEVICE"),
+            os.environ.get("DEVICE"),
+            getattr(settings, "torch_device", ""),
+            getattr(settings, "device", ""),
+            default_device,
        )
    if model == "wav2lip":
        return str(
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -97,9 +97,11 @@ local-qwen3-tts-service = [
  "uvicorn[standard]>=0.27",
 ]
 quicktalk-cpu = [
+  "imageio-ffmpeg>=0.5",
  "onnxruntime>=1.24.3",
 ]
 quicktalk-cuda = [
+  "imageio-ffmpeg>=0.5",
  "onnxruntime-gpu>=1.24.0",
 ]
 local-cosyvoice-service = [
--- a/scripts/quickstart/_helpers.sh
+++ b/scripts/quickstart/_helpers.sh
@@ -134,3 +134,55 @@ quickstart_describe_port() {

  return 1
 }
+
+quickstart_resolve_ffmpeg() {
+  local py_bin="${repo_root:-}/.venv/bin/python"
+
+  if [[ -n "${OPENTALKING_FFMPEG_BIN:-}" ]]; then
+    printf '%s\n' "$OPENTALKING_FFMPEG_BIN"
+    return 0
+  fi
+
+  if command -v ffmpeg >/dev/null 2>&1; then
+    command -v ffmpeg
+    return 0
+  fi
+
+  if [[ ! -x "$py_bin" ]]; then
+    py_bin="python3"
+  fi
+  "$py_bin" - <<'PY'
+import imageio_ffmpeg
+
+print(imageio_ffmpeg.get_ffmpeg_exe())
+PY
+}
+
+quickstart_detach() {
+  # Usage: quickstart_detach LOG_FILE CMD [ARGS...] - run CMD in a new session and print its PID.
+  local log_file="$1"
+  shift
+  if command -v setsid >/dev/null 2>&1; then
+    setsid "$@" >"$log_file" 2>&1 < /dev/null &
+    printf '%s\n' "$!"
+    return 0
+  fi
+  # No setsid(1) available (e.g. macOS): detach via Python, truncating the log like the branch above.
+  python3 - "$log_file" "$@" <<'PY'
+import subprocess
+import sys
+
+log_file = sys.argv[1]
+argv = sys.argv[2:]
+with open(log_file, "wb", buffering=0) as log:
+    process = subprocess.Popen(
+        argv,
+        stdin=subprocess.DEVNULL,
+        stdout=log,
+        stderr=subprocess.STDOUT,
+        close_fds=True,
+        start_new_session=True,
+    )
+print(process.pid)
+PY
+}
--- a/scripts/quickstart/start_frontend.sh
+++ b/scripts/quickstart/start_frontend.sh
@@ -96,8 +96,7 @@ echo "  api:  http://127.0.0.1:$backend_port"
 (
  cd "$web_dir"
  export VITE_BACKEND_PORT="$backend_port"
-  setsid ./node_modules/.bin/vite --host "$web_host" --port "$web_port" >"$log_file" 2>&1 < /dev/null &
-  echo "$!" >"$pid_file"
+  quickstart_detach "$log_file" ./node_modules/.bin/vite --host "$web_host" --port "$web_port" >"$pid_file"
 )

 pid="$(cat "$pid_file" 2>/dev/null || true)"
--- a/scripts/quickstart/start_opentalking.sh
+++ b/scripts/quickstart/start_opentalking.sh
@@ -139,14 +139,13 @@ fi
  export OPENTALKING_FLASHTALK_TTS_TRAILING_SILENCE_MS="${OPENTALKING_FLASHTALK_TTS_TRAILING_SILENCE_MS:-320}"

  # 其它运行时参数
-  export OPENTALKING_FFMPEG_BIN="${OPENTALKING_FFMPEG_BIN:-ffmpeg}"
+  export OPENTALKING_FFMPEG_BIN="$(quickstart_resolve_ffmpeg)"
  export OPENTALKING_TTS_STREAMING_DECODE="${OPENTALKING_TTS_STREAMING_DECODE:-1}"
  export OPENTALKING_TTS_SAMPLE_RATE="${OPENTALKING_TTS_SAMPLE_RATE:-16000}"
  export OMNIRT_AUDIO2VIDEO_PATH_TEMPLATE="${OMNIRT_AUDIO2VIDEO_PATH_TEMPLATE:-/v1/audio2video/{model}}"
  export FLASHTALK_PREBUFFER_CHUNKS="${FLASHTALK_PREBUFFER_CHUNKS:-2}"

-  setsid opentalking-unified >"$log_file" 2>&1 < /dev/null &
-  echo "$!" >"$pid_file"
+  quickstart_detach "$log_file" opentalking-unified >"$pid_file"
 )

 pid="$(cat "$pid_file" 2>/dev/null || true)"
--- a/scripts/start_unified.sh
+++ b/scripts/start_unified.sh
@@ -189,6 +189,31 @@ if [[ "$backend" == "local" && "$model" == "musetalk" ]]; then
  bash "$quickstart_dir/prepare_local_musetalk.sh"
 fi

+if [[ "$backend" == "local" && "$model" == "quicktalk" ]]; then
+  export OMNIRT_ENDPOINT=""
+  export OPENTALKING_OMNIRT_ENDPOINT=""
+  if [[ "$(uname -s)" == "Darwin" && -z "${OPENTALKING_QUICKTALK_DEVICE:-}" && -z "${OPENTALKING_TORCH_DEVICE:-}" ]]; then
+    quicktalk_mac_device="$("$script_dir/../.venv/bin/python" - <<'PY' 2>/dev/null || true
+import platform
+import sys
+
+if sys.platform == 'darwin' and platform.machine().lower() in {'arm64', 'aarch64'}:
+    try:
+        import torch
+
+        print('mps' if torch.backends.mps.is_available() else 'cpu')
+    except Exception:
+        print('cpu')
+PY
+)"
+    quicktalk_mac_device="${quicktalk_mac_device:-cpu}"
+    export OPENTALKING_QUICKTALK_DEVICE="$quicktalk_mac_device"
+    export OPENTALKING_TORCH_DEVICE="$quicktalk_mac_device"
+    echo "Apple Silicon QuickTalk local defaults: OPENTALKING_QUICKTALK_DEVICE=$quicktalk_mac_device"
+    echo "Install macOS QuickTalk dependencies with: uv sync --extra dev --extra models --extra quicktalk-cpu --python 3.11"
+  fi
+fi
+
 bash "$quickstart_dir/start_opentalking.sh" "${start_args[@]}"
 bash "$quickstart_dir/start_frontend.sh" "${web_args[@]}"

--- a/tests/unit/test_audio2video_client.py
+++ b/tests/unit/test_audio2video_client.py
@@ -321,3 +321,61 @@ async def test_local_quicktalk_uses_omnirt_chunk_defaults(tmp_path: Path) -> Non
    assert init["slice_len"] == 28
    assert init["chunk_samples"] == 17920
    assert client.audio_chunk_samples == 17920
+
+
+@pytest.mark.asyncio
+async def test_local_quicktalk_uses_smaller_chunks_on_mps_by_default(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_SLICE_LEN", raising=False)
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_CHUNK_FRAMES", raising=False)
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_FPS", raising=False)  # keep fps at 25
+    adapter = FakeQuickTalkLocalAdapter()
+    client = LocalAudio2VideoClient(adapter, device="mps")
+    avatar = tmp_path / "avatar"
+    avatar.mkdir()
+    init = await client.init_session(avatar_path=avatar)
+
+    assert init["fps"] == 25
+    assert init["slice_len"] == 12
+    assert init["chunk_samples"] == 7680
+    assert client.audio_chunk_samples == 7680
+
+
+@pytest.mark.asyncio
+async def test_local_quicktalk_slice_len_env_overrides_mps_default(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_SLICE_LEN", "16")
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_FPS", raising=False)  # 10240 assumes 25 fps
+    adapter = FakeQuickTalkLocalAdapter()
+    client = LocalAudio2VideoClient(adapter, device="mps")
+    avatar = tmp_path / "avatar"
+    avatar.mkdir()
+    init = await client.init_session(avatar_path=avatar)
+
+    assert init["slice_len"] == 16
+    assert init["chunk_samples"] == 10240
+
+
+@pytest.mark.asyncio
+async def test_local_quicktalk_fps_env_can_lower_mps_playback_rate(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_FPS", "14")
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_SLICE_LEN", raising=False)
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_CHUNK_FRAMES", raising=False)
+    adapter = FakeQuickTalkLocalAdapter()
+    client = LocalAudio2VideoClient(adapter, device="mps")
+    avatar = tmp_path / "avatar"
+    avatar.mkdir()
+
+    init = await client.init_session(avatar_path=avatar)
+
+    assert init["fps"] == 14
+    assert init["slice_len"] == 12
+    assert init["chunk_samples"] == 13714
+    assert client.audio_chunk_samples == 13714
--- a/tests/unit/test_local_audio_providers.py
+++ b/tests/unit/test_local_audio_providers.py
@@ -1221,6 +1221,7 @@ def test_quicktalk_cuda_extra_declares_gpu_onnxruntime():
    assert "quicktalk-cuda" in pyproject
    assert "onnxruntime>=1.24.3" in pyproject
    assert "onnxruntime-gpu>=1.24.0" in pyproject
+    assert "imageio-ffmpeg>=0.5" in pyproject

    base_deps = pyproject.split("dependencies = [", 1)[1].split("]", 1)[0]
    models_extra = pyproject.split("models = [", 1)[1].split("]", 1)[0]
@@ -1238,6 +1239,50 @@ def test_onnxruntime_extras_declare_uv_conflicts():
    assert '{ extra = "demo" }' in pyproject


+def test_quicktalk_onnx_provider_prefers_coreml_for_mps(monkeypatch):
+    from opentalking.models.quicktalk import runtime_v2
+
+    monkeypatch.setattr(
+        runtime_v2.ort,
+        "get_available_providers",
+        lambda: ["CoreMLExecutionProvider", "CPUExecutionProvider"],
+    )
+
+    assert runtime_v2._onnx_providers_for_device(runtime_v2.torch.device("mps")) == [
+        "CoreMLExecutionProvider",
+        "CPUExecutionProvider",
+    ]
+
+
+def test_quicktalk_onnx_provider_keeps_cuda_for_cuda_device(monkeypatch):
+    from opentalking.models.quicktalk import runtime_v2
+
+    monkeypatch.setattr(
+        runtime_v2.ort,
+        "get_available_providers",
+        lambda: ["CUDAExecutionProvider", "CPUExecutionProvider"],
+    )
+
+    assert runtime_v2._onnx_providers_for_device(runtime_v2.torch.device("cuda:2")) == [
+        ("CUDAExecutionProvider", {"device_id": 2}),
+        "CPUExecutionProvider",
+    ]
+
+
+def test_quicktalk_runtime_uses_imageio_ffmpeg_fallback(monkeypatch):
+    import types
+
+    from opentalking.models.quicktalk import runtime_v2
+
+    monkeypatch.delenv("OPENTALKING_FFMPEG_BIN", raising=False)
+    monkeypatch.setattr(runtime_v2.shutil, "which", lambda _: None)
+
+    fake_imageio = types.SimpleNamespace(get_ffmpeg_exe=lambda: "/tmp/ffmpeg-imageio")
+    monkeypatch.setitem(runtime_v2.sys.modules, "imageio_ffmpeg", fake_imageio)
+
+    assert runtime_v2.ensure_ffmpeg() == "/tmp/ffmpeg-imageio"
+
+
 def test_download_script_excludes_experimental_model_candidates():
    from scripts import download_local_audio_models as downloader

--- a/tests/unit/test_quickstart_env.py
+++ b/tests/unit/test_quickstart_env.py
@@ -69,7 +69,7 @@ def test_quickstart_source_env_keeps_new_env_file_assignments(tmp_path: Path) ->
        pytest.skip("bash is not available")
    env_file = tmp_path / "quickstart.env"
    env_file.write_text(
-        "OPENTALKING_QUICKTALK_MODEL_ROOT=/models/quicktalk\n"
+        "OPENTALKING_QUICKTALK_ASSET_ROOT=/models/quicktalk\n"
        "OPENTALKING_WAV2LIP_DEVICE=cuda:6\n",
        encoding="utf-8",
    )
@@ -77,18 +77,52 @@ def test_quickstart_source_env_keeps_new_env_file_assignments(tmp_path: Path) ->
    script = f"""
 set -euo pipefail
 export OPENTALKING_TORCH_DEVICE=cuda:6
-unset OPENTALKING_QUICKTALK_MODEL_ROOT
+unset OPENTALKING_QUICKTALK_ASSET_ROOT
 unset OPENTALKING_WAV2LIP_DEVICE
 source scripts/quickstart/_helpers.sh
 quickstart_source_env {env_file}
 bash -c 'test "$OPENTALKING_TORCH_DEVICE" = cuda:6'
-bash -c 'test "$OPENTALKING_QUICKTALK_MODEL_ROOT" = /models/quicktalk'
+bash -c 'test "$OPENTALKING_QUICKTALK_ASSET_ROOT" = /models/quicktalk'
 bash -c 'test "$OPENTALKING_WAV2LIP_DEVICE" = cuda:6'
 """

    subprocess.run(["bash", "-lc", script], cwd=REPO_ROOT, check=True)


+def test_start_unified_sets_apple_silicon_quicktalk_defaults() -> None:
+    source = (REPO_ROOT / "scripts/start_unified.sh").read_text(encoding="utf-8")
+
+    assert 'if [[ "$backend" == "local" && "$model" == "quicktalk" ]]' in source
+    assert "quicktalk-cpu" in source
+    assert "OPENTALKING_QUICKTALK_DEVICE" in source
+    assert "sys.platform == 'darwin'" in source
+
+
+@pytest.mark.parametrize(
+    "relpath",
+    [
+        "scripts/quickstart/start_opentalking.sh",
+        "scripts/quickstart/start_frontend.sh",
+    ],
+)
+def test_quickstart_process_launch_does_not_require_setsid_on_macos(relpath: str) -> None:
+    source = (REPO_ROOT / relpath).read_text(encoding="utf-8")
+    helpers = (REPO_ROOT / "scripts/quickstart/_helpers.sh").read_text(encoding="utf-8")
+
+    assert "quickstart_detach" in source
+    assert "command -v setsid" in helpers
+    assert "start_new_session=True" in helpers
+
+
+def test_start_opentalking_resolves_ffmpeg_fallback() -> None:
+    source = (REPO_ROOT / "scripts/quickstart/start_opentalking.sh").read_text(encoding="utf-8")
+    helpers = (REPO_ROOT / "scripts/quickstart/_helpers.sh").read_text(encoding="utf-8")
+
+    assert "quickstart_resolve_ffmpeg" in source
+    assert 'OPENTALKING_FFMPEG_BIN="${OPENTALKING_FFMPEG_BIN:-ffmpeg}"' not in source
+    assert "imageio_ffmpeg.get_ffmpeg_exe()" in helpers
+
+
 def test_quickstart_source_ascend_env_tolerates_unset_ld_library_path(tmp_path: Path) -> None:
    if shutil.which("bash") is None:
        pytest.skip("bash is not available")
--- a/tests/unit/test_quicktalk_adapter.py
+++ b/tests/unit/test_quicktalk_adapter.py
@@ -8,8 +8,12 @@ from pathlib import Path
 import numpy as np
 import pytest

-from opentalking.core.types.frames import VideoFrameData
-from opentalking.models.quicktalk.adapter import QuickTalkAdapter
+from opentalking.core.types.frames import AudioChunk, VideoFrameData
+from opentalking.models.quicktalk.adapter import (
+    QuickTalkAdapter,
+    _configured_quicktalk_device,
+    _default_quicktalk_device,
+)


 def _write_quicktalk_local_assets(asset_root: Path) -> None:
@@ -56,6 +60,47 @@ def test_quicktalk_runtime_available_rejects_unavailable_explicit_cuda(
    assert QuickTalkAdapter.runtime_available() is False


+def test_quicktalk_default_device_prefers_mps_on_apple_silicon(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    class FakeMps:
+        @staticmethod
+        def is_available() -> bool:
+            return True
+
+    fake_torch = types.SimpleNamespace(backends=types.SimpleNamespace(mps=FakeMps()))
+    monkeypatch.setitem(sys.modules, "torch", fake_torch)
+    monkeypatch.setattr("platform.system", lambda: "Darwin")
+    monkeypatch.setattr("platform.machine", lambda: "arm64")
+
+    assert _default_quicktalk_device() == "mps"
+
+
+def test_quicktalk_default_device_falls_back_to_cpu_on_apple_silicon_without_mps(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    class FakeMps:
+        @staticmethod
+        def is_available() -> bool:
+            return False
+
+    fake_torch = types.SimpleNamespace(backends=types.SimpleNamespace(mps=FakeMps()))
+    monkeypatch.setitem(sys.modules, "torch", fake_torch)
+    monkeypatch.setattr("platform.system", lambda: "Darwin")
+    monkeypatch.setattr("platform.machine", lambda: "arm64")
+
+    assert _default_quicktalk_device() == "cpu"
+
+
+def test_quicktalk_configured_device_preserves_explicit_generic_device(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_DEVICE", raising=False)
+    monkeypatch.delenv("OPENTALKING_TORCH_DEVICE", raising=False)
+
+    assert _configured_quicktalk_device("auto", "cuda:3") == "cuda:3"
+
+
 def test_quicktalk_adapter_treats_empty_asset_root_env_as_unset(
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
@@ -127,6 +172,94 @@ def test_quicktalk_adapter_falls_back_to_model_root_env(
    assert captured["template_video"] == template.resolve()


+def test_quicktalk_adapter_falls_back_to_settings_asset_root(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    asset_root = tmp_path / "models" / "quicktalk"
+    _write_quicktalk_pth_assets(asset_root)
+    avatar_dir = tmp_path / "avatars" / "anchor"
+    quicktalk_dir = avatar_dir / "quicktalk"
+    quicktalk_dir.mkdir(parents=True)
+    template = quicktalk_dir / "template_512x512.mp4"
+    template.write_bytes(b"video")
+    (avatar_dir / "manifest.json").write_text(
+        json.dumps(
+            {
+                "id": "anchor",
+                "model_type": "quicktalk",
+                "fps": 25,
+                "sample_rate": 16000,
+                "width": 512,
+                "height": 512,
+                "version": "1.0",
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    captured: dict[str, Path | str | None] = {}
+
+    class FakeWorker:
+        fps = 25
+
+        def __init__(
+            self,
+            *,
+            asset_root: Path,
+            template_video: Path,
+            device: str,
+            hubert_device: str | None,
+            model_backend: str,
+            **_: object,
+        ) -> None:
+            captured["asset_root"] = asset_root
+            captured["template_video"] = template_video
+            captured["device"] = device
+            captured["hubert_device"] = hubert_device
+            captured["model_backend"] = model_backend
+
+        def make_state(self) -> object:
+            return object()
+
+    fake_runtime = types.ModuleType("opentalking.models.quicktalk.runtime")
+    fake_runtime.RealtimeV3Worker = FakeWorker
+    monkeypatch.setitem(sys.modules, "opentalking.models.quicktalk.runtime", fake_runtime)
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT", raising=False)
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT", raising=False)
+    monkeypatch.delenv("OMNIRT_QUICKTALK_MODEL_ROOT", raising=False)
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", "")
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", "")
+
+    from opentalking.core.config import get_settings
+
+    get_settings.cache_clear()
+    monkeypatch.setattr(
+        "opentalking.core.config.get_settings",
+        lambda: types.SimpleNamespace(
+            quicktalk_asset_root=str(asset_root),
+            quicktalk_model_root="",
+            quicktalk_device="mps",
+            quicktalk_hubert_device="cpu",
+            quicktalk_model_backend="onnx",
+            torch_device="",
+            device="",
+        ),
+    )
+
+    try:
+        adapter = QuickTalkAdapter()
+        adapter.load_avatar(str(avatar_dir))
+    finally:
+        get_settings.cache_clear()
+
+    assert captured["asset_root"] == asset_root.resolve()
+    assert captured["template_video"] == template.resolve()
+    assert captured["device"] == "mps"
+    assert captured["hubert_device"] == "cpu"
+    assert captured["model_backend"] == "onnx"
+
+
 def test_quicktalk_adapter_accepts_avatar_with_quicktalk_metadata(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
@@ -556,6 +689,94 @@ def test_quicktalk_adapter_warmup_runs_silence_and_restores_stream_state() -> No
    assert state.session_state == {"existing": True}


+def test_quicktalk_adapter_can_downsample_generated_frames_for_mac(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_FPS", "14")
+    adapter = QuickTalkAdapter()
+    generated_rep_ids: list[int] = []
+
+    class FakeWorker:
+        fps = 25
+
+        def prepare_pcm_features(self, pcm, sample_rate):
+            return [np.full((1, 1), i, dtype=np.float32) for i in range(21)], 0.1
+
+        def generate_frames_from_reps(self, reps, state=None):
+            del state
+            for rep in reps:
+                generated_rep_ids.append(int(rep[0, 0]))
+                yield np.zeros((4, 4, 3), dtype=np.uint8)
+
+    state = types.SimpleNamespace(
+        worker=FakeWorker(),
+        fps=25,
+        frame_index=0,
+        session_state=None,
+    )
+
+    features, frames = adapter.render_audio_chunk(
+        state,  # type: ignore[arg-type]
+        AudioChunk(
+            data=np.zeros(13714, dtype=np.int16),
+            sample_rate=16000,
+            duration_ms=857.125,
+        ),
+    )
+
+    assert len(features.reps) == 21
+    assert len(frames) == 12
+    assert state.frame_index == 12
+    assert generated_rep_ids == [0, 2, 4, 5, 7, 9, 11, 13, 15, 16, 18, 20]
+
+
+def test_quicktalk_adapter_downsamples_through_live_render_pipeline(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from opentalking.pipeline.speak.render_pipeline import render_audio_chunk_sync
+
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_FPS", "14")
+    adapter = QuickTalkAdapter()
+    generated_rep_ids: list[int] = []
+
+    class FakeWorker:
+        fps = 25
+
+        def prepare_pcm_features(self, pcm, sample_rate):
+            return [np.full((1, 1), i, dtype=np.float32) for i in range(21)], 0.1
+
+        def generate_frames_from_reps(self, reps, state=None):
+            del state
+            for rep in reps:
+                generated_rep_ids.append(int(rep[0, 0]))
+                yield np.zeros((4, 4, 3), dtype=np.uint8)
+
+    state = types.SimpleNamespace(
+        worker=FakeWorker(),
+        fps=25,
+        frame_index=0,
+        extra={},
+        session_state=None,
+    )
+
+    next_frame_idx, frames = render_audio_chunk_sync(
+        adapter,
+        state,
+        AudioChunk(
+            data=np.zeros(13714, dtype=np.int16),
+            sample_rate=16000,
+            duration_ms=857.125,
+        ),
+        frame_index_start=0,
+        speech_frame_index_start=0,
+    )
+
+    assert next_frame_idx == 12
+    assert len(frames) == 12
+    assert frames[1].timestamp_ms == pytest.approx(1000.0 / 14.0)
+    assert generated_rep_ids == [0, 2, 4, 5, 7, 9, 11, 13, 15, 16, 18, 20]
+
+
 def test_quicktalk_adapter_evicts_old_worker_cache_entries(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
--- a/tests/unit/test_quicktalk_paths.py
+++ b/tests/unit/test_quicktalk_paths.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
+
+
+def test_resolve_quicktalk_asset_root_uses_one_public_root_before_legacy(
+    tmp_path,
+    monkeypatch,
+) -> None:
+    settings_asset_root = tmp_path / "settings-assets"
+    env_asset_root = tmp_path / "env-assets"
+    legacy_settings_root = tmp_path / "settings-legacy"
+    legacy_env_root = tmp_path / "env-legacy"
+    omnirt_root = tmp_path / "omnirt-legacy"
+    shared_omnirt_root = tmp_path / "shared-omnirt"
+
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", str(env_asset_root))
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", str(legacy_env_root))
+    monkeypatch.setenv("OMNIRT_QUICKTALK_MODEL_ROOT", str(omnirt_root))
+    monkeypatch.setenv("OMNIRT_MODEL_ROOT", str(shared_omnirt_root))
+    settings = SimpleNamespace(
+        quicktalk_asset_root=str(settings_asset_root),
+        quicktalk_model_root=str(legacy_settings_root),
+        models_dir=str(tmp_path / "repo-models"),
+    )
+
+    assert resolve_quicktalk_asset_root(settings) == settings_asset_root.resolve()
+
+    settings.quicktalk_asset_root = ""
+    assert resolve_quicktalk_asset_root(settings) == env_asset_root.resolve()
+
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT")
+    assert resolve_quicktalk_asset_root(settings) == legacy_settings_root.resolve()
+
+    settings.quicktalk_model_root = ""
+    assert resolve_quicktalk_asset_root(settings) == legacy_env_root.resolve()
+
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT")
+    assert resolve_quicktalk_asset_root(settings) == omnirt_root.resolve()
+
+    monkeypatch.delenv("OMNIRT_QUICKTALK_MODEL_ROOT")
+    assert resolve_quicktalk_asset_root(settings) == (
+        shared_omnirt_root / "quicktalk"
+    ).resolve()
+
+    monkeypatch.delenv("OMNIRT_MODEL_ROOT")
+    assert resolve_quicktalk_asset_root(settings) == (
+        tmp_path / "repo-models" / "quicktalk"
+    ).resolve()
+
+
+def test_resolve_quicktalk_asset_root_can_skip_default_fallback(tmp_path, monkeypatch) -> None:
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT", raising=False)
+    monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT", raising=False)
+    monkeypatch.delenv("OMNIRT_QUICKTALK_MODEL_ROOT", raising=False)
+    monkeypatch.delenv("OMNIRT_MODEL_ROOT", raising=False)
+
+    settings = SimpleNamespace(models_dir=str(tmp_path / "repo-models"))
+
+    assert resolve_quicktalk_asset_root(settings, include_default=False) is None
+
+
+def test_resolve_quicktalk_asset_root_warns_on_conflicting_explicit_roots(
+    tmp_path,
+    monkeypatch,
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_ASSET_ROOT", str(tmp_path / "env-assets"))
+    monkeypatch.setenv("OPENTALKING_QUICKTALK_MODEL_ROOT", str(tmp_path / "legacy-env"))
+    settings = SimpleNamespace(
+        quicktalk_asset_root=str(tmp_path / "settings-assets"),
+        quicktalk_model_root="",
+        models_dir=str(tmp_path / "repo-models"),
+    )
+
+    assert resolve_quicktalk_asset_root(settings) == (tmp_path / "settings-assets").resolve()
+    assert "conflicting QuickTalk asset roots" in caplog.text