Add optional avatar background removal

Support immersive mode in 视频创作
2026-07-03 23:56:46 +08:00 · 2026-06-23 17:29:16 +08:00
parent d3083e799d
commit 8087110620
29 changed files with 1738 additions and 57 deletions
--- a/.env.example
+++ b/.env.example
@@ -16,6 +16,15 @@ VITE_BACKEND_PORT=8000
 # 头像资产与生成结果目录 (avatar assets / exports)。
 OPENTALKING_AVATARS_DIR=./examples/avatars
 OPENTALKING_EXPORTS_DIR=./data/exports
+# 自定义形象上传时的可选抠图 provider；仅在上传弹窗勾选“上传时抠除背景”后调用。
+# 启用本地 rembg 前先安装：uv pip install --python .venv/bin/python '.[avatar-matting]'
+OPENTALKING_AVATAR_MATTING_PROVIDER=rembg
+OPENTALKING_AVATAR_MATTING_DEVICE=cpu
+# rembg provider 需要预先下载 u2net.onnx，并显式填写模型文件路径。
+# 下载：https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx
+# MD5：60024c5c889badc19c04ad937298a77b
+OPENTALKING_AVATAR_MATTING_MODEL_PATH=
+OPENTALKING_AVATAR_MATTING_TIMEOUT_SEC=60

 # WebUI 默认展示的数字人模型；CLI --model 会覆盖该值。
 OPENTALKING_DEFAULT_MODEL=mock
--- a/apps/api/routes/avatars.py
+++ b/apps/api/routes/avatars.py
@@ -21,6 +21,7 @@ from PIL import Image

 from opentalking.avatar import mouth_metadata
 from opentalking.avatar.loader import load_avatar_bundle
+from opentalking.avatar.matting import MattingError, image_has_transparency, remove_avatar_background
 from opentalking.avatar.validator import list_avatar_dirs
 from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
 from opentalking.models.registry import get_adapter
@@ -247,13 +248,7 @@ def _resize_uploaded_avatar_image(image: Image.Image, *, max_width: int, max_hei


 def _avatar_image_has_alpha(image: Image.Image) -> bool:
-    if "A" not in image.getbands():
-        return False
-    alpha = image.getchannel("A")
-    low, high = alpha.getextrema()
-    if not isinstance(low, int | float) or not isinstance(high, int | float):
-        return False
-    return low < 255 or high < 255
+    return image_has_transparency(image)


 def _update_manifest_matting_status(manifest_path: Path, image: Image.Image) -> None:
@@ -264,6 +259,21 @@ def _update_manifest_matting_status(manifest_path: Path, image: Image.Image) ->
    manifest_path.write_text(json.dumps(raw, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")


+def _update_manifest_matting_source(
+    manifest_path: Path,
+    *,
+    provider_name: str,
+    original_source_image: str,
+) -> None:
+    raw = _read_manifest(manifest_path)
+    metadata = dict(raw.get("metadata") or {})
+    metadata["matting_provider"] = provider_name
+    metadata["matting_source"] = "upload_auto"
+    metadata["original_source_image"] = original_source_image
+    raw["metadata"] = metadata
+    _write_manifest(manifest_path, raw)
+
+
 def _update_manifest_dimensions(manifest_path: Path, image: Image.Image) -> None:
    raw = json.loads(manifest_path.read_text(encoding="utf-8"))
    raw["width"] = int(image.width)
@@ -920,6 +930,7 @@ async def create_custom_avatar(
    base_avatar_id: str = Form(...),
    name: str = Form(...),
    model: str | None = Form(default=None),
+    remove_background: bool = Form(default=False),
    image: UploadFile | None = File(default=None),
    video: UploadFile | None = File(default=None),
 ) -> AvatarSummary:
@@ -965,12 +976,28 @@ async def create_custom_avatar(
        )
        max_w, max_h = _custom_avatar_max_size()
        fitted_image = _resize_uploaded_avatar_image(image_rgb, max_width=max_w, max_height=max_h)
+        source_dir = target_dir / "source"
+        source_dir.mkdir(parents=True, exist_ok=True)
+        if remove_background and video_body is None:
+            original_image = fitted_image.copy()
+            try:
+                fitted_image, matting_provider = remove_avatar_background(
+                    fitted_image,
+                    provider_name=str(getattr(request.app.state.settings, "avatar_matting_provider", "rembg")),
+                    settings=request.app.state.settings,
+                )
+            except MattingError as exc:
+                raise HTTPException(status_code=400, detail=str(exc)) from exc
+            original_image.save(source_dir / "original.png", format="PNG")
+            _update_manifest_matting_source(
+                target_dir / "manifest.json",
+                provider_name=matting_provider,
+                original_source_image="source/original.png",
+            )
        _update_manifest_dimensions(target_dir / "manifest.json", fitted_image)
        _update_manifest_matting_status(target_dir / "manifest.json", fitted_image)
        fitted_image.save(target_dir / "preview.png", format="PNG")
        fitted_image.save(target_dir / "reference.png", format="PNG")
-        source_dir = target_dir / "source"
-        source_dir.mkdir(parents=True, exist_ok=True)
        fitted_image.save(source_dir / "source.png", format="PNG")
        if video_body is not None:
            video_name = f"source_video{video_suffix}"
@@ -1000,6 +1027,9 @@ async def create_custom_avatar(
            metadata["frame_dir"] = "frames"
            raw["metadata"] = metadata
            _write_manifest(target_dir / "manifest.json", raw)
+    except HTTPException:
+        shutil.rmtree(target_dir, ignore_errors=True)
+        raise
    except Exception as exc:  # noqa: BLE001
        shutil.rmtree(target_dir, ignore_errors=True)
        raise HTTPException(status_code=500, detail=f"failed to create custom avatar: {exc}") from exc
--- a/apps/api/routes/scene_assets.py
+++ b/apps/api/routes/scene_assets.py
@@ -18,7 +18,7 @@ router = APIRouter(prefix="/scene-assets", tags=["scene-assets"])
 def _store(request: Request) -> SceneAssetStore:
    settings = request.app.state.settings
    root = Path(getattr(settings, "scene_assets_dir", "./data/scene-assets"))
-    return SceneAssetStore(root)
+    return SceneAssetStore(root, seed_defaults=True)


@router.get("/backgrounds", response_model=None)
--- a/apps/api/routes/video_creation.py
+++ b/apps/api/routes/video_creation.py
@@ -70,6 +70,18 @@ def _parse_indextts_config(tts_provider: str | None, raw: str | None, *, emotion
    return dict(config) or None


+def _parse_video_composition_config(raw: str | None) -> dict[str, object] | None:
+    if not raw:
+        return None
+    try:
+        decoded = json.loads(raw)
+    except json.JSONDecodeError as exc:
+        raise HTTPException(status_code=400, detail="composition_config must be valid JSON") from exc
+    if not isinstance(decoded, dict):
+        raise HTTPException(status_code=400, detail="composition_config must be a JSON object")
+    return decoded
+
+
 async def _save_indextts_emotion_audio(upload: UploadFile | None) -> Path | None:
    if upload is None:
        return None
@@ -97,6 +109,7 @@ async def create_video_creation_job(
    duration_sec: int | None = Form(default=None),
    fasterliveportrait_config: str | None = Form(default=None),
    indextts_config: str | None = Form(default=None),
+    composition_config: str | None = Form(default=None),
    indextts_emotion_audio_file: UploadFile | None = File(default=None),
 ) -> dict[str, Any]:
    source = audio_source.strip().lower()
@@ -104,6 +117,7 @@ async def create_video_creation_job(
        raise HTTPException(status_code=400, detail="audio_source must be upload, tts_text, voice_clone, or reference_video")
    settings = request.app.state.settings
    flp_config = _parse_fasterliveportrait_config(model, fasterliveportrait_config)
+    video_composition_config = _parse_video_composition_config(composition_config)
    emotion_audio_path = await _save_indextts_emotion_audio(indextts_emotion_audio_file)
    try:
        index_config = _parse_indextts_config(tts_provider, indextts_config, emotion_audio_path=emotion_audio_path)
@@ -134,6 +148,7 @@ async def create_video_creation_job(
                    title=title,
                    mime_type=audio_file.content_type,
                    fasterliveportrait_config=flp_config,
+                    composition_config=video_composition_config,
                )
            finally:
                upload_path.unlink(missing_ok=True)
@@ -145,6 +160,7 @@ async def create_video_creation_job(
                avatar_id=avatar_id,
                duration_sec=duration_sec,
                title=title,
+                composition_config=video_composition_config,
            )
            return _with_download_url(result)

@@ -159,6 +175,7 @@ async def create_video_creation_job(
            source=source,
            fasterliveportrait_config=flp_config,
            indextts_config=index_config,
+            composition_config=video_composition_config,
        )
        return _with_download_url(result)
    except HTTPException:
--- a/apps/api/tests/test_custom_avatars.py
+++ b/apps/api/tests/test_custom_avatars.py
@@ -185,6 +185,158 @@ def test_create_custom_avatar_preserves_uploaded_png_alpha(tmp_path, monkeypatch
        assert image.getchannel("A").getextrema()[0] == 0


+def test_create_custom_avatar_does_not_remove_background_by_default(tmp_path, monkeypatch):
+    base = tmp_path / "base-avatar"
+    base.mkdir()
+    (base / "preview.png").write_bytes(_png_bytes())
+    (base / "reference.png").write_bytes(_png_bytes())
+    (base / "manifest.json").write_text(
+        json.dumps(
+            {
+                "id": "base-avatar",
+                "name": "Base Avatar",
+                "model_type": "mock",
+                "fps": 25,
+                "sample_rate": 16000,
+                "width": 8,
+                "height": 8,
+                "version": "1.0",
+            }
+        ),
+        encoding="utf-8",
+    )
+    monkeypatch.setattr(avatars.mouth_metadata, "detect_mouth_landmarks", lambda frame: None)
+
+    def fail_remove_background(*args, **kwargs):  # noqa: ANN002, ANN003
+        raise AssertionError("matting provider should not run unless requested")
+
+    monkeypatch.setattr(avatars, "remove_avatar_background", fail_remove_background)
+
+    app = FastAPI()
+    app.state.settings = SimpleNamespace(avatars_dir=str(tmp_path))
+    app.include_router(avatars.router)
+    client = TestClient(app)
+
+    response = client.post(
+        "/avatars/custom",
+        data={"base_avatar_id": "base-avatar", "name": "普通形象"},
+        files={"image": ("avatar.png", _png_bytes(), "image/png")},
+    )
+
+    assert response.status_code == 200
+    created = response.json()
+    custom_dir = tmp_path / created["id"]
+    manifest = json.loads((custom_dir / "manifest.json").read_text(encoding="utf-8"))
+    assert created["matting_status"] == "opaque"
+    assert manifest["metadata"]["matting_status"] == "opaque"
+    assert "matting_provider" not in manifest["metadata"]
+    assert not (custom_dir / "source" / "original.png").exists()
+
+
+def test_create_custom_avatar_removes_background_when_requested(tmp_path, monkeypatch):
+    base = tmp_path / "base-avatar"
+    base.mkdir()
+    (base / "preview.png").write_bytes(_png_bytes())
+    (base / "reference.png").write_bytes(_png_bytes())
+    (base / "manifest.json").write_text(
+        json.dumps(
+            {
+                "id": "base-avatar",
+                "name": "Base Avatar",
+                "model_type": "mock",
+                "fps": 25,
+                "sample_rate": 16000,
+                "width": 8,
+                "height": 8,
+                "version": "1.0",
+            }
+        ),
+        encoding="utf-8",
+    )
+    monkeypatch.setattr(avatars.mouth_metadata, "detect_mouth_landmarks", lambda frame: None)
+
+    calls: list[str] = []
+
+    def fake_remove_background(image, *, provider_name, settings):
+        calls.append(provider_name)
+        result = image.convert("RGBA")
+        result.putpixel((0, 0), (*result.getpixel((0, 0))[:3], 0))
+        return result, "fake-provider"
+
+    monkeypatch.setattr(avatars, "remove_avatar_background", fake_remove_background)
+
+    app = FastAPI()
+    app.state.settings = SimpleNamespace(
+        avatars_dir=str(tmp_path),
+        avatar_matting_provider="configured-provider",
+        avatar_matting_device="cpu",
+        avatar_matting_timeout_sec=30,
+    )
+    app.include_router(avatars.router)
+    client = TestClient(app)
+
+    response = client.post(
+        "/avatars/custom",
+        data={"base_avatar_id": "base-avatar", "name": "抠图形象", "remove_background": "true"},
+        files={"image": ("avatar.png", _png_bytes(), "image/png")},
+    )
+
+    assert response.status_code == 200
+    assert calls == ["configured-provider"]
+    created = response.json()
+    custom_dir = tmp_path / created["id"]
+    manifest = json.loads((custom_dir / "manifest.json").read_text(encoding="utf-8"))
+    assert created["matting_status"] == "transparent_ready"
+    assert manifest["metadata"]["matting_status"] == "transparent_ready"
+    assert manifest["metadata"]["matting_provider"] == "fake-provider"
+    assert manifest["metadata"]["matting_source"] == "upload_auto"
+    assert manifest["metadata"]["original_source_image"] == "source/original.png"
+    assert (custom_dir / "source" / "original.png").is_file()
+    assert Image.open(custom_dir / "reference.png").getchannel("A").getextrema()[0] == 0
+
+
+def test_create_custom_avatar_reports_missing_matting_model(tmp_path, monkeypatch):
+    base = tmp_path / "base-avatar"
+    base.mkdir()
+    (base / "preview.png").write_bytes(_png_bytes())
+    (base / "reference.png").write_bytes(_png_bytes())
+    (base / "manifest.json").write_text(
+        json.dumps(
+            {
+                "id": "base-avatar",
+                "name": "Base Avatar",
+                "model_type": "mock",
+                "fps": 25,
+                "sample_rate": 16000,
+                "width": 8,
+                "height": 8,
+                "version": "1.0",
+            }
+        ),
+        encoding="utf-8",
+    )
+    monkeypatch.setattr(avatars.mouth_metadata, "detect_mouth_landmarks", lambda frame: None)
+
+    def fail_missing_model(*args, **kwargs):  # noqa: ANN002, ANN003
+        raise avatars.MattingError("未找到抠除背景模型 u2net.onnx。\n下载地址：https://example.test/u2net.onnx")
+
+    monkeypatch.setattr(avatars, "remove_avatar_background", fail_missing_model)
+
+    app = FastAPI()
+    app.state.settings = SimpleNamespace(avatars_dir=str(tmp_path))
+    app.include_router(avatars.router)
+
+    response = TestClient(app).post(
+        "/avatars/custom",
+        data={"base_avatar_id": "base-avatar", "name": "缺模型形象", "remove_background": "true"},
+        files={"image": ("avatar.png", _png_bytes(), "image/png")},
+    )
+
+    assert response.status_code == 400
+    assert "未找到抠除背景模型" in response.json()["detail"]
+    assert not any(path.name.startswith("custom-") for path in tmp_path.iterdir() if path.is_dir())
+
+
 def test_quicktalk_model_root_falls_back_to_omnirt_model_root(tmp_path, monkeypatch):
    monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT", raising=False)
    monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT", raising=False)
--- a/apps/api/tests/test_scene_assets.py
+++ b/apps/api/tests/test_scene_assets.py
@@ -65,6 +65,21 @@ def test_scene_asset_store_rejects_spoofed_background_content(tmp_path: Path) ->
        )


+def test_scene_asset_store_seeds_default_backgrounds_once(tmp_path: Path) -> None:
+    store = SceneAssetStore(tmp_path, seed_defaults=True)
+
+    backgrounds = store.list_backgrounds()
+
+    assert backgrounds[0]["id"] == "bg-default-data-wall"
+    assert backgrounds[0]["name"] == "数据玻璃幕墙"
+    assert backgrounds[0]["kind"] == "image"
+    assert backgrounds[0]["mime_type"] == "image/jpeg"
+    assert store.background_file_path("bg-default-data-wall").is_file()
+
+    assert store.delete_background("bg-default-data-wall") is True
+    assert store.list_backgrounds() == []
+
+
 def test_scene_asset_store_rejects_zero_avatar_scale(tmp_path: Path) -> None:
    store = SceneAssetStore(tmp_path)

@@ -196,6 +211,20 @@ def test_scene_asset_api_uploads_lists_downloads_and_deletes_background(tmp_path
        assert deleted.json()["deleted"] is True


+def test_scene_asset_api_lists_default_backgrounds_on_fresh_workspace(tmp_path: Path) -> None:
+    with _client(tmp_path) as client:
+        listed = client.get("/scene-assets/backgrounds")
+
+        assert listed.status_code == 200
+        items = listed.json()["items"]
+        assert items[0]["id"] == "bg-default-data-wall"
+        assert items[0]["name"] == "数据玻璃幕墙"
+
+        downloaded = client.get(items[0]["url"])
+        assert downloaded.status_code == 200
+        assert downloaded.headers["content-type"].startswith("image/jpeg")
+
+
 def test_scene_asset_api_rejects_oversized_background_upload(tmp_path: Path) -> None:
    with _client(tmp_path) as client:
        upload = client.post(
--- a/apps/api/tests/test_video_creation.py
+++ b/apps/api/tests/test_video_creation.py
@@ -154,6 +154,81 @@ def test_video_creation_audio_upload_returns_export_video(tmp_path: Path, monkey
    assert payload["export_video"]["download_url"].startswith("/exports/videos/")


+def test_video_creation_route_passes_composition_config(tmp_path: Path, monkeypatch) -> None:
+    client, creators = _client(tmp_path, monkeypatch)
+    composition = {
+        "scene_composition_id": "scene-anchor-news",
+        "background_id": "bg-newsroom",
+        "background_color": "#ffffff",
+        "avatar_fit": "contain",
+        "avatar_anchor": "center",
+        "avatar_scale": 1.25,
+        "avatar_offset_x": 96,
+        "avatar_offset_y": -32,
+    }
+    with client:
+        response = client.post(
+            "/video-creation/jobs",
+            data={
+                "model": "wav2lip",
+                "avatar_id": "anchor",
+                "audio_source": "upload",
+                "title": "Composed take",
+                "composition_config": json.dumps(composition),
+            },
+            files={"audio_file": ("speech.wav", b"RIFFaudio", "audio/wav")},
+        )
+
+    assert response.status_code == 200, response.text
+    assert creators[0].calls[0][1]["composition_config"] == composition
+
+
+def test_video_creation_route_rejects_invalid_composition_config(tmp_path: Path, monkeypatch) -> None:
+    client, _creators = _client(tmp_path, monkeypatch)
+    with client:
+        response = client.post(
+            "/video-creation/jobs",
+            data={
+                "model": "wav2lip",
+                "avatar_id": "anchor",
+                "audio_source": "upload",
+                "title": "Broken composition",
+                "composition_config": "{",
+            },
+            files={"audio_file": ("speech.wav", b"RIFFaudio", "audio/wav")},
+        )
+
+    assert response.status_code == 400
+    assert response.json()["detail"] == "composition_config must be valid JSON"
+
+
+def test_write_video_only_preserves_bgr_frames_for_opencv_writer(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    from opentalking import video_creation as video_creation_module
+
+    captured: list[np.ndarray] = []
+
+    class FakeWriter:
+        def isOpened(self) -> bool:
+            return True
+
+        def write(self, frame: np.ndarray) -> None:
+            captured.append(np.asarray(frame).copy())
+
+        def release(self) -> None:
+            return None
+
+    monkeypatch.setattr(video_creation_module.cv2, "VideoWriter_fourcc", lambda *_args: 0)
+    monkeypatch.setattr(video_creation_module.cv2, "VideoWriter", lambda *_args, **_kwargs: FakeWriter())
+
+    bgr = np.zeros((2, 2, 3), dtype=np.uint8)
+    bgr[:, :] = [200, 20, 10]
+
+    video_creation_module._write_video_only(tmp_path / "out.mp4", [bgr], 25)
+
+    assert captured
+    assert captured[0][0, 0].tolist() == [200, 20, 10]
+
+
 def test_video_creation_quicktalk_default_backend_is_omnirt(monkeypatch: pytest.MonkeyPatch) -> None:
    from opentalking.core.model_config import clear_model_config_cache
    from opentalking.providers.synthesis.backends import resolve_model_backend
@@ -893,6 +968,133 @@ async def test_video_creation_service_renders_quicktalk_via_omnirt(
    assert result["export_video"]["model"] == "quicktalk"


+@pytest.mark.asyncio
+async def test_video_creation_service_composites_generated_frames_over_scene_background(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from opentalking import video_creation as video_creation_module
+    from opentalking.scene_assets import SceneAssetStore
+    from PIL import Image
+    import io
+
+    avatars = tmp_path / "avatars"
+    exports = tmp_path / "exports"
+    scene_assets = tmp_path / "scene-assets"
+    _write_avatar(avatars)
+    transparent_reference = Image.new("RGBA", (4, 4), (255, 0, 0, 0))
+    transparent_reference.save(avatars / "anchor" / "reference.png")
+    uploaded = tmp_path / "speech.wav"
+    uploaded.write_bytes(b"RIFFaudio")
+
+    buffer = io.BytesIO()
+    Image.new("RGB", (4, 4), (10, 20, 200)).save(buffer, format="PNG")
+    background = SceneAssetStore(scene_assets).create_background(
+        content=buffer.getvalue(),
+        filename="blue.png",
+        mime_type="image/png",
+        name="Blue",
+    )
+
+    captured_frames: list[np.ndarray] = []
+
+    class FakeWSClient:
+        def __init__(self, ws_url: str, *, extra_headers: dict[str, str] | None = None) -> None:
+            self.ws_url = ws_url
+            self.extra_headers = extra_headers or {}
+
+    class FakeOmniRTClient:
+        def __init__(self, _ws_client: FakeWSClient) -> None:
+            self.fps = 25
+            self.audio_chunk_samples = 4
+
+        async def init_session(self, **_kwargs: object) -> dict[str, object]:
+            return {"type": "init_ok"}
+
+        async def prewarm(self) -> dict[str, object]:
+            return {"type": "prewarm_skipped"}
+
+        async def generate(self, _audio_pcm: np.ndarray) -> list[VideoFrameData]:
+            red = np.zeros((4, 4, 3), dtype=np.uint8)
+            red[:, :, 0] = 255
+            return [VideoFrameData(data=red, width=4, height=4, timestamp_ms=0.0)]
+
+        async def close(self, send_close_msg: bool = True) -> None:
+            return None
+
+    async def fake_decode(_path: Path) -> np.ndarray:
+        return np.arange(4, dtype=np.int16)
+
+    async def fake_mux(_ffmpeg_bin: str, _video_in: Path, _audio_in: Path, out_mp4: Path) -> None:
+        out_mp4.write_bytes(b"mp4")
+
+    def fake_write_video_only(path: Path, frames: list[np.ndarray], _fps: float) -> None:
+        captured_frames.extend(np.asarray(frame).copy() for frame in frames)
+        path.write_bytes(b"video")
+
+    def fake_create_video_export(root: Path, **kwargs: object) -> dict[str, object]:
+        return {
+            "id": "export-composed",
+            "kind": "video_creation",
+            "title": kwargs["title"],
+            "duration_sec": kwargs["duration_sec"],
+            "size_bytes": len(kwargs["content"]),
+            "mime_type": "video/mp4",
+            "created_at": "2026-06-04T00:00:00Z",
+            "path": str(root / "export-composed.mp4"),
+            "session_id": kwargs["session_id"],
+            "avatar_id": kwargs["avatar_id"],
+            "model": kwargs["model"],
+        }
+
+    monkeypatch.setattr(video_creation_module, "FlashTalkWSClient", FakeWSClient, raising=False)
+    monkeypatch.setattr(video_creation_module, "OmniRTAudio2VideoClient", FakeOmniRTClient, raising=False)
+    monkeypatch.setattr(video_creation_module, "decode_audio_file_to_pcm_i16", fake_decode)
+    monkeypatch.setattr(video_creation_module, "_write_video_only", fake_write_video_only)
+    monkeypatch.setattr(video_creation_module, "_ffmpeg_mux", fake_mux)
+    monkeypatch.setattr(
+        video_creation_module,
+        "resolve_model_backend",
+        lambda model, _settings: SimpleNamespace(model=model, backend="omnirt", ws_url=""),
+    )
+    monkeypatch.setattr(video_creation_module, "create_video_export", fake_create_video_export)
+
+    service = VideoCreationService(
+        SimpleNamespace(
+            avatars_dir=str(avatars),
+            exports_dir=str(exports),
+            scene_assets_dir=str(scene_assets),
+            export_max_bytes=1024 * 1024,
+            ffmpeg_bin="ffmpeg",
+            omnirt_endpoint="http://127.0.0.1:9000",
+            omnirt_audio2video_path_template="/v1/audio2video/{model}",
+            omnirt_api_key="",
+        )
+    )
+
+    result = await service.create_from_audio_file(
+        model="wav2lip",
+        avatar_id="anchor",
+        upload_path=uploaded,
+        title="Composed take",
+        composition_config={
+            "background_id": background["id"],
+            "avatar_fit": "contain",
+            "avatar_anchor": "center",
+            "avatar_scale": 1.0,
+            "avatar_offset_x": 0,
+            "avatar_offset_y": 0,
+            "output_width": 320,
+            "output_height": 180,
+        },
+    )
+
+    assert result["export_video"]["model"] == "wav2lip"
+    assert captured_frames
+    assert captured_frames[0].shape == (180, 320, 3)
+    assert captured_frames[0][0, 0].tolist() == [200, 20, 10]
+
+
@pytest.mark.asyncio
 async def test_video_creation_service_renders_musetalk_via_omnirt(
    tmp_path: Path,
--- a/apps/web/src/App.tsx
+++ b/apps/web/src/App.tsx
@@ -1001,6 +1001,7 @@ export default function App() {
    }
  });
  const [toasts, setToasts] = useState<ToastMessage[]>([]);
+  const toastTimersRef = useRef<Map<string, ReturnType<typeof window.setTimeout>>>(new Map());
  const [recordingSaving, setRecordingSaving] = useState(false);
  const [ftRecordPhase, setFtRecordPhase] = useState<"idle" | "recording" | "stopped">("idle");
  const [ftRecordBusy, setFtRecordBusy] = useState(false);
@@ -1098,17 +1099,41 @@ export default function App() {
  );

  const dismissToast = useCallback((id: string) => {
+    const timer = toastTimersRef.current.get(id);
+    if (timer) window.clearTimeout(timer);
+    toastTimersRef.current.delete(id);
    setToasts((prev) => prev.filter((toast) => toast.id !== id));
  }, []);

  const notify = useCallback((message: string, tone: ToastTone = "info") => {
    const id = makeToastId();
    setToasts((prev) => [...prev.slice(-2), { id, tone, message }]);
-    window.setTimeout(() => {
-      setToasts((prev) => prev.filter((toast) => toast.id !== id));
-    }, tone === "error" ? 5200 : 3600);
+    if (tone !== "error") {
+      const timer = window.setTimeout(() => {
+        toastTimersRef.current.delete(id);
+        setToasts((prev) => prev.filter((toast) => toast.id !== id));
+      }, 3600);
+      toastTimersRef.current.set(id, timer);
+    }
  }, []);

+  const pauseToast = useCallback((id: string) => {
+    const timer = toastTimersRef.current.get(id);
+    if (!timer) return;
+    window.clearTimeout(timer);
+    toastTimersRef.current.delete(id);
+  }, []);
+
+  const resumeToast = useCallback((id: string) => {
+    const toast = toasts.find((item) => item.id === id);
+    if (!toast || toast.tone === "error" || toastTimersRef.current.has(id)) return;
+    const timer = window.setTimeout(() => {
+      toastTimersRef.current.delete(id);
+      setToasts((prev) => prev.filter((item) => item.id !== id));
+    }, 1800);
+    toastTimersRef.current.set(id, timer);
+  }, [toasts]);
+
  const syncRuntimeConfigSelection = useCallback((next: RuntimeConfigResponse) => {
    const nextAsrProvider = normalizeAsrProvider(next.stt.provider, "dashscope");
    setAsrProvider(nextAsrProvider);
@@ -2328,7 +2353,32 @@ export default function App() {
    }
  }, [connection, fasterliveportraitConfig, model, notify]);

-  const handleCreateCustomAvatar = useCallback(async (file: File, name: string) => {
+  const handleSavePrompt = useCallback(async () => {
+    setPromptSaving(true);
+    try {
+      await apiPost("/sessions/customize/prompt", {
+        avatar_id: avatarId,
+        llm_system_prompt: llmSystemPrompt,
+      });
+      const sid = sessionIdRef.current;
+      if (sid) await releaseSession(sid);
+      resetLiveState(true);
+      setConnection("idle");
+      notify("System Prompt 已保存，页面即将刷新并在新会话生效。", "success");
+      window.setTimeout(() => window.location.reload(), 900);
+    } catch (e) {
+      console.warn("save prompt failed", e);
+      notify("保存 Prompt 失败，请查看后端日志。", "error");
+    } finally {
+      setPromptSaving(false);
+    }
+  }, [avatarId, llmSystemPrompt, notify, releaseSession, resetLiveState]);
+
+  const handleCreateCustomAvatar = useCallback(async (
+    file: File,
+    name: string,
+    options?: { removeBackground?: boolean },
+  ) => {
    const trimmedName = name.trim();
    if (!trimmedName) {
      notify("请先给形象起个名字。", "info");
@@ -2346,6 +2396,7 @@ export default function App() {
      fd.set("name", trimmedName);
      fd.set("model", model);
      fd.set("image", file);
+      fd.set("remove_background", options?.removeBackground ? "true" : "false");
      const created = await apiPostForm<AvatarSummary>("/avatars/custom", fd);
      setAvatars((prev) => {
        const filtered = prev.filter((avatar) => avatar.id !== created.id);
@@ -2358,9 +2409,12 @@ export default function App() {
      resetLiveState(true);
      setConnection("idle");
      notify(`自定义形象「${created.name ?? trimmedName}」已加入形象库。`, "success");
+      return created;
    } catch (e) {
      console.warn("create custom avatar failed", e);
-      notify("创建自定义形象失败，请查看后端日志。", "error");
+      const detail = e instanceof ApiError ? e.detail : null;
+      notify(detail ? `创建失败：${detail}` : "创建自定义形象失败，请查看后端日志。", "error");
+      return null;
    } finally {
      setReferenceSaving(false);
    }
@@ -2901,6 +2955,9 @@ export default function App() {
          <VideoCreationWorkspace
            avatars={avatars}
            avatarId={avatarId}
+            sceneBackgrounds={sceneBackgrounds}
+            sceneCompositions={sceneCompositions}
+            selectedSceneIdsByAvatar={selectedSceneIdsByAvatar}
            models={models}
            onAvatarChange={handleAvatarChange}
            onAvatarUploaded={handleVideoCloneAvatarUploaded}
@@ -3170,7 +3227,7 @@ export default function App() {
                    onPersonaImport={handlePersonaImport}
                    onAvatarChange={handleAvatarChange}
                    onStart={() => void handleStart()}
-                    onCustomAvatarCreate={(file, name) => void handleCreateCustomAvatar(file, name)}
+                    onCustomAvatarCreate={(file, name, options) => handleCreateCustomAvatar(file, name, options)}
                    onAvatarDelete={(target) => void handleDeleteAvatar(target)}
                    referenceSaving={referenceSaving}
                  />
@@ -3304,7 +3361,7 @@ export default function App() {
        </aside>
      </div>
      )}
-      <ToastStack toasts={toasts} onDismiss={dismissToast} />
+      <ToastStack toasts={toasts} onDismiss={dismissToast} onPause={pauseToast} onResume={resumeToast} />
    </div>
  );
 }
--- a/apps/web/src/components/AssetLibraryWorkspace.tsx
+++ b/apps/web/src/components/AssetLibraryWorkspace.tsx
@@ -1040,6 +1040,10 @@ export function AssetLibraryWorkspace({
  );

  const avatarById = useMemo(() => new Map((avatars ?? []).map((avatar) => [avatar.id, avatar])), [avatars]);
+  const backgroundById = useMemo(
+    () => new Map(sceneBackgrounds.map((background) => [background.id, background])),
+    [sceneBackgrounds],
+  );
  const sceneGroups = useMemo(() => {
    const avatarGroups = (avatars ?? [])
      .map((avatar) => ({
@@ -1182,6 +1186,9 @@ export function AssetLibraryWorkspace({
                {scenes.map((scene) => {
                  const selected = selectedSceneIdsByAvatar[scene.avatar_id] === scene.id;
                  const sceneAvatar = avatarById.get(scene.avatar_id);
+                  const sceneBackground = scene.background_id
+                    ? backgroundById.get(scene.background_id)
+                    : null;
                  return (
                    <article
                      key={scene.id}
@@ -1192,8 +1199,12 @@ export function AssetLibraryWorkspace({
                      }`}
                    >
                      <p className="truncate text-sm font-semibold text-slate-950">{scene.name}</p>
-                      <p className="mt-1 truncate text-xs text-slate-500">Avatar {sceneAvatar?.name ?? scene.avatar_id}</p>
-                      <p className="mt-1 truncate text-xs text-slate-500">Background {scene.background_id ?? scene.background_color}</p>
+                      <p className="mt-1 truncate text-xs text-slate-500">
+                        数字人形象：{sceneAvatar?.name ?? scene.avatar_id}
+                      </p>
+                      <p className="mt-1 truncate text-xs text-slate-500">
+                        背景：{sceneBackground?.name ?? scene.background_id ?? scene.background_color}
+                      </p>
                      <div className="mt-3 flex items-center gap-3">
                        <button
                          type="button"
--- a/apps/web/src/components/AvatarSelectionStage.tsx
+++ b/apps/web/src/components/AvatarSelectionStage.tsx
@@ -24,7 +24,11 @@ type AvatarSelectionStageProps = {
  prewarmState?: "idle" | "preparing" | "ready" | "failed";
  onAvatarChange: (id: string) => void;
  onStart: () => void;
-  onCustomAvatarCreate: (file: File, name: string) => void;
+  onCustomAvatarCreate: (
+    file: File,
+    name: string,
+    options?: { removeBackground?: boolean },
+  ) => Promise<AvatarSummary | null | void>;
  onAvatarDelete?: (avatar: AvatarSummary) => void;
  referenceSaving?: boolean;
  personas: PersonaSummary[];
@@ -94,6 +98,9 @@ export function AvatarSelectionStage({
  });
  const [customFile, setCustomFile] = useState<File | null>(null);
  const [customPreviewUrl, setCustomPreviewUrl] = useState<string | null>(null);
+  const [customRemoveBackground, setCustomRemoveBackground] = useState(false);
+  const [customUploadState, setCustomUploadState] = useState<"idle" | "processing" | "complete">("idle");
+  const [createdCustomAvatar, setCreatedCustomAvatar] = useState<AvatarSummary | null>(null);
  const selectedPersona = personas.find((persona) => persona.id === selectedPersonaId) ?? null;
  const configDisabled = loading || queued || prewarmState === "preparing";
  const baseDisabled = loading || queued || prewarmState === "preparing" || !selectedAvatar || !modelConnected;
@@ -120,7 +127,14 @@ export function AvatarSelectionStage({
    setCustomPreviewUrl(file ? URL.createObjectURL(file) : null);
  };

-  const handleCustomUpload = () => {
+  const closeCustomUpload = () => { // Closes the custom-avatar upload dialog and clears its upload result state.
+    if (referenceSaving || customUploadState === "processing") return; // Ignore close requests while the avatar is being created or its background is being removed.
+    setCustomUploadOpen(false);
+    setCustomUploadState("idle"); // Reset so a later open does not show the processing/complete panels.
+    setCreatedCustomAvatar(null);
+  };
+
+  const handleCustomUpload = async () => {
    const name = customName.trim();
    if (!customFile || !name) return;
    try {
@@ -128,8 +142,19 @@ export function AvatarSelectionStage({
    } catch {
      /* ignore */
    }
-    onCustomAvatarCreate(customFile, name);
-    setCustomUploadOpen(false);
+    setCreatedCustomAvatar(null);
+    setCustomUploadState(customRemoveBackground ? "processing" : "idle");
+    const created = await onCustomAvatarCreate(customFile, name, { removeBackground: customRemoveBackground });
+    if (created) {
+      setCreatedCustomAvatar(created);
+      if (customRemoveBackground) {
+        setCustomUploadState("complete");
+      } else {
+        setCustomUploadOpen(false);
+      }
+    } else {
+      setCustomUploadState("idle");
+    }
  };

  const handlePersonaFileChange = (event: ChangeEvent<HTMLInputElement>) => {
@@ -396,6 +421,7 @@ export function AvatarSelectionStage({
              <button
                type="button"
                onClick={() => fileInputRef.current?.click()}
+                disabled={referenceSaving}
                className="flex w-full items-center gap-3 rounded-lg border border-dashed border-cyan-300 bg-cyan-50 p-3 text-left transition hover:bg-cyan-100"
              >
                <span className="flex h-12 w-12 shrink-0 items-center justify-center overflow-hidden rounded-lg bg-white text-2xl font-light text-cyan-700">
@@ -412,23 +438,67 @@ export function AvatarSelectionStage({
                  <span className="mt-0.5 block text-xs text-slate-500">会作为新资产加入形象库</span>
                </span>
              </button>
+              <label className="flex items-center gap-2 rounded-lg border border-slate-200 bg-slate-50 px-3 py-2.5">
+                <input
+                  type="checkbox"
+                  checked={customRemoveBackground}
+                  onChange={(event) => setCustomRemoveBackground(event.target.checked)}
+                  disabled={referenceSaving}
+                  className="h-4 w-4 rounded border-slate-300 text-cyan-600 focus:ring-cyan-500"
+                />
+                <span className="text-sm font-medium text-slate-700">上传时抠除背景</span>
+              </label>
+              {customUploadState === "processing" ? (
+                <div className="rounded-lg border border-cyan-200 bg-cyan-50 px-3 py-2.5">
+                  <div className="flex items-center justify-between gap-3">
+                    <span className="text-sm font-semibold text-cyan-800">正在抠除背景...</span>
+                    <span className="h-4 w-4 animate-spin rounded-full border-2 border-cyan-200 border-t-cyan-600" />
+                  </div>
+                  <div className="mt-2 h-1.5 overflow-hidden rounded-full bg-cyan-100">
+                    <div className="h-full w-2/3 animate-pulse rounded-full bg-cyan-500" />
+                  </div>
+                  <p className="mt-2 text-xs text-cyan-700">正在识别人像边缘，首次处理可能较慢。</p>
+                </div>
+              ) : null}
+              {customUploadState === "complete" && createdCustomAvatar ? (
+                <div className="rounded-lg border border-emerald-200 bg-emerald-50 p-3">
+                  <div className="flex items-center gap-3">
+                    <span className="flex h-16 w-16 shrink-0 items-center justify-center overflow-hidden rounded-lg bg-[linear-gradient(45deg,#e2e8f0_25%,transparent_25%),linear-gradient(-45deg,#e2e8f0_25%,transparent_25%),linear-gradient(45deg,transparent_75%,#e2e8f0_75%),linear-gradient(-45deg,transparent_75%,#e2e8f0_75%)] bg-[length:16px_16px] bg-[position:0_0,0_8px,8px_-8px,-8px_0]">
+                      <img
+                        src={buildApiUrl(`/avatars/${encodeURIComponent(createdCustomAvatar.id)}/preview`)}
+                        alt={createdCustomAvatar.name ?? createdCustomAvatar.id}
+                        className="h-full w-full object-contain"
+                      />
+                    </span>
+                    <span className="min-w-0">
+                      <span className="block text-sm font-semibold text-emerald-900">抠图完成</span>
+                      <span className="mt-0.5 block truncate text-xs text-emerald-700">
+                        {createdCustomAvatar.name ?? createdCustomAvatar.id} 已加入形象库
+                      </span>
+                    </span>
+                  </div>
+                </div>
+              ) : null}
            </div>
            <div className="flex items-center justify-end gap-2 border-t border-slate-100 bg-slate-50 px-4 py-3">
              <button
                type="button"
-                onClick={() => setCustomUploadOpen(false)}
+                onClick={closeCustomUpload}
+                disabled={referenceSaving || customUploadState === "processing"}
                className="rounded-lg border border-slate-200 bg-white px-3 py-2 text-sm font-semibold text-slate-700 transition hover:border-slate-300"
              >
-                取消
-              </button>
-              <button
-                type="button"
-                onClick={handleCustomUpload}
-                disabled={referenceSaving || !customFile || !customName.trim()}
-                className="rounded-lg bg-cyan-600 px-3 py-2 text-sm font-semibold text-white transition hover:bg-cyan-500 disabled:cursor-not-allowed disabled:opacity-60"
-              >
-                {referenceSaving ? "创建中..." : "保存形象"}
+                {customUploadState === "complete" ? "完成" : "取消"}
              </button>
+              {customUploadState !== "complete" ? (
+                <button
+                  type="button"
+                  onClick={() => void handleCustomUpload()}
+                  disabled={referenceSaving || !customFile || !customName.trim()}
+                  className="rounded-lg bg-cyan-600 px-3 py-2 text-sm font-semibold text-white transition hover:bg-cyan-500 disabled:cursor-not-allowed disabled:opacity-60"
+                >
+                  {referenceSaving && customRemoveBackground ? "正在抠除背景..." : referenceSaving ? "创建中..." : "保存形象"}
+                </button>
+              ) : null}
            </div>
          </div>
        </div>
--- a/apps/web/src/components/ToastStack.tsx
+++ b/apps/web/src/components/ToastStack.tsx
@@ -21,9 +21,11 @@ const DOT_CLASSES: Record<ToastTone, string> = {
 type ToastStackProps = {
  toasts: ToastMessage[];
  onDismiss: (id: string) => void;
+  onPause: (id: string) => void;
+  onResume: (id: string) => void;
 };

-export function ToastStack({ toasts, onDismiss }: ToastStackProps) {
+export function ToastStack({ toasts, onDismiss, onPause, onResume }: ToastStackProps) {
  if (toasts.length === 0) return null;

  return (
@@ -33,9 +35,11 @@ export function ToastStack({ toasts, onDismiss }: ToastStackProps) {
          key={toast.id}
          className={`flex items-start gap-2 rounded-lg border px-3 py-2.5 text-sm shadow-lg shadow-slate-200/70 ${TONE_CLASSES[toast.tone]}`}
          role="status"
+          onMouseEnter={() => onPause(toast.id)}
+          onMouseLeave={() => onResume(toast.id)}
        >
          <span className={`mt-1.5 h-2 w-2 shrink-0 rounded-full ${DOT_CLASSES[toast.tone]}`} />
-          <p className="min-w-0 flex-1 leading-relaxed">{toast.message}</p>
+          <p className="min-w-0 flex-1 whitespace-pre-line break-words leading-relaxed">{toast.message}</p>
          <button
            type="button"
            onClick={() => onDismiss(toast.id)}
--- a/apps/web/src/components/VideoCreationWorkspace.tsx
+++ b/apps/web/src/components/VideoCreationWorkspace.tsx
@@ -10,6 +10,9 @@ import {
  type AvatarSummary,
  type ExportVideoItem,
  type IndexTTSConfig,
+  type SceneBackgroundAsset,
+  type SceneComposition,
+  type VideoCreationCompositionConfig,
  type VoiceCatalogItem,
 } from "../lib/api";
 import type { VoiceCloneApplication } from "../lib/voiceCloneApply";
@@ -19,12 +22,16 @@ import { buildTTSPreviewPayload, requestTTSPreview } from "../lib/ttsPreview";

 export type VideoCreationAudioSource = "upload" | "tts_text" | "voice_clone";
 type VideoCreationMode = "spoken_video" | "reference_video";
+type VideoCreationOutputAspect = "16:9" | "9:16" | "1:1";

 type VoiceOpt = { id: string; label: string; targetModel?: string | null };

 type VideoCreationWorkspaceProps = {
  avatars: AvatarSummary[];
  avatarId: string;
+  sceneBackgrounds: SceneBackgroundAsset[];
+  sceneCompositions: SceneComposition[];
+  selectedSceneIdsByAvatar?: Record<string, string>;
  models: string[];
  onAvatarChange: (id: string) => void;
  onAvatarUploaded: (avatar: AvatarSummary) => void;
@@ -68,6 +75,12 @@ const VIDEO_CREATION_MODEL_LABELS: Record<string, string> = {
  quicktalk: "QuickTalk",
  wav2lip: "Wav2Lip",
 };
+const VIDEO_CREATION_OUTPUT_SIZES = { // Output canvas presets keyed by aspect ratio; width/height feed the composition config, previewClassName sizes the on-page preview box.
+  "16:9": { label: "16:9", width: 1280, height: 720, previewClassName: "aspect-video w-full" },
+  "9:16": { label: "9:16", width: 720, height: 1280, previewClassName: "aspect-[9/16] w-[min(100%,22rem)]" },
+  "1:1": { label: "1:1", width: 1080, height: 1080, previewClassName: "aspect-square w-[min(100%,34rem)]" },
+} as const satisfies Record<VideoCreationOutputAspect, { label: string; width: number; height: number; previewClassName: string }>;
+const VIDEO_CREATION_OUTPUT_ASPECTS = Object.keys(VIDEO_CREATION_OUTPUT_SIZES) as VideoCreationOutputAspect[]; // Aspect keys used to render the output-aspect buttons.
 const VIDEO_CREATION_SCRIPT_MAX_CHARS = 1000;
 const FASTERLIVEPORTRAIT_ANIMATION_REGION_OPTIONS: { id: FasterLivePortraitConfig["animation_region"]; label: string }[] = [
  { id: "lip", label: "嘴部" },
@@ -245,9 +258,16 @@ function avatarNameFromFile(file: File): string {
  return stem ? `视频创作 ${stem}` : "视频创作形象";
 }

+function sceneBackgroundUrl(background: SceneBackgroundAsset): string { // Returns the background asset's url passed through buildApiUrl; used as the preview <img> src.
+  return buildApiUrl(background.url);
+}
+
 export function VideoCreationWorkspace({
  avatars,
  avatarId,
+  sceneBackgrounds,
+  sceneCompositions,
+  selectedSceneIdsByAvatar = {},
  models,
  onAvatarChange,
  onAvatarUploaded,
@@ -285,6 +305,9 @@ export function VideoCreationWorkspace({
  const [indexttsConfig, setIndexttsConfig] = useState<IndexTTSConfig>(() => freshIndexTTSConfig());
  const [indexttsEmotionAudioFile, setIndexttsEmotionAudioFile] = useState<File | null>(null);
  const [activeIndexTTSPresetLabel, setActiveIndexTTSPresetLabel] = useState<string | null>(null);
+  const [videoBackgroundId, setVideoBackgroundId] = useState<string | null>(null);
+  const [videoAvatarAdjust, setVideoAvatarAdjust] = useState({ x: 0, y: 0, scale: 1 });
+  const [videoOutputAspect, setVideoOutputAspect] = useState<VideoCreationOutputAspect>("16:9");
  const sourceUploadRef = useRef<HTMLInputElement>(null);
  const ttsPreviewAudioRef = useRef<HTMLAudioElement | null>(null);
  const ttsPreviewUrlRef = useRef<string | null>(null);
@@ -302,6 +325,59 @@ export function VideoCreationWorkspace({
  const showIndexTTSControls = !isReferenceVideoMode && audioSource !== "upload" && INDEXTTS_PROVIDER_SET.has(ttsProvider);
  const effectiveIndexTTSConfig = showIndexTTSControls ? buildIndexTTSQualityConfig(indexTTSRequestConfig(indexttsConfig)) : undefined;
  const showIndexTTSEmotionStrength = indexttsConfig.emotion_mode !== "voice";
+  const selectedScene = useMemo(() => { // Scene composition used as the starting point for the currently selected avatar.
+    if (!selectedAvatar) return null;
+    const selectedSceneId = selectedSceneIdsByAvatar[selectedAvatar.id];
+    const avatarScenes = sceneCompositions.filter((scene) => scene.avatar_id === selectedAvatar.id);
+    return avatarScenes.find((scene) => scene.id === selectedSceneId) ?? avatarScenes[0] ?? null; // Prefer the scene selected for this avatar, else its first scene, else null.
+  }, [sceneCompositions, selectedAvatar, selectedSceneIdsByAvatar]);
+  const selectedVideoBackground = useMemo(
+    () => videoBackgroundId ? sceneBackgrounds.find((background) => background.id === videoBackgroundId) ?? null : null,
+    [sceneBackgrounds, videoBackgroundId],
+  );
+  const videoAvatarAnchor = selectedScene?.avatar_anchor ?? "center";
+  const videoAvatarFit = selectedScene?.avatar_fit ?? "contain";
+  const videoAvatarBaseScale = selectedScene?.avatar_scale ?? 1;
+  const videoAvatarDisplayScale = videoAvatarBaseScale * videoAvatarAdjust.scale;
+  const selectedVideoOutputSize = VIDEO_CREATION_OUTPUT_SIZES[videoOutputAspect];
+  const videoAvatarPreviewLayer = useMemo(() => { // Avatar layer position and size for the preview, expressed as percentages of the output canvas.
+    const canvasW = selectedVideoOutputSize.width;
+    const canvasH = selectedVideoOutputSize.height;
+    const avatarW = Math.max(1, Number(selectedAvatar?.width || canvasW)); // Fall back to the canvas size when the avatar has no (or zero) width.
+    const avatarH = Math.max(1, Number(selectedAvatar?.height || canvasH)); // Same fallback for height; the clamp to 1 avoids division by zero below.
+    const containScale = Math.min(canvasW / avatarW, canvasH / avatarH);
+    const coverScale = Math.max(canvasW / avatarW, canvasH / avatarH);
+    const fitScale = videoAvatarFit === "cover" ? coverScale : containScale;
+    const layerW = Math.max(1, avatarW * fitScale * videoAvatarDisplayScale);
+    const layerH = Math.max(1, avatarH * fitScale * videoAvatarDisplayScale);
+    const originX = videoAvatarAnchor === "left"
+      ? 0
+      : videoAvatarAnchor === "right"
+        ? canvasW - layerW
+        : (canvasW - layerW) / 2; // "center" and "bottom" anchors are centred horizontally.
+    const originY = videoAvatarAnchor === "bottom" ? canvasH - layerH : (canvasH - layerH) / 2; // Only "bottom" changes vertical alignment; every other anchor is centred vertically.
+    return {
+      leftPct: ((originX + videoAvatarAdjust.x) / canvasW) * 100, // Offsets are added in canvas units before converting to a percentage.
+      topPct: ((originY + videoAvatarAdjust.y) / canvasH) * 100,
+      widthPct: (layerW / canvasW) * 100,
+      heightPct: (layerH / canvasH) * 100,
+    };
+  }, [selectedAvatar?.height, selectedAvatar?.width, selectedVideoOutputSize.height, selectedVideoOutputSize.width, videoAvatarAdjust.x, videoAvatarAdjust.y, videoAvatarAnchor, videoAvatarDisplayScale, videoAvatarFit]);
+  const compositionConfig = useMemo<VideoCreationCompositionConfig | null>(() => { // Composition settings handed to the video-creation request; null means none are sent.
+    if (!videoBackgroundId) return null; // NOTE(review): with no background selected nothing is submitted, so the chosen output aspect and avatar adjustments are dropped too — confirm this is intended.
+    return {
+      scene_composition_id: selectedScene?.id ?? null,
+      background_id: videoBackgroundId,
+      background_color: selectedScene?.background_color ?? "#ffffff",
+      avatar_fit: videoAvatarFit,
+      avatar_anchor: videoAvatarAnchor,
+      avatar_scale: videoAvatarDisplayScale, // Scene base scale multiplied by the per-run slider scale.
+      avatar_offset_x: videoAvatarAdjust.x,
+      avatar_offset_y: videoAvatarAdjust.y,
+      output_width: selectedVideoOutputSize.width,
+      output_height: selectedVideoOutputSize.height,
+    };
+  }, [selectedScene?.background_color, selectedScene?.id, selectedVideoOutputSize.height, selectedVideoOutputSize.width, videoAvatarAdjust.scale, videoAvatarAdjust.x, videoAvatarAdjust.y, videoAvatarAnchor, videoAvatarDisplayScale, videoAvatarFit, videoBackgroundId]);

  const updateFasterLivePortraitNumber = useCallback((
    key: Exclude<keyof FasterLivePortraitConfig, "animation_region" | "flag_stitching" | "flag_pasteback" | "flag_relative_motion" | "flag_normalize_lip" | "flag_lip_retargeting">,
@@ -351,6 +427,11 @@ export function VideoCreationWorkspace({
    };
  }, []);

+  useEffect(() => { // Re-sync the per-run composition state whenever the avatar or its selected scene changes.
+    setVideoBackgroundId(selectedScene?.background_id ?? null); // Start from the scene's own background, or none.
+    setVideoAvatarAdjust({ x: 0, y: 0, scale: 1 }); // Discard manual position/scale adjustments from the previous selection.
+  }, [selectedAvatar?.id, selectedScene?.id, selectedScene?.background_id]);
+
  const handleSourceAsset = useCallback(async (file: File | null) => {
    if (!file || !selectedAvatar) return;
    const isVideo = file.type.startsWith("video/");
@@ -468,6 +549,7 @@ export function VideoCreationWorkspace({
          title,
          audioSource: "reference_video",
          durationSec: referenceDurationSec,
+          compositionConfig,
        });
        setResult(response.export_video);
        onExportCreated?.(response.export_video);
@@ -487,6 +569,7 @@ export function VideoCreationWorkspace({
        fasterliveportraitConfig: effectiveModel === "fasterliveportrait" ? fasterliveportraitConfig : undefined,
        indexttsConfig: effectiveIndexTTSConfig,
        indexttsEmotionAudioFile,
+        compositionConfig,
      });
      setResult(response.export_video);
      onExportCreated?.(response.export_video);
@@ -498,11 +581,11 @@ export function VideoCreationWorkspace({
    } finally {
      setGenerating(false);
    }
-  }, [audioFile, audioSource, edgeVoice, effectiveIndexTTSConfig, effectiveModel, fasterliveportraitConfig, indexttsConfig.emotion_mode, indexttsEmotionAudioFile, isReferenceVideoMode, models, onExportCreated, onNotify, qwenModel, qwenVoice, referenceDurationSec, selectedAvatar, showIndexTTSControls, text, title, ttsProvider]);
+  }, [audioFile, audioSource, compositionConfig, edgeVoice, effectiveIndexTTSConfig, effectiveModel, fasterliveportraitConfig, indexttsConfig.emotion_mode, indexttsEmotionAudioFile, isReferenceVideoMode, models, onExportCreated, onNotify, qwenModel, qwenVoice, referenceDurationSec, selectedAvatar, showIndexTTSControls, text, title, ttsProvider]);

  return (
    <main className="flex min-h-0 flex-1 flex-col bg-slate-100 p-4">
-      <div className="grid min-h-0 flex-1 gap-4 xl:grid-cols-[20rem_minmax(0,1fr)_22rem]">
+      <div className="grid min-h-0 flex-1 gap-4 xl:grid-cols-[18rem_minmax(28rem,1fr)_minmax(32rem,42rem)]">
        <section className="min-h-0 overflow-y-auto rounded-lg border border-slate-200 bg-white p-4 shadow-sm">
          <div className="flex items-center justify-between gap-3">
            <div>
@@ -969,27 +1052,168 @@ export function VideoCreationWorkspace({
              </button>
              {result ? <span className="text-sm font-medium text-emerald-700">已保存到资产库</span> : null}
            </div>
+            {result ? (
+              <div data-testid="video-creation-result-panel" className="mt-6 rounded-lg border border-slate-200 bg-slate-50 p-4">
+                <div className="flex flex-wrap items-start justify-between gap-3">
+                  <div>
+                    <p className="text-xs font-medium text-slate-500">Result</p>
+                    <h2 className="mt-1 text-base font-semibold text-slate-950">生成结果</h2>
+                  </div>
+                  <div className="flex flex-wrap gap-2">
+                    <a href={buildApiDownloadUrl(result.download_url)} download className="rounded-lg bg-cyan-600 px-3 py-1.5 text-xs font-semibold text-white hover:bg-cyan-500">下载</a>
+                    <button type="button" onClick={onGoAssetLibrary} className="rounded-lg border border-slate-200 bg-white px-3 py-1.5 text-xs font-semibold text-slate-700 hover:border-cyan-200 hover:text-cyan-700">去资产库查看</button>
+                  </div>
+                </div>
+                <video src={buildApiDownloadUrl(result.download_url)} className={`mt-3 mx-auto rounded-lg bg-slate-950 object-contain ${selectedVideoOutputSize.previewClassName}`} controls preload="metadata" />
+                <div className="mt-3 rounded-lg bg-white p-3 text-xs text-slate-600">
+                  <p className="font-semibold text-slate-800">{result.title}</p>
+                  <p className="mt-1 break-all font-mono text-[11px]">{result.path}</p>
+                </div>
+              </div>
+            ) : null}
          </div>
        </section>

-        <aside className="flex min-h-0 flex-col rounded-lg border border-slate-200 bg-white p-4 shadow-sm">
-          <p className="text-xs font-medium text-slate-500">Result</p>
-          <h2 className="mt-1 text-base font-semibold text-slate-950">生成结果</h2>
-          {result ? (
-            <div className="mt-4 space-y-3">
-              <video src={buildApiDownloadUrl(result.download_url)} className="aspect-video w-full rounded-lg bg-slate-950 object-contain" controls preload="metadata" />
-              <div className="rounded-lg bg-slate-50 p-3 text-xs text-slate-600">
-                <p className="font-semibold text-slate-800">{result.title}</p>
-                <p className="mt-1 break-all font-mono text-[11px]">{result.path}</p>
+        <aside className="flex min-h-0 flex-col overflow-y-auto rounded-lg border border-slate-200 bg-white p-4 shadow-sm">
+          <p className="text-xs font-medium text-slate-500">Composition</p>
+          <h2 className="mt-1 text-base font-semibold text-slate-950">构图设置</h2>
+          <div className="mt-4 shrink-0 overflow-hidden rounded-lg border border-slate-200 bg-slate-950 p-3">
+            <div className="mb-2 flex items-center justify-between gap-3">
+              <div>
+                <p className="text-xs font-semibold text-white/55">画面预览</p>
+                <h3 className="text-sm font-semibold text-white">生成前预览</h3>
              </div>
-              <div className="flex flex-wrap gap-2">
-                <a href={buildApiDownloadUrl(result.download_url)} download className="rounded-lg bg-cyan-600 px-3 py-1.5 text-xs font-semibold text-white hover:bg-cyan-500">下载</a>
-                <button type="button" onClick={onGoAssetLibrary} className="rounded-lg border border-slate-200 bg-white px-3 py-1.5 text-xs font-semibold text-slate-700 hover:border-cyan-200 hover:text-cyan-700">去资产库查看</button>
+              <span className="rounded-md border border-white/15 bg-white/10 px-2 py-0.5 text-[11px] font-semibold text-white/75">
+                {selectedVideoOutputSize.width}x{selectedVideoOutputSize.height}
+              </span>
+            </div>
+            <div
+              className={`relative mx-auto overflow-hidden rounded-md border border-white/10 bg-white ${selectedVideoOutputSize.previewClassName}`}
+              style={{ backgroundColor: selectedScene?.background_color ?? "#f8fafc" }}
+            >
+              {selectedVideoBackground?.kind === "image" ? (
+                <img src={sceneBackgroundUrl(selectedVideoBackground)} alt={selectedVideoBackground.name} className="absolute inset-0 h-full w-full object-cover" />
+              ) : null}
+              {selectedVideoBackground?.kind === "video" ? (
+                <div className="absolute inset-0 flex items-center justify-center bg-slate-900 px-4 text-center text-xs font-medium text-white/80">
+                  视频创作暂不支持视频背景
+                </div>
+              ) : null}
+              {!selectedVideoBackground ? (
+                <div className="absolute inset-0 bg-white" />
+              ) : null}
+              {selectedAvatar ? (
+                <div
+                  className="absolute"
+                  style={{
+                    left: `${videoAvatarPreviewLayer.leftPct}%`,
+                    top: `${videoAvatarPreviewLayer.topPct}%`,
+                    width: `${videoAvatarPreviewLayer.widthPct}%`,
+                    height: `${videoAvatarPreviewLayer.heightPct}%`,
+                  }}
+                >
+                  <img
+                    src={buildApiUrl(`/avatars/${encodeURIComponent(selectedAvatar.id)}/preview`)}
+                    alt={selectedAvatar.name ?? selectedAvatar.id}
+                    className="absolute inset-0 h-full w-full object-fill"
+                  />
+                </div>
+              ) : null}
+              <div className="pointer-events-none absolute inset-x-5 bottom-5 rounded border border-white/35 bg-slate-950/35 px-3 py-1 text-center text-xs font-semibold text-white/80">
+                字幕安全区
              </div>
            </div>
-          ) : (
-            <div className="mt-4 flex min-h-[18rem] items-center justify-center rounded-lg border border-dashed border-slate-300 bg-slate-50 text-sm font-medium text-slate-500">生成后显示视频预览</div>
-          )}
+          </div>
+          <div data-testid="video-creation-composition-controls" className="mt-3 space-y-3 rounded-lg border border-slate-200 bg-slate-50 p-3">
+            <div>
+              <p className="mb-1.5 text-xs font-semibold text-slate-700">输出画幅</p>
+              <div className="grid grid-cols-3 gap-2">
+                {VIDEO_CREATION_OUTPUT_ASPECTS.map((aspect) => {
+                  const option = VIDEO_CREATION_OUTPUT_SIZES[aspect];
+                  const active = aspect === videoOutputAspect;
+                  return (
+                    <button
+                      key={aspect}
+                      type="button"
+                      onClick={() => setVideoOutputAspect(aspect)}
+                      className={`rounded-md border px-2 py-1.5 text-xs font-semibold transition-colors ${
+                        active
+                          ? "border-cyan-500 bg-cyan-50 text-cyan-700"
+                          : "border-slate-200 bg-white text-slate-600 hover:border-cyan-200 hover:text-cyan-700"
+                      }`}
+                    >
+                      {option.label}
+                    </button>
+                  );
+                })}
+              </div>
+            </div>
+            <label className="block text-xs font-semibold text-slate-700">
+              本次生成背景
+              <select
+                value={videoBackgroundId ?? ""}
+                onChange={(event) => setVideoBackgroundId(event.target.value || null)}
+                className="mt-1 w-full rounded-md border border-slate-200 bg-white px-2 py-1.5 text-xs font-medium text-slate-700"
+              >
+                <option value="">不使用背景</option>
+                {sceneBackgrounds.map((background) => (
+                  <option key={background.id} value={background.id}>{background.name}</option>
+                ))}
+              </select>
+            </label>
+            <label className="block text-xs font-medium text-slate-600">
+              <span className="mb-1 flex items-center justify-between gap-2">
+                <span>水平位置</span>
+                <span className="tabular-nums">{videoAvatarAdjust.x}px</span>
+              </span>
+              <input
+                type="range"
+                min="-800"
+                max="800"
+                step="4"
+                value={videoAvatarAdjust.x}
+                onChange={(event) => setVideoAvatarAdjust((current) => ({ ...current, x: Number(event.target.value) }))}
+                className="w-full accent-cyan-600"
+              />
+            </label>
+            <label className="block text-xs font-medium text-slate-600">
+              <span className="mb-1 flex items-center justify-between gap-2">
+                <span>垂直位置</span>
+                <span className="tabular-nums">{videoAvatarAdjust.y}px</span>
+              </span>
+              <input
+                type="range"
+                min="-600"
+                max="600"
+                step="4"
+                value={videoAvatarAdjust.y}
+                onChange={(event) => setVideoAvatarAdjust((current) => ({ ...current, y: Number(event.target.value) }))}
+                className="w-full accent-cyan-600"
+              />
+            </label>
+            <label className="block text-xs font-medium text-slate-600">
+              <span className="mb-1 flex items-center justify-between gap-2">
+                <span>人物缩放</span>
+                <span className="tabular-nums">{videoAvatarDisplayScale.toFixed(2)}x</span>
+              </span>
+              <input
+                type="range"
+                min="0.2"
+                max="3"
+                step="0.02"
+                value={videoAvatarAdjust.scale}
+                onChange={(event) => setVideoAvatarAdjust((current) => ({ ...current, scale: Number(event.target.value) }))}
+                className="w-full accent-cyan-600"
+              />
+            </label>
+            <button
+              type="button"
+              onClick={() => setVideoAvatarAdjust({ x: 0, y: 0, scale: 1 })}
+              className="w-full rounded-lg border border-slate-200 bg-white px-3 py-1.5 text-xs font-semibold text-slate-700 transition hover:border-cyan-200 hover:text-cyan-700"
+            >
+              重置本次生成构图
+            </button>
+          </div>
        </aside>
      </div>

--- a/apps/web/src/lib/api.ts
+++ b/apps/web/src/lib/api.ts
@@ -332,6 +332,19 @@ export type VideoCreationJobResponse = {
  export_video: ExportVideoItem;
 };

+export type VideoCreationCompositionConfig = { // Optional per-job composition settings; createVideoCreationJob sends them as JSON in the "composition_config" form field.
+  scene_composition_id?: string | null; // Id of the scene composition the settings were derived from, or null when there is none.
+  background_id?: string | null; // Id of the selected scene background asset.
+  background_color?: string; // Colour string; the video creation workspace falls back to "#ffffff".
+  avatar_fit?: "contain" | "cover"; // How the avatar is fitted to the output canvas before scaling.
+  avatar_anchor?: "center" | "bottom" | "left" | "right"; // Alignment of the avatar layer on the canvas.
+  avatar_scale?: number; // Scale multiplier applied on top of the fit.
+  avatar_offset_x?: number; // Horizontal offset; the workspace UI labels it in px — TODO confirm the backend uses output-canvas pixels.
+  avatar_offset_y?: number; // Vertical offset, presumably in the same units as avatar_offset_x.
+  output_width?: number; // Output canvas width (presumably pixels).
+  output_height?: number; // Output canvas height (presumably pixels).
+};
+
 export type CreateVideoCreationJobInput = {
  model: string;
  avatarId: string;
@@ -346,6 +359,7 @@ export type CreateVideoCreationJobInput = {
  fasterliveportraitConfig?: Record<string, unknown>;
  indexttsConfig?: IndexTTSConfig;
  indexttsEmotionAudioFile?: File | null;
+  compositionConfig?: VideoCreationCompositionConfig | null;
 };

 export async function createVideoCreationJob(input: CreateVideoCreationJobInput): Promise<VideoCreationJobResponse> {
@@ -373,6 +387,9 @@ export async function createVideoCreationJob(input: CreateVideoCreationJobInput)
  if (input.indexttsEmotionAudioFile) {
    form.set("indextts_emotion_audio_file", input.indexttsEmotionAudioFile);
  }
+  if (input.compositionConfig) {
+    form.set("composition_config", JSON.stringify(input.compositionConfig));
+  }
  return apiPostForm<VideoCreationJobResponse>("/video-creation/jobs", form);
 }

--- a/docs/en/tutorials/configuration.md
+++ b/docs/en/tutorials/configuration.md
@@ -155,10 +155,16 @@ mode. The single-process unified mode (`opentalking-unified`) ignores all entrie
 | `OPENTALKING_WORKER_URL` | `http://127.0.0.1:9001` | URL through which the API reaches the Worker. |
 | `OPENTALKING_TORCH_DEVICE` | `cpu` | Device used for orchestration-side audio and frame post-processing. |
 | `OPENTALKING_AVATARS_DIR` | `./examples/avatars` | Avatar bundle root directory. |
+| `OPENTALKING_AVATAR_MATTING_PROVIDER` | `rembg` | Optional matting provider for custom avatar uploads; called only when the upload dialog option is enabled. |
+| `OPENTALKING_AVATAR_MATTING_DEVICE` | `cpu` | Reserved device setting for matting providers. |
+| `OPENTALKING_AVATAR_MATTING_MODEL_PATH` | empty | Local `u2net.onnx` model file path for the `rembg` provider. |
+| `OPENTALKING_AVATAR_MATTING_TIMEOUT_SEC` | `60` | Reserved timeout setting for matting providers. |
 | `OPENTALKING_VOICES_DIR` | `./var/voices` | Storage for cloned voices. |
 | `OPENTALKING_SQLITE_PATH` | `./data/opentalking.sqlite3` | Local metadata database file. |
 | `OPENTALKING_CORS_ORIGINS` | `http://localhost:5173,http://127.0.0.1:5173` | Comma-separated list of permitted frontend origins. |

+Custom avatar uploads do not remove the background by default. PNG uploads with an existing alpha channel are detected automatically as transparent-ready. To enable the local `rembg` provider, install the optional dependency first: `uv pip install --python .venv/bin/python '.[avatar-matting]'`. The `rembg` provider does not download models at runtime; download `u2net.onnx` ahead of time and point `OPENTALKING_AVATAR_MATTING_MODEL_PATH` at that file.
+
 ## 4. Advanced tuning

 The variables in this section are intended for fine-grained control over specific
--- a/docs/superpowers/plans/2026-06-27-video-creation-composition.md
+++ b/docs/superpowers/plans/2026-06-27-video-creation-composition.md
@@ -0,0 +1,170 @@
+# Video Creation Composition Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add a pre-generation scene composition preview to Video Creation and use the one-off composition settings when exporting generated videos.
+
+**Architecture:** The frontend derives a selected avatar scene from App-level scene assets and sends a one-off `composition_config` with video creation jobs. The backend validates the config and post-processes generated frames over an image background before writing the video.
+
+**Tech Stack:** React + TypeScript frontend, FastAPI multipart route, Python video creation service, OpenCV/NumPy/Pillow-style frame processing.
+
+## Global Constraints
+
+- Video Creation composition adjustments only affect the current generation job.
+- Do not write changes back to Scene Assets in this version.
+- If no avatar scene/background exists, keep current Video Creation behavior.
+- Reject video backgrounds for Video Creation in this first version.
+- Local runtime is unavailable; commit locally, sync via git bundle, and run verification on `8.92.9.220:/home/ly/opentalking`.
+
+---
+
+### Task 1: Backend Composition Config Parsing
+
+**Files:**
+- Modify: `apps/api/routes/video_creation.py`
+- Test: `apps/api/tests/test_video_creation.py`
+
+**Interfaces:**
+- Produces: `_parse_video_composition_config(raw: str | None) -> dict[str, object] | None`
+- Produces: optional `composition_config` argument passed to `VideoCreationService.create_from_audio_file`, `create_from_tts_text`, and `create_reference_video`
+
+- [ ] **Step 1: Write failing route tests**
+
+Add tests that post `composition_config` JSON to `/video-creation/jobs` and assert the fake service receives a dict. Add an invalid JSON test expecting HTTP 400.
+
+- [ ] **Step 2: Run route tests to verify they fail**
+
+Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composition_config" -q`
+
+- [ ] **Step 3: Implement parser and route forwarding**
+
+Add a FastAPI `Form(default=None)` field named `composition_config`, parse JSON, require a JSON object, and pass the parsed dict to service calls.
+
+- [ ] **Step 4: Run route tests to verify they pass**
+
+Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composition_config" -q`
+
+### Task 2: Backend Frame Compositing
+
+**Files:**
+- Modify: `opentalking/video_creation.py`
+- Test: `apps/api/tests/test_video_creation.py`
+
+**Interfaces:**
+- Consumes: `composition_config: Mapping[str, object] | None`
+- Produces: `_normalize_video_composition_config(settings: object, avatar_path: Path, config: Mapping[str, object] | None) -> dict[str, object] | None`
+- Produces: `_apply_video_composition(frames: list[np.ndarray], *, config: Mapping[str, object] | None) -> list[np.ndarray]`
+
+- [ ] **Step 1: Write failing service tests**
+
+Add a test that creates a temporary scene background image, sends composition config to `VideoCreationService.create_from_audio_file`, monkeypatches `_write_video_only`, and asserts written frames contain the background color behind a transparent RGBA generated frame.
+
+- [ ] **Step 2: Run service test to verify it fails**
+
+Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composites_generated_frames" -q`
+
+- [ ] **Step 3: Implement minimal composition**
+
+In `_create_from_pcm`, normalize `composition_config` before rendering, apply it before `_write_video_only`, support image backgrounds only, and raise `ValueError("video backgrounds are not supported for video creation")` for video backgrounds.
+
+- [ ] **Step 4: Run service tests to verify they pass**
+
+Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composition" -q`
+
+### Task 3: Frontend API and App Data Flow
+
+**Files:**
+- Modify: `apps/web/src/lib/api.ts`
+- Modify: `apps/web/src/App.tsx`
+- Modify: `apps/web/src/components/VideoCreationWorkspace.tsx`
+- Test: `tests/unit/test_local_audio_frontend.py`
+
+**Interfaces:**
+- Produces: `VideoCreationCompositionConfig` TypeScript type
+- Produces: `compositionConfig?: VideoCreationCompositionConfig | null` on `CreateVideoCreationJobInput`
+- Consumes: `sceneBackgrounds`, `sceneCompositions`, `selectedSceneIdsByAvatar` props in `VideoCreationWorkspace`
+
+- [ ] **Step 1: Write failing frontend text tests**
+
+Add assertions that `VideoCreationWorkspace` receives scene props from `App.tsx`, defines `compositionConfig`, renders `生成前预览`, and appends `composition_config` in `createVideoCreationJob`.
+
+- [ ] **Step 2: Run frontend text tests to verify they fail**
+
+Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation" -q`
+
+- [ ] **Step 3: Implement API and prop plumbing**
+
+Add the TypeScript composition type, JSON form field, and pass App scene state into `VideoCreationWorkspace`.
+
+- [ ] **Step 4: Run frontend text tests to verify they pass**
+
+Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation" -q`
+
+### Task 4: Frontend Preview and Controls
+
+**Files:**
+- Modify: `apps/web/src/components/VideoCreationWorkspace.tsx`
+- Test: `tests/unit/test_local_audio_frontend.py`
+
+**Interfaces:**
+- Consumes: scene props and `buildApiUrl`
+- Produces: one-off local state `{ backgroundId, backgroundColor, avatarFit, avatarAnchor, avatarScale, avatarOffsetX, avatarOffsetY }`
+
+- [ ] **Step 1: Write failing UI string tests**
+
+Assert `VideoCreationWorkspace.tsx` contains `生成前预览`, `本次生成`, `水平位置`, `垂直位置`, `人物缩放`, and `compositionConfig`.
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation_composition" -q`
+
+- [ ] **Step 3: Implement preview and controls**
+
+Show the selected background image when present, show the selected avatar preview above it, provide sliders for X/Y/scale, reset controls, and build `compositionConfig` for generation. Keep fallback copy for no background.
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation_composition" -q`
+
+### Task 5: Server Sync and Verification
+
+**Files:**
+- No code files; sync and run commands.
+
+**Interfaces:**
+- Consumes: local commits on `codex/video-creation-composition`
+- Produces: matching server git history and verification output
+
+- [ ] **Step 1: Commit local changes**
+
+Run: `git add docs apps opentalking tests && git commit -m "Add video creation scene composition"`
+
+- [ ] **Step 2: Create and upload git bundle**
+
+Run locally: `git bundle create /tmp/video-creation-composition.bundle codex/video-creation-composition`
+
+Upload to server: `scp /tmp/video-creation-composition.bundle ly@8.92.9.220:/tmp/video-creation-composition.bundle` (authenticate with an SSH key or the interactive password prompt; do not put credentials in committed commands)
+
+- [ ] **Step 3: Apply bundle on server**
+
+Run on server: `cd /home/ly/opentalking && git fetch /tmp/video-creation-composition.bundle codex/video-creation-composition:codex/video-creation-composition && git switch codex/video-creation-composition`
+
+- [ ] **Step 4: Run verification**
+
+Run on server:
+
+```bash
+cd /home/ly/opentalking
+uv run mypy opentalking/core opentalking/events opentalking/avatar apps/api apps/unified apps/cli --ignore-missing-imports
+uv run pytest apps/api/tests/test_video_creation.py tests/unit/test_local_audio_frontend.py -q
+```
+
+- [ ] **Step 5: Start service for manual review**
+
+Run on server:
+
+```bash
+cd /home/ly/opentalking
+bash scripts/quickstart/stop_all.sh || true
+bash scripts/start_unified.sh --mock --api-port 8211 --web-port 5281
+```
--- a/docs/superpowers/specs/2026-06-27-video-creation-composition-design.md
+++ b/docs/superpowers/specs/2026-06-27-video-creation-composition-design.md
@@ -0,0 +1,108 @@
+# Video Creation Composition Design
+
+## Goal
+
+Bring scene backgrounds and avatar placement into the Video Creation workflow so offline generated videos can match the visual composition users preview before generation.
+
+## Product Decision
+
+Video Creation uses a pre-generation composition preview, not an immersive conversation mode. The page is an offline production workspace: users choose an avatar, script or audio, model settings, and then confirm the final frame composition before clicking generate.
+
+Composition adjustments in Video Creation are one-off job settings. They must not update the active realtime conversation scene or mutate Scene Asset compositions unless the user explicitly uses a future save action.
+
+## User Experience
+
+When a user selects an avatar on the Video Creation page:
+
+- If the avatar has an active scene composition, the preview uses that scene's background, background color, avatar fit, anchor, and scale.
+- If the avatar has no active scene composition, Video Creation keeps the existing default avatar-only behavior.
+- The user can adjust horizontal position, vertical position, and scale for this generation.
+- The preview shows background plus avatar in an output-frame container before generation.
+- Reset returns the one-off transform to `x: 0`, `y: 0`, `scale: 1`.
+
+The first version does not add a write-back action to Scene Assets. It only sends composition data with the video creation job.
+
+## Frontend Architecture
+
+`App.tsx` already owns scene backgrounds, scene compositions, and selected scene ids by avatar. Video Creation should receive those values as props and derive the current avatar's active scene with the same rule as realtime conversation:
+
+1. Use `selectedSceneIdsByAvatar[avatarId]` if it points to a scene for the selected avatar.
+2. Otherwise use the first composition whose `avatar_id` matches the selected avatar.
+3. Otherwise use no scene.
+
+`VideoCreationWorkspace.tsx` adds:
+
+- Props for `sceneBackgrounds`, `sceneCompositions`, and `selectedSceneIdsByAvatar`.
+- Local state for `videoAvatarAdjust: { x: number; y: number; scale: number }`.
+- A composition preview panel using the selected scene and background data.
+- Background selection for this job only.
+- X, Y, and scale controls for this job only.
+- A `compositionConfig` payload passed to `createVideoCreationJob`.
+
+The UI stays work-focused and data-dense. The preview is prominent, but controls remain visible because this is a production page rather than an immersive live demo page.
+
+## API Contract
+
+`createVideoCreationJob` accepts an optional multipart field named `composition_config`. The field is JSON:
+
+```json
+{
+  "scene_composition_id": "scene-example",
+  "background_id": "bg-example",
+  "background_color": "#ffffff",
+  "avatar_fit": "contain",
+  "avatar_anchor": "center",
+  "avatar_scale": 1.1,
+  "avatar_offset_x": 80,
+  "avatar_offset_y": -24
+}
+```
+
+All fields are optional, but a composition only takes effect when `background_id` resolves to an image background; without a `background_id` the generated frames are written unchanged, even if a background color is supplied. If the field is absent, current video generation behavior remains unchanged.
+
+Validation rules:
+
+- `background_id` must resolve under the configured scene assets directory when present.
+- `avatar_fit` must be `contain` or `cover`.
+- `avatar_anchor` must be `center`, `bottom`, `left`, or `right`.
+- `avatar_scale` must be between `0.1` and `4.0`.
+- Offsets are pixel values and must be between `-2000` and `2000`.
+- Video backgrounds are not composited in the first version; the backend rejects a video background for Video Creation with a clear message.
+
+## Backend Architecture
+
+`VideoCreationService` accepts `composition_config` on audio upload, TTS text, and reference video generation.
+
+The first version applies composition after model frame generation and before writing `video_only.mp4`:
+
+1. Generate avatar frames as today.
+2. If no composition config or no image background is provided, write frames unchanged.
+3. If an image background is provided, resize/crop it to the generated frame size.
+4. Place each generated frame over the background according to fit, anchor, base scale, and one-off offsets.
+5. Use alpha blending if generated frames contain an alpha channel; otherwise paste the RGB frame as an opaque layer.
+6. Continue muxing audio and export metadata as today.
+
+This keeps model-specific audio-to-video logic untouched and makes composition a reusable post-processing step.
+
+## Testing
+
+Backend tests cover:
+
+- API route parses `composition_config` and passes it to `VideoCreationService`.
+- Invalid JSON is rejected with HTTP 400.
+- Service rejects unknown or video backgrounds for Video Creation.
+- Service composites generated RGBA frames over an image background with scale and offsets.
+- Existing jobs without composition still behave as before.
+
+Frontend text-level tests cover:
+
+- Video Creation receives scene data from `App.tsx`.
+- `VideoCreationWorkspace.tsx` exposes the composition preview and controls.
+- `createVideoCreationJob` sends `composition_config`.
+
+## Non-Goals
+
+- No immersive mode on Video Creation in this first version.
+- No automatic write-back to Scene Assets.
+- No video-background compositing in Video Creation yet.
+- No subtitle rendering into generated videos in this first version.
--- a/docs/zh/tutorials/configuration.md
+++ b/docs/zh/tutorials/configuration.md
@@ -145,10 +145,16 @@ FlashHead 使用专属 WebSocket 协议，不经过 OmniRT。
 | `OPENTALKING_WORKER_URL` | `http://127.0.0.1:9001` | API 访问 Worker 时使用的 URL。 |
 | `OPENTALKING_TORCH_DEVICE` | `cpu` | 编排侧音频与帧后处理使用的设备。 |
 | `OPENTALKING_AVATARS_DIR` | `./examples/avatars` | Avatar bundle 根目录。 |
+| `OPENTALKING_AVATAR_MATTING_PROVIDER` | `rembg` | 自定义形象上传时的可选抠图 provider；仅在上传弹窗勾选后调用。 |
+| `OPENTALKING_AVATAR_MATTING_DEVICE` | `cpu` | 预留给抠图 provider 的设备配置。 |
+| `OPENTALKING_AVATAR_MATTING_MODEL_PATH` | 空 | `rembg` provider 的本地 `u2net.onnx` 模型文件路径。 |
+| `OPENTALKING_AVATAR_MATTING_TIMEOUT_SEC` | `60` | 预留给抠图 provider 的超时配置。 |
 | `OPENTALKING_VOICES_DIR` | `./var/voices` | 声音复刻存储目录。 |
 | `OPENTALKING_SQLITE_PATH` | `./data/opentalking.sqlite3` | 本地元数据数据库文件。 |
 | `OPENTALKING_CORS_ORIGINS` | `http://localhost:5173,http://127.0.0.1:5173` | 允许的前端 origin，逗号分隔。 |

+自定义形象上传默认不会抠除背景；PNG 自带透明通道时会自动识别为可透明合成。若要启用本地 `rembg` provider，先安装可选依赖：`uv pip install --python .venv/bin/python '.[avatar-matting]'`。`rembg` 不会在运行时自动下载模型，请预先下载 `u2net.onnx`，并将 `OPENTALKING_AVATAR_MATTING_MODEL_PATH` 指向该文件。
+
 ## 4. 进阶调优 {#4}

 本节变量针对特定后端的细粒度控制。完整列表参见 `.env.example`。下列为代表性条目。
--- a/opentalking/assets/scene_backgrounds/default-data-wall.jpg
+++ b/opentalking/assets/scene_backgrounds/default-data-wall.jpg
--- a/opentalking/avatar/matting/__init__.py
+++ b/opentalking/avatar/matting/__init__.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from PIL import Image
+
+from .alpha import image_has_transparency
+from .base import AvatarMattingProvider, MattingError
+from .rembg_provider import RembgMattingProvider
+
+_PROVIDER_FACTORIES = {
+    "rembg": RembgMattingProvider,
+}
+
+
+def resolve_avatar_matting_provider(provider_name: str) -> AvatarMattingProvider:
+    key = (provider_name or "rembg").strip().lower()
+    factory = _PROVIDER_FACTORIES.get(key)
+    if factory is None:
+        supported = ", ".join(sorted(_PROVIDER_FACTORIES))
+        raise MattingError(f"unsupported avatar matting provider: {provider_name!r}; supported: {supported}")
+    return factory()
+
+
+def remove_avatar_background(
+    image: Image.Image,
+    *,
+    provider_name: str = "rembg",
+    settings: object | None = None,
+) -> tuple[Image.Image, str]:
+    provider = resolve_avatar_matting_provider(provider_name)
+    return provider.remove_background(image, settings=settings), provider.name
+
+
+__all__ = [
+    "AvatarMattingProvider",
+    "MattingError",
+    "image_has_transparency",
+    "remove_avatar_background",
+    "resolve_avatar_matting_provider",
+]
--- a/opentalking/avatar/matting/alpha.py
+++ b/opentalking/avatar/matting/alpha.py
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from PIL import Image
+
+
+def image_has_transparency(image: Image.Image) -> bool:
+    if "A" not in image.getbands():
+        return False
+    low, high = image.getchannel("A").getextrema()
+    if not isinstance(low, int | float) or not isinstance(high, int | float):
+        return False
+    return low < 255 or high < 255
--- a/opentalking/avatar/matting/base.py
+++ b/opentalking/avatar/matting/base.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from typing import Protocol
+
+from PIL import Image
+
+
+class MattingError(RuntimeError):
+    """Raised when an avatar matting provider cannot complete."""
+
+
+class AvatarMattingProvider(Protocol):
+    name: str
+
+    def remove_background(self, image: Image.Image, *, settings: object | None = None) -> Image.Image:
+        """Return an image with transparent background."""
--- a/opentalking/avatar/matting/rembg_provider.py
+++ b/opentalking/avatar/matting/rembg_provider.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+import os
+from contextlib import contextmanager
+from io import BytesIO
+from pathlib import Path
+from typing import Iterator
+
+from PIL import Image
+
+from .base import MattingError
+
+REMBG_U2NET_MODEL_MD5 = "60024c5c889badc19c04ad937298a77b"
+REMBG_U2NET_MODEL_URL = "https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx"
+
+
+def _configured_model_path(settings: object | None) -> Path | None:
+    value = str(getattr(settings, "avatar_matting_model_path", "") or "").strip()
+    return Path(value).expanduser() if value else None
+
+
+def _missing_model_message() -> str:
+    return (
+        "未找到抠除背景模型 u2net.onnx。\n"
+        f"请下载模型：{REMBG_U2NET_MODEL_URL}\n"
+        "然后在 .env 中配置 OPENTALKING_AVATAR_MATTING_MODEL_PATH。"
+    )
+
+
+def _validate_model_path(settings: object | None) -> Path:
+    model_path = _configured_model_path(settings)
+    if model_path is None or not model_path.is_file():
+        raise MattingError(_missing_model_message())
+    if model_path.name != "u2net.onnx":
+        raise MattingError("OPENTALKING_AVATAR_MATTING_MODEL_PATH 必须指向 u2net.onnx 文件。")
+    return model_path.resolve()
+
+
+@contextmanager
+def _u2net_home_for_model(model_path: Path) -> Iterator[None]:
+    previous = os.environ.get("U2NET_HOME")
+    os.environ["U2NET_HOME"] = str(model_path.parent)
+    try:
+        yield
+    finally:
+        if previous is None:
+            os.environ.pop("U2NET_HOME", None)
+        else:
+            os.environ["U2NET_HOME"] = previous
+
+
+class RembgMattingProvider:
+    name = "rembg"
+
+    def remove_background(self, image: Image.Image, *, settings: object | None = None) -> Image.Image:
+        model_path = _validate_model_path(settings)
+        try:
+            from rembg import remove
+        except ImportError as exc:
+            raise MattingError(
+                "rembg is not installed; install the avatar matting extra or choose another provider"
+            ) from exc
+
+        input_buffer = BytesIO()
+        image.convert("RGBA").save(input_buffer, format="PNG")
+        try:
+            with _u2net_home_for_model(model_path):
+                output = remove(input_buffer.getvalue())
+            result = Image.open(BytesIO(output))
+            result.load()
+        except Exception as exc:  # noqa: BLE001
+            raise MattingError(f"rembg failed: {exc}") from exc
+        return result.convert("RGBA")
--- a/opentalking/core/config.py
+++ b/opentalking/core/config.py
@@ -24,6 +24,12 @@ def _flatten_config(raw: dict[str, Any] | None) -> dict[str, Any]:
            "models_dir": "models_dir",
            "worker_url": "worker_url",
        },
+        "avatar": {
+            "matting_provider": "avatar_matting_provider",
+            "matting_device": "avatar_matting_device",
+            "matting_model_path": "avatar_matting_model_path",
+            "matting_timeout_sec": "avatar_matting_timeout_sec",
+        },
        "flashtalk": {
            "ws_url": "flashtalk_ws_url",
            "ckpt_dir": "flashtalk_ckpt_dir",
@@ -355,6 +361,10 @@ class Settings(BaseSettings):
    export_max_bytes: int = 1024 * 1024 * 1024
    video_creation_audio_max_bytes: int = 50 * 1024 * 1024
    video_creation_fasterliveportrait_preroll_ms: int = 400
+    avatar_matting_provider: str = "rembg"
+    avatar_matting_device: str = "cpu"
+    avatar_matting_model_path: str = ""
+    avatar_matting_timeout_sec: int = 60

    flashtalk_ws_url: str = ""
    flashtalk_ckpt_dir: str = "./models/SoulX-FlashTalk-14B"
--- a/opentalking/scene_assets.py
+++ b/opentalking/scene_assets.py
@@ -5,6 +5,7 @@ import re
 import shutil
 import uuid
 from datetime import datetime, timezone
+from importlib import resources
 from pathlib import Path
 from typing import Any

@@ -23,6 +24,15 @@ EXT_BY_MIME = {
 VALID_AVATAR_FITS = {"contain", "cover"}
 VALID_AVATAR_ANCHORS = {"center", "bottom", "left", "right"}
 VALID_SUBTITLE_STYLES = {"none", "compact", "lower-third"}
+DEFAULT_BACKGROUNDS = (
+    {
+        "id": "bg-default-data-wall",
+        "name": "数据玻璃幕墙",
+        "filename": "default-data-wall.jpg",
+        "mime_type": "image/jpeg",
+        "resource": "assets/scene_backgrounds/default-data-wall.jpg",
+    },
+)


 def sniff_background_mime(content: bytes) -> str | None:
@@ -68,17 +78,60 @@ def _write_json(path: Path, payload: Any) -> None:


 class SceneAssetStore:
-    def __init__(self, root: Path) -> None:
+    def __init__(self, root: Path, *, seed_defaults: bool = False) -> None:
        self.root = root.expanduser().resolve()
        self.backgrounds_dir = self.root / "backgrounds"
        self.compositions_dir = self.root / "compositions"
        self.background_index_path = self.backgrounds_dir / "index.json"
        self.composition_index_path = self.compositions_dir / "index.json"
+        self.seed_defaults = seed_defaults
+        self.background_seed_marker_path = self.backgrounds_dir / ".defaults_seeded"

    def list_backgrounds(self) -> list[dict[str, object]]:
+        self._seed_default_backgrounds()
+        return self._load_backgrounds()
+
+    def _load_backgrounds(self) -> list[dict[str, object]]:
        items = _read_json(self.background_index_path, [])
        return [item for item in items if isinstance(item, dict)]

+    def _seed_default_backgrounds(self) -> None:
+        if not self.seed_defaults or self.background_seed_marker_path.exists():
+            return
+        items = self._load_backgrounds()
+        existing_ids = {str(item.get("id") or "") for item in items}
+        seeded: list[dict[str, object]] = []
+        now = _now()
+        for default in DEFAULT_BACKGROUNDS:
+            background_id = str(default["id"])
+            if background_id in existing_ids:
+                continue
+            resource_path = resources.files("opentalking").joinpath(str(default["resource"]))
+            try:
+                content = resource_path.read_bytes()
+            except FileNotFoundError:
+                continue
+            ext = EXT_BY_MIME[str(default["mime_type"])]
+            media_path = self.backgrounds_dir / background_id / f"source{ext}"
+            media_path.parent.mkdir(parents=True, exist_ok=True)
+            media_path.write_bytes(content)
+            seeded.append(
+                {
+                    "id": background_id,
+                    "name": str(default["name"]),
+                    "kind": "image",
+                    "mime_type": str(default["mime_type"]),
+                    "filename": str(default["filename"]),
+                    "size_bytes": len(content),
+                    "url": f"/scene-assets/backgrounds/{background_id}/file",
+                    "created_at": now,
+                }
+            )
+        if seeded:
+            _write_json(self.background_index_path, [*items, *seeded])
+        self.background_seed_marker_path.parent.mkdir(parents=True, exist_ok=True)
+        self.background_seed_marker_path.write_text(now + "\n", encoding="utf-8")
+
    def create_background(self, *, content: bytes, filename: str, mime_type: str, name: str) -> dict[str, object]:
        normalized_mime = (mime_type or "").split(";")[0].strip().lower()
        if not content:
@@ -102,7 +155,7 @@ class SceneAssetStore:
            "url": f"/scene-assets/backgrounds/{background_id}/file",
            "created_at": _now(),
        }
-        items = [entry for entry in self.list_backgrounds() if entry.get("id") != background_id]
+        items = [entry for entry in self._load_backgrounds() if entry.get("id") != background_id]
        items.insert(0, item)
        _write_json(self.background_index_path, items)
        return item
@@ -126,7 +179,7 @@ class SceneAssetStore:
    def delete_background(self, background_id: str) -> bool:
        if not re.fullmatch(r"bg-[\w\u4e00-\u9fff-]+", background_id or ""):
            return False
-        items = self.list_backgrounds()
+        items = self._load_backgrounds()
        next_items = [item for item in items if item.get("id") != background_id]
        if len(next_items) == len(items):
            return False
--- a/opentalking/video_creation.py
+++ b/opentalking/video_creation.py
@@ -25,6 +25,7 @@ from opentalking.providers.synthesis.backends import resolve_model_backend
 from opentalking.providers.synthesis.flashtalk.ws_client import FlashTalkWSClient
 from opentalking.providers.synthesis.omnirt import auth_headers, resolve_synthesis_ws_url
 from opentalking.providers.tts.factory import build_tts_adapter
+from opentalking.scene_assets import SceneAssetStore

 log = logging.getLogger(__name__)

@@ -89,6 +90,204 @@ def _validate_reference_duration(settings: object, duration_sec: int | None) ->
    return value


+def _coerce_composition_float(
+    payload: Mapping[str, object],
+    key: str,
+    default: float,
+    *,
+    min_value: float,
+    max_value: float,
+) -> float:
+    """Read ``payload[key]`` as a float within ``[min_value, max_value]``.
+
+    A missing key, ``None`` or ``""`` yields ``default`` unchanged; the default
+    itself is not range-checked.
+
+    Raises:
+        ValueError: if the value is not a str/int/float, cannot be parsed as a
+            float, is NaN, or lies outside the allowed range.
+    """
+    raw = payload.get(key)
+    if raw in (None, ""):
+        return default
+    if not isinstance(raw, str | int | float):
+        raise ValueError(f"{key} must be a number")
+    try:
+        value = float(raw)
+    except (TypeError, ValueError) as exc:
+        raise ValueError(f"{key} must be a number") from exc
+    # Written as a positive containment test so that NaN (e.g. the string
+    # "nan"), which compares False against every bound, is rejected too.
+    if not (min_value <= value <= max_value):
+        raise ValueError(f"{key} must be between {min_value:g} and {max_value:g}")
+    return value
+
+
+def _coerce_composition_int(
+    payload: Mapping[str, object],
+    key: str,
+    default: int,
+    *,
+    min_value: int,
+    max_value: int,
+) -> int:
+    """Read ``payload[key]`` as an even integer within ``[min_value, max_value]``.
+
+    A missing key, ``None`` or ``""`` falls back to ``default``, which is
+    range-checked like any supplied value. Floats are truncated by ``int()``.
+    Odd results are rounded up to the next even number after the range check,
+    so an odd ``max_value`` can be exceeded by one.
+
+    Raises:
+        ValueError: if the value is not a str/int/float, cannot be converted to
+            an integer (including NaN and infinities), or is out of range.
+    """
+    raw = payload.get(key)
+    if raw in (None, ""):
+        value = default
+    elif isinstance(raw, str | int | float):
+        try:
+            # int(float("inf")) raises OverflowError rather than ValueError;
+            # report it as the same validation error instead of letting it escape.
+            value = int(raw)
+        except (TypeError, ValueError, OverflowError) as exc:
+            raise ValueError(f"{key} must be an integer") from exc
+    else:
+        raise ValueError(f"{key} must be an integer")
+    if value < min_value or value > max_value:
+        raise ValueError(f"{key} must be between {min_value:g} and {max_value:g}")
+    # Bump odd sizes up to the next even value.
+    return value + (value % 2)
+
+
+def _normalize_video_composition_config(
+    settings: object,
+    avatar_path: Path,
+    config: Mapping[str, object] | None,
+) -> dict[str, object] | None:
+    """Validate a raw composition request and resolve it into render parameters.
+
+    Returns ``None`` when no config or no ``background_id`` is supplied.
+
+    Raises:
+        ValueError: unknown ``background_id``, a background whose kind is
+            ``video``, an invalid ``avatar_fit``/``avatar_anchor``, or a numeric
+            field that fails coercion.
+        FileNotFoundError: the background entry exists but the store returns no
+            file path for it.
+    """
+    if not config:
+        return None
+    requested_id = str(config.get("background_id") or "").strip()
+    if not requested_id:
+        return None
+
+    asset_store = SceneAssetStore(_settings_path(settings, "scene_assets_dir", "./data/scene-assets"))
+    # Take the first index entry with a matching id; no match is an error.
+    for entry in asset_store.list_backgrounds():
+        if entry.get("id") == requested_id:
+            break
+    else:
+        raise ValueError("background_id not found")
+    if str(entry.get("kind") or "") == "video":
+        raise ValueError("video backgrounds are not supported for video creation")
+    resolved_path = asset_store.background_file_path(requested_id)
+    if resolved_path is None:
+        raise FileNotFoundError("background file not found")
+
+    fit_mode = str(config.get("avatar_fit") or "contain").strip()
+    anchor_name = str(config.get("avatar_anchor") or "center").strip()
+    if fit_mode not in {"contain", "cover"}:
+        raise ValueError("invalid avatar_fit")
+    if anchor_name not in {"center", "bottom", "left", "right"}:
+        raise ValueError("invalid avatar_anchor")
+
+    normalized: dict[str, object] = {
+        "background_path": resolved_path,
+        "avatar_mask_path": _reference_image_path(avatar_path),
+        "avatar_fit": fit_mode,
+        "avatar_anchor": anchor_name,
+    }
+    normalized["avatar_scale"] = _coerce_composition_float(
+        config, "avatar_scale", 1.0, min_value=0.1, max_value=4.0
+    )
+    for offset_key in ("avatar_offset_x", "avatar_offset_y"):
+        normalized[offset_key] = _coerce_composition_float(
+            config, offset_key, 0.0, min_value=-2000.0, max_value=2000.0
+        )
+    normalized["output_width"] = _coerce_composition_int(
+        config, "output_width", 1280, min_value=320, max_value=3840
+    )
+    normalized["output_height"] = _coerce_composition_int(
+        config, "output_height", 720, min_value=180, max_value=2160
+    )
+    return normalized
+
+
+def _resize_cover(image: np.ndarray, width: int, height: int) -> np.ndarray:
+    """Scale ``image`` until it covers a ``width`` x ``height`` area, then centre-crop it.
+
+    The larger of the two axis ratios is used as the scale factor; area
+    interpolation is used when shrinking and linear interpolation otherwise.
+    """
+    source_h, source_w = image.shape[:2]
+    ratio_w = float(width) / float(source_w)
+    ratio_h = float(height) / float(source_h)
+    factor = max(ratio_w, ratio_h)
+    scaled_w = max(1, int(round(source_w * factor)))
+    scaled_h = max(1, int(round(source_h * factor)))
+    method = cv2.INTER_AREA if factor < 1 else cv2.INTER_LINEAR
+    scaled = cv2.resize(image, (scaled_w, scaled_h), interpolation=method)
+    # Centre the crop window; the offsets are clamped so they never go negative.
+    x0 = max(0, (scaled_w - width) // 2)
+    y0 = max(0, (scaled_h - height) // 2)
+    cropped = scaled[y0:y0 + height, x0:x0 + width]
+    return np.ascontiguousarray(cropped)
+
+
+def _avatar_anchor_origin(anchor: str, canvas_w: int, canvas_h: int, layer_w: int, layer_h: int) -> tuple[int, int]:
+    """Return the top-left ``(x, y)`` at which a layer sits on the canvas for ``anchor``.
+
+    ``bottom`` centres horizontally and aligns the bottom edges; ``left`` and
+    ``right`` centre vertically and align that side; any other value centres the
+    layer on both axes.
+    """
+    centred_x = (canvas_w - layer_w) // 2
+    centred_y = (canvas_h - layer_h) // 2
+    origin = (centred_x, centred_y)
+    if anchor == "bottom":
+        origin = (centred_x, canvas_h - layer_h)
+    elif anchor == "left":
+        origin = (0, centred_y)
+    elif anchor == "right":
+        origin = (canvas_w - layer_w, centred_y)
+    return origin
+
+
+def _load_avatar_alpha_mask(path: object) -> np.ndarray | None:
+    """Load the alpha channel of the image at ``path`` as a float32 mask.
+
+    Returns ``None`` when ``path`` is ``None``, when the file cannot be read,
+    or when the image has no fourth channel. Values are divided by the
+    full-scale value of the stored bit depth (255 for 8-bit, 65535 for 16-bit).
+    """
+    if path is None:
+        # Without this guard str(None) would ask OpenCV to open a file named "None".
+        return None
+    image = cv2.imread(str(path), cv2.IMREAD_UNCHANGED)
+    if image is None or image.ndim != 3 or image.shape[2] < 4:
+        return None
+    # IMREAD_UNCHANGED keeps 16-bit PNGs as uint16; dividing those by 255 would
+    # yield alpha values far above 1.0 and break the blend.
+    full_scale = 65535.0 if image.dtype == np.uint16 else 255.0
+    return image[:, :, 3].astype(np.float32) / full_scale
+
+
+def _composite_avatar_layer(
+    background: np.ndarray,
+    frame: np.ndarray,
+    *,
+    avatar_fit: str,
+    avatar_anchor: str,
+    avatar_scale: float,
+    avatar_offset_x: float,
+    avatar_offset_y: float,
+    fallback_alpha: np.ndarray | None = None,
+) -> np.ndarray:
+    """Alpha-blend one avatar frame onto a copy of ``background``.
+
+    The frame is scaled to fit inside the canvas (or to cover it when
+    ``avatar_fit == "cover"``), multiplied by ``avatar_scale``, positioned by
+    ``avatar_anchor``, shifted by the offsets in canvas pixels, and clipped to
+    the canvas bounds before blending.
+
+    Alpha comes from the frame's fourth channel when present, otherwise from
+    ``fallback_alpha`` (resized to the frame if the sizes differ), otherwise
+    the layer is treated as fully opaque.
+
+    Returns ``background`` itself (not a copy) when the frame is not a 3-D
+    array with at least three channels, or when the placed layer lies entirely
+    outside the canvas.
+    """
+    canvas_h, canvas_w = background.shape[:2]
+    layer = np.asarray(frame, dtype=np.uint8)
+    if layer.ndim != 3 or layer.shape[2] < 3:
+        return background
+    bgr = layer[:, :, :3]
+    if layer.shape[2] >= 4:
+        alpha = layer[:, :, 3].astype(np.float32) / 255.0
+    elif fallback_alpha is not None:
+        # NOTE(review): assumes fallback_alpha already holds values in [0, 1] — confirm with caller.
+        alpha = fallback_alpha
+        if alpha.shape[:2] != bgr.shape[:2]:
+            alpha = cv2.resize(alpha, (bgr.shape[1], bgr.shape[0]), interpolation=cv2.INTER_AREA).astype(np.float32)
+    else:
+        alpha = np.ones(layer.shape[:2], dtype=np.float32)
+    # "contain" uses the smaller axis ratio; "cover" switches to the larger one.
+    fit_scale = min(float(canvas_w) / float(bgr.shape[1]), float(canvas_h) / float(bgr.shape[0]))
+    if avatar_fit == "cover":
+        fit_scale = max(float(canvas_w) / float(bgr.shape[1]), float(canvas_h) / float(bgr.shape[0]))
+    # Floor the combined scale so the resized layer never collapses to nothing.
+    scale = max(0.01, fit_scale * float(avatar_scale))
+    layer_w = max(1, int(round(bgr.shape[1] * scale)))
+    layer_h = max(1, int(round(bgr.shape[0] * scale)))
+    bgr_resized = cv2.resize(bgr, (layer_w, layer_h), interpolation=cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR)
+    alpha_resized = cv2.resize(alpha, (layer_w, layer_h), interpolation=cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR)
+    origin_x, origin_y = _avatar_anchor_origin(avatar_anchor, canvas_w, canvas_h, layer_w, layer_h)
+    # Top-left corner of the layer in canvas coordinates; may be negative or past the edge.
+    left = int(round(origin_x + avatar_offset_x))
+    top = int(round(origin_y + avatar_offset_y))
+    # Destination rectangle: the part of the layer that actually overlaps the canvas.
+    dst_left = max(0, left)
+    dst_top = max(0, top)
+    dst_right = min(canvas_w, left + layer_w)
+    dst_bottom = min(canvas_h, top + layer_h)
+    if dst_left >= dst_right or dst_top >= dst_bottom:
+        return background
+    # Matching source rectangle inside the resized layer.
+    src_left = dst_left - left
+    src_top = dst_top - top
+    src_right = src_left + (dst_right - dst_left)
+    src_bottom = src_top + (dst_bottom - dst_top)
+    out = background.copy()
+    fg = bgr_resized[src_top:src_bottom, src_left:src_right].astype(np.float32)
+    # Add a trailing axis so the single-channel mask broadcasts over the colour channels.
+    mask = alpha_resized[src_top:src_bottom, src_left:src_right].astype(np.float32)[:, :, None]
+    bg = out[dst_top:dst_bottom, dst_left:dst_right].astype(np.float32)
+    out[dst_top:dst_bottom, dst_left:dst_right] = np.clip((fg * mask) + (bg * (1.0 - mask)), 0, 255).astype(np.uint8)
+    return out
+
+
+def _apply_video_composition(
+    frames: list[np.ndarray],
+    *,
+    config: Mapping[str, object] | None,
+) -> list[np.ndarray]:
+    """Composite every frame over the configured background image.
+
+    Returns ``frames`` untouched when there are no frames or no config. The
+    output size defaults to the first frame's size when the config does not
+    carry ``output_width``/``output_height``.
+
+    Raises:
+        FileNotFoundError: the background image cannot be read.
+        ValueError: a numeric config field fails coercion.
+    """
+    if not (frames and config):
+        return frames
+    probe = np.asarray(frames[0])
+    source_h, source_w = probe.shape[:2]
+    canvas_w = _coerce_composition_int(config, "output_width", int(source_w), min_value=320, max_value=3840)
+    canvas_h = _coerce_composition_int(config, "output_height", int(source_h), min_value=180, max_value=2160)
+    plate = cv2.imread(str(config["background_path"]), cv2.IMREAD_COLOR)
+    if plate is None:
+        raise FileNotFoundError("background file not found")
+    canvas = _resize_cover(plate, int(canvas_w), int(canvas_h))
+    # Everything below is identical for all frames, so it is resolved once up front.
+    layer_options = {
+        "fallback_alpha": _load_avatar_alpha_mask(config.get("avatar_mask_path")),
+        "avatar_scale": _coerce_composition_float(config, "avatar_scale", 1.0, min_value=0.1, max_value=4.0),
+        "avatar_offset_x": _coerce_composition_float(
+            config, "avatar_offset_x", 0.0, min_value=-2000.0, max_value=2000.0
+        ),
+        "avatar_offset_y": _coerce_composition_float(
+            config, "avatar_offset_y", 0.0, min_value=-2000.0, max_value=2000.0
+        ),
+        "avatar_fit": str(config.get("avatar_fit") or "contain"),
+        "avatar_anchor": str(config.get("avatar_anchor") or "center"),
+    }
+    composited: list[np.ndarray] = []
+    for frame in frames:
+        composited.append(_composite_avatar_layer(canvas, frame, **layer_options))
+    return composited
+
+
 def _build_reference_driver_pcm(total_samples: int, *, level: float = 480.0) -> np.ndarray:
    samples = max(0, int(total_samples))
    if samples == 0:
@@ -550,7 +749,8 @@ def _frame_array(frame: VideoFrameData | Any) -> np.ndarray | None:
    arr = np.asarray(data)
    if arr.ndim != 3 or arr.shape[2] < 3:
        return None
-    return np.ascontiguousarray(arr[:, :, :3].astype(np.uint8, copy=False))
+    channels = 4 if arr.shape[2] >= 4 else 3
+    return np.ascontiguousarray(arr[:, :, :channels].astype(np.uint8, copy=False))


 def _write_wav(path: Path, pcm: np.ndarray, sample_rate: int = 16000) -> None:
@@ -584,6 +784,8 @@ def _write_video_only(path: Path, frames: list[np.ndarray], fps: float) -> None:
            if arr.shape[:2] != (height, width):
                resized = cv2.resize(arr, (width, height), interpolation=cv2.INTER_AREA)
                arr = np.asarray(resized, dtype=np.uint8)
+            if arr.ndim == 3 and arr.shape[2] >= 4:
+                arr = arr[:, :, :3]
            writer.write(arr)
    finally:
        writer.release()
@@ -631,6 +833,7 @@ class VideoCreationService:
        title: str,
        mime_type: str | None = None,
        fasterliveportrait_config: Mapping[str, object] | None = None,
+        composition_config: Mapping[str, object] | None = None,
    ) -> dict[str, Any]:
        pcm = await decode_audio_file_to_pcm_i16(upload_path)
        if pcm.size == 0:
@@ -642,6 +845,7 @@ class VideoCreationService:
            title=title,
            source="upload",
            fasterliveportrait_config=fasterliveportrait_config,
+            composition_config=composition_config,
        )

    async def create_from_tts_text(
@@ -657,6 +861,7 @@ class VideoCreationService:
        source: str = "tts_text",
        fasterliveportrait_config: Mapping[str, object] | None = None,
        indextts_config: Mapping[str, object] | None = None,
+        composition_config: Mapping[str, object] | None = None,
    ) -> dict[str, Any]:
        text_value = text.strip()
        if not text_value:
@@ -694,6 +899,7 @@ class VideoCreationService:
            title=title,
            source=source,
            fasterliveportrait_config=fasterliveportrait_config,
+            composition_config=composition_config,
        )

    async def create_reference_video(
@@ -703,6 +909,7 @@ class VideoCreationService:
        avatar_id: str,
        duration_sec: int | None,
        title: str,
+        composition_config: Mapping[str, object] | None = None,
    ) -> dict[str, Any]:
        model_value = _normalize_model(model)
        if model_value != "flashtalk":
@@ -720,6 +927,7 @@ class VideoCreationService:
            pcm=pcm,
            title=title,
            source="reference_video",
+            composition_config=composition_config,
        )

    async def _resample_pcm(self, pcm: np.ndarray, sample_rate: int) -> np.ndarray:
@@ -763,9 +971,11 @@ class VideoCreationService:
        title: str,
        source: str,
        fasterliveportrait_config: Mapping[str, object] | None = None,
+        composition_config: Mapping[str, object] | None = None,
    ) -> dict[str, Any]:
        model_value = _normalize_model(model)
        avatar_path = _avatar_dir(self.settings, avatar_id)
+        normalized_composition_config = _normalize_video_composition_config(self.settings, avatar_path, composition_config)
        job_id = uuid.uuid4().hex
        work_dir = _settings_path(self.settings, "exports_dir", "./data/exports") / "video_creation_jobs" / job_id
        work_dir.mkdir(parents=True, exist_ok=False)
@@ -832,6 +1042,7 @@ class VideoCreationService:
        target_frames = max(1, int(round(float(pcm.size) * fps / float(sample_rate))))
        if len(frames) > target_frames:
            frames = frames[:target_frames]
+        frames = _apply_video_composition(frames, config=normalized_composition_config)

        video_only = work_dir / "video_only.mp4"
        _write_video_only(video_only, frames, fps)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
  "lightrag-hku>=1.4.9",
  "mem0ai>=0.1.115",
  "huggingface-hub[cli]<1.0",
+  "rembg>=2.0.69",
 ]

 [project.optional-dependencies]
@@ -149,7 +150,13 @@ opentalking-persona = "apps.cli.persona:main"
 include-package-data = true

 [tool.setuptools.package-data]
-opentalking = ["assets/voices/system/*/*.json", "assets/voices/system/*/*.txt", "assets/voices/system/*/*.wav", "assets/reference_drivers/*.wav"]
+opentalking = [
+  "assets/voices/system/*/*.json",
+  "assets/voices/system/*/*.txt",
+  "assets/voices/system/*/*.wav",
+  "assets/reference_drivers/*.wav",
+  "assets/scene_backgrounds/*.jpg",
+]

 [tool.setuptools.packages.find]
 where = ["."]
--- a/tests/frontend/test_scene_assets_ui.py
+++ b/tests/frontend/test_scene_assets_ui.py
@@ -85,6 +85,16 @@ def test_asset_library_groups_scene_compositions_by_avatar() -> None:
    assert "sceneCompositions.filter((scene) => scene.avatar_id === avatar.id)" in source


+def test_asset_library_scene_cards_show_friendly_avatar_and_background_names() -> None:
+    """Scene cards must use the name-with-fallback labels and drop the old English ones."""
+    component_path = Path("apps/web/src/components/AssetLibraryWorkspace.tsx")
+    source = component_path.read_text(encoding="utf-8")
+
+    required_snippets = (
+        "backgroundById",
+        "数字人形象：{sceneAvatar?.name ?? scene.avatar_id}",
+        "背景：{sceneBackground?.name ?? scene.background_id ?? scene.background_color}",
+    )
+    forbidden_snippets = (
+        "Avatar {sceneAvatar?.name",
+        "Background {scene.background_id",
+    )
+    for snippet in required_snippets:
+        assert snippet in source
+    for snippet in forbidden_snippets:
+        assert snippet not in source
+
+
 def test_scene_delete_actions_use_error_handled_handlers() -> None:
    source = Path("apps/web/src/components/AssetLibraryWorkspace.tsx").read_text(encoding="utf-8")

--- a/tests/unit/test_avatar_matting.py
+++ b/tests/unit/test_avatar_matting.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+import os
+
+from PIL import Image
+
+from opentalking.avatar.matting.rembg_provider import RembgMattingProvider
+
+
+def test_rembg_provider_requires_configured_model_path(tmp_path, monkeypatch):
+    """A configured but missing model file must fail with a helpful, path-free message."""
+    missing_model = tmp_path / "u2net.onnx"
+    provider = RembgMattingProvider()
+    image = Image.new("RGB", (4, 4), (255, 255, 255))
+    settings = SimpleNamespace(avatar_matting_model_path=str(missing_model))
+
+    message = None
+    try:
+        provider.remove_background(image, settings=settings)
+    except Exception as exc:  # noqa: BLE001
+        message = str(exc)
+    if message is None:  # pragma: no cover - assertion guard
+        raise AssertionError("expected missing model to fail before rembg runs")
+
+    # The message must name the model, the env var and the download URL ...
+    for expected in (
+        "未找到抠除背景模型 u2net.onnx",
+        "OPENTALKING_AVATAR_MATTING_MODEL_PATH",
+        "https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx",
+    ):
+        assert expected in message
+    # ... but must not include the checksum or the configured path.
+    for unexpected in ("MD5", str(missing_model)):
+        assert unexpected not in message
+
+
+def test_rembg_provider_uses_configured_model_directory(tmp_path, monkeypatch):
+    """The provider must point U2NET_HOME at the model's directory during the call only."""
+    import io
+
+    model_path = tmp_path / "u2net.onnx"
+    model_path.write_bytes(b"fake-model")
+    received_bodies: list[bytes] = []
+    observed_homes: list[str | None] = []
+
+    def fake_remove(body: bytes) -> bytes:
+        # Record the input and the U2NET_HOME visible while rembg would be running.
+        received_bodies.append(body)
+        observed_homes.append(os.environ.get("U2NET_HOME"))
+        encoded = io.BytesIO()
+        Image.new("RGBA", (4, 4), (1, 2, 3, 0)).save(encoded, format="PNG")
+        return encoded.getvalue()
+
+    monkeypatch.setenv("U2NET_HOME", "/tmp/original-u2net-home")
+    monkeypatch.setattr("rembg.remove", fake_remove)
+
+    provider = RembgMattingProvider()
+    source_image = Image.new("RGB", (4, 4), (255, 255, 255))
+    settings = SimpleNamespace(avatar_matting_model_path=str(model_path))
+    result = provider.remove_background(source_image, settings=settings)
+
+    assert received_bodies
+    assert observed_homes == [str(tmp_path)]
+    assert result.mode == "RGBA"
+    assert result.getchannel("A").getextrema() == (0, 0)
+    # The pre-existing value must be back in place once the call returns.
+    assert os.environ.get("U2NET_HOME") == "/tmp/original-u2net-home"
--- a/tests/unit/test_local_audio_frontend.py
+++ b/tests/unit/test_local_audio_frontend.py
@@ -197,6 +197,32 @@ def test_video_clone_allows_uploading_source_avatar():
    assert "onAvatarUploaded={handleVideoCloneAvatarUploaded}" in app


+def test_custom_avatar_upload_can_request_background_removal():
+    """The upload dialog, app handler and toast stack must contain the background-removal wiring."""
+    app = (WEB / "App.tsx").read_text(encoding="utf-8")
+    stage = (WEB / "components" / "AvatarSelectionStage.tsx").read_text(encoding="utf-8")
+    toast = (WEB / "components" / "ToastStack.tsx").read_text(encoding="utf-8")
+
+    stage_snippets = (
+        "上传时抠除背景",
+        "customRemoveBackground",
+        "customUploadState",
+        "正在抠除背景...",
+        "抠图完成",
+        "createdCustomAvatar",
+        "buildApiUrl(`/avatars/${encodeURIComponent(createdCustomAvatar.id)}/preview`)",
+        "removeBackground: customRemoveBackground",
+        "await onCustomAvatarCreate",
+    )
+    app_snippets = (
+        'fd.set("remove_background", options?.removeBackground ? "true" : "false")',
+        "return created",
+        "创建失败：",
+        "e instanceof ApiError ? e.detail : null",
+        "tone !== \"error\"",
+        "pauseToast",
+        "resumeToast",
+    )
+    toast_snippets = (
+        "whitespace-pre-line break-words",
+        "onMouseEnter={() => onPause(toast.id)}",
+        "onMouseLeave={() => onResume(toast.id)}",
+    )
+    for text, snippets in ((stage, stage_snippets), (app, app_snippets), (toast, toast_snippets)):
+        for snippet in snippets:
+            assert snippet in text
+
+
 def test_video_clone_lip_retargeting_disables_relative_motion():
    clone = (WEB / "components" / "VideoCloneWorkspace.tsx").read_text(encoding="utf-8")

@@ -477,6 +503,58 @@ def test_video_creation_workspace_wires_offline_generation_flow():
    assert "去资产库查看" in workspace


+def test_video_creation_workspace_supports_one_off_scene_composition():
+    """The video-creation workspace, API client and app must contain the composition wiring."""
+    app = (WEB / "App.tsx").read_text(encoding="utf-8")
+    api = (WEB / "lib" / "api.ts").read_text(encoding="utf-8")
+    workspace = (WEB / "components" / "VideoCreationWorkspace.tsx").read_text(encoding="utf-8")
+
+    app_snippets = (
+        "sceneBackgrounds={sceneBackgrounds}",
+        "sceneCompositions={sceneCompositions}",
+        "selectedSceneIdsByAvatar={selectedSceneIdsByAvatar}",
+    )
+    api_snippets = (
+        "export type VideoCreationCompositionConfig",
+        "compositionConfig?: VideoCreationCompositionConfig | null",
+        'form.set("composition_config", JSON.stringify(input.compositionConfig))',
+    )
+    workspace_snippets = (
+        "sceneBackgrounds: SceneBackgroundAsset[]",
+        "sceneCompositions: SceneComposition[]",
+        "selectedSceneIdsByAvatar?: Record<string, string>",
+        "生成前预览",
+        "本次生成",
+        "水平位置",
+        "垂直位置",
+        "人物缩放",
+        "compositionConfig",
+        "VIDEO_CREATION_OUTPUT_SIZES",
+        '"16:9"',
+        '"9:16"',
+        '"1:1"',
+        "videoOutputAspect",
+        "selectedVideoOutputSize",
+        "videoAvatarPreviewLayer",
+        "left: `${videoAvatarPreviewLayer.leftPct}%`",
+        "top: `${videoAvatarPreviewLayer.topPct}%`",
+        "width: `${videoAvatarPreviewLayer.widthPct}%`",
+        "height: `${videoAvatarPreviewLayer.heightPct}%`",
+        "output_width: selectedVideoOutputSize.width",
+        "output_height: selectedVideoOutputSize.height",
+        'data-testid="video-creation-result-panel"',
+        'data-testid="video-creation-composition-controls"',
+        "flex min-h-0 flex-col overflow-y-auto rounded-lg border border-slate-200 bg-white p-4 shadow-sm",
+        "mt-3 space-y-3 rounded-lg border border-slate-200 bg-slate-50 p-3",
+        "mt-4 shrink-0 overflow-hidden",
+        "aspect-video w-full",
+        "aspect-[9/16]",
+        "aspect-square",
+        "xl:grid-cols-[18rem_minmax(28rem,1fr)_minmax(32rem,42rem)]",
+        "画面预览",
+        "输出画幅",
+    )
+    # Snippets that must not appear in the workspace source.
+    workspace_forbidden = (
+        "translate(${videoAvatarAdjust.x}px",
+        "mt-3 min-h-0 flex-1 space-y-3 overflow-y-auto",
+        "aspectRatio: selectedVideoOutputSize.aspectRatio",
+        "h-[clamp(18rem,42vh,30rem)]",
+    )
+
+    for text, snippets in ((app, app_snippets), (api, api_snippets), (workspace, workspace_snippets)):
+        for snippet in snippets:
+            assert snippet in text
+    for snippet in workspace_forbidden:
+        assert snippet not in workspace
+
+    # Source order: result panel, then composition settings, then the pre-generation preview.
+    result_panel_at = workspace.index('data-testid="video-creation-result-panel"')
+    settings_at = workspace.index("构图设置")
+    preview_at = workspace.index("生成前预览")
+    assert result_panel_at < settings_at
+    assert settings_at < preview_at
+
+
 def test_frontend_export_controls_include_audio_renderer_models():
    app = (WEB / "App.tsx").read_text(encoding="utf-8")
    renderers_block = app[app.index("const SERVER_AUDIO_RENDERERS"):app.index("function isFlashRenderer")]