Add optional avatar background removal

Support immersive mode in 视频创作
This commit is contained in:
kero-ly
2026-06-23 17:29:16 +08:00
committed by zyairehhh
parent d3083e799d
commit 8087110620
29 changed files with 1738 additions and 57 deletions

View File

@@ -16,6 +16,15 @@ VITE_BACKEND_PORT=8000
# 头像资产与生成结果目录 (avatar assets / exports)。
OPENTALKING_AVATARS_DIR=./examples/avatars
OPENTALKING_EXPORTS_DIR=./data/exports
# 自定义形象上传时的可选抠图 provider,仅在上传弹窗勾选“上传时抠除背景”后调用。
# 启用本地 rembg 前先安装:uv pip install --python .venv/bin/python '.[avatar-matting]'
OPENTALKING_AVATAR_MATTING_PROVIDER=rembg
OPENTALKING_AVATAR_MATTING_DEVICE=cpu
# rembg provider 需要预先下载 u2net.onnx,并显式填写模型文件路径。
# 下载:https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx
# MD5:60024c5c889badc19c04ad937298a77b
OPENTALKING_AVATAR_MATTING_MODEL_PATH=
OPENTALKING_AVATAR_MATTING_TIMEOUT_SEC=60
# WebUI 默认展示的数字人模型;CLI --model 会覆盖该值。
OPENTALKING_DEFAULT_MODEL=mock

View File

@@ -21,6 +21,7 @@ from PIL import Image
from opentalking.avatar import mouth_metadata
from opentalking.avatar.loader import load_avatar_bundle
from opentalking.avatar.matting import MattingError, image_has_transparency, remove_avatar_background
from opentalking.avatar.validator import list_avatar_dirs
from opentalking.models.quicktalk.paths import resolve_quicktalk_asset_root
from opentalking.models.registry import get_adapter
@@ -247,13 +248,7 @@ def _resize_uploaded_avatar_image(image: Image.Image, *, max_width: int, max_hei
def _avatar_image_has_alpha(image: Image.Image) -> bool:
    """Return whatever ``image_has_transparency`` reports for ``image``.

    The transparency check lives in ``opentalking.avatar.matting`` so the
    upload route and the matting code agree on a single definition. The
    former inline alpha-extrema check is dropped: left in place it returned
    first and made the delegating call unreachable.
    """
    return image_has_transparency(image)
def _update_manifest_matting_status(manifest_path: Path, image: Image.Image) -> None:
@@ -264,6 +259,21 @@ def _update_manifest_matting_status(manifest_path: Path, image: Image.Image) ->
manifest_path.write_text(json.dumps(raw, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
def _update_manifest_matting_source(
    manifest_path: Path,
    *,
    provider_name: str,
    original_source_image: str,
) -> None:
    """Stamp matting provenance into the manifest's ``metadata`` section.

    Existing metadata entries are kept; ``matting_provider``,
    ``matting_source`` (always ``"upload_auto"``) and
    ``original_source_image`` are set or overwritten, then the manifest is
    written back to ``manifest_path``.
    """
    manifest = _read_manifest(manifest_path)
    # Copy so the dict read from disk is not mutated in place.
    merged = dict(manifest.get("metadata") or {})
    merged.update(
        {
            "matting_provider": provider_name,
            "matting_source": "upload_auto",
            "original_source_image": original_source_image,
        }
    )
    manifest["metadata"] = merged
    _write_manifest(manifest_path, manifest)
def _update_manifest_dimensions(manifest_path: Path, image: Image.Image) -> None:
raw = json.loads(manifest_path.read_text(encoding="utf-8"))
raw["width"] = int(image.width)
@@ -920,6 +930,7 @@ async def create_custom_avatar(
base_avatar_id: str = Form(...),
name: str = Form(...),
model: str | None = Form(default=None),
remove_background: bool = Form(default=False),
image: UploadFile | None = File(default=None),
video: UploadFile | None = File(default=None),
) -> AvatarSummary:
@@ -965,12 +976,28 @@ async def create_custom_avatar(
)
max_w, max_h = _custom_avatar_max_size()
fitted_image = _resize_uploaded_avatar_image(image_rgb, max_width=max_w, max_height=max_h)
source_dir = target_dir / "source"
source_dir.mkdir(parents=True, exist_ok=True)
if remove_background and video_body is None:
original_image = fitted_image.copy()
try:
fitted_image, matting_provider = remove_avatar_background(
fitted_image,
provider_name=str(getattr(request.app.state.settings, "avatar_matting_provider", "rembg")),
settings=request.app.state.settings,
)
except MattingError as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
original_image.save(source_dir / "original.png", format="PNG")
_update_manifest_matting_source(
target_dir / "manifest.json",
provider_name=matting_provider,
original_source_image="source/original.png",
)
_update_manifest_dimensions(target_dir / "manifest.json", fitted_image)
_update_manifest_matting_status(target_dir / "manifest.json", fitted_image)
fitted_image.save(target_dir / "preview.png", format="PNG")
fitted_image.save(target_dir / "reference.png", format="PNG")
source_dir = target_dir / "source"
source_dir.mkdir(parents=True, exist_ok=True)
fitted_image.save(source_dir / "source.png", format="PNG")
if video_body is not None:
video_name = f"source_video{video_suffix}"
@@ -1000,6 +1027,9 @@ async def create_custom_avatar(
metadata["frame_dir"] = "frames"
raw["metadata"] = metadata
_write_manifest(target_dir / "manifest.json", raw)
except HTTPException:
shutil.rmtree(target_dir, ignore_errors=True)
raise
except Exception as exc: # noqa: BLE001
shutil.rmtree(target_dir, ignore_errors=True)
raise HTTPException(status_code=500, detail=f"failed to create custom avatar: {exc}") from exc

View File

@@ -18,7 +18,7 @@ router = APIRouter(prefix="/scene-assets", tags=["scene-assets"])
def _store(request: Request) -> SceneAssetStore:
    """Build the scene-asset store for the current request.

    The root directory comes from ``settings.scene_assets_dir`` and falls
    back to ``./data/scene-assets`` when the setting is absent. The store is
    created with ``seed_defaults=True``; the earlier un-seeded ``return``
    is removed because it returned first and made the seeding variant
    unreachable.
    """
    settings = request.app.state.settings
    root = Path(getattr(settings, "scene_assets_dir", "./data/scene-assets"))
    return SceneAssetStore(root, seed_defaults=True)
@router.get("/backgrounds", response_model=None)

View File

@@ -70,6 +70,18 @@ def _parse_indextts_config(tts_provider: str | None, raw: str | None, *, emotion
return dict(config) or None
def _parse_video_composition_config(raw: str | None) -> dict[str, object] | None:
if not raw:
return None
try:
decoded = json.loads(raw)
except json.JSONDecodeError as exc:
raise HTTPException(status_code=400, detail="composition_config must be valid JSON") from exc
if not isinstance(decoded, dict):
raise HTTPException(status_code=400, detail="composition_config must be a JSON object")
return decoded
async def _save_indextts_emotion_audio(upload: UploadFile | None) -> Path | None:
if upload is None:
return None
@@ -97,6 +109,7 @@ async def create_video_creation_job(
duration_sec: int | None = Form(default=None),
fasterliveportrait_config: str | None = Form(default=None),
indextts_config: str | None = Form(default=None),
composition_config: str | None = Form(default=None),
indextts_emotion_audio_file: UploadFile | None = File(default=None),
) -> dict[str, Any]:
source = audio_source.strip().lower()
@@ -104,6 +117,7 @@ async def create_video_creation_job(
raise HTTPException(status_code=400, detail="audio_source must be upload, tts_text, voice_clone, or reference_video")
settings = request.app.state.settings
flp_config = _parse_fasterliveportrait_config(model, fasterliveportrait_config)
video_composition_config = _parse_video_composition_config(composition_config)
emotion_audio_path = await _save_indextts_emotion_audio(indextts_emotion_audio_file)
try:
index_config = _parse_indextts_config(tts_provider, indextts_config, emotion_audio_path=emotion_audio_path)
@@ -134,6 +148,7 @@ async def create_video_creation_job(
title=title,
mime_type=audio_file.content_type,
fasterliveportrait_config=flp_config,
composition_config=video_composition_config,
)
finally:
upload_path.unlink(missing_ok=True)
@@ -145,6 +160,7 @@ async def create_video_creation_job(
avatar_id=avatar_id,
duration_sec=duration_sec,
title=title,
composition_config=video_composition_config,
)
return _with_download_url(result)
@@ -159,6 +175,7 @@ async def create_video_creation_job(
source=source,
fasterliveportrait_config=flp_config,
indextts_config=index_config,
composition_config=video_composition_config,
)
return _with_download_url(result)
except HTTPException:

View File

@@ -185,6 +185,158 @@ def test_create_custom_avatar_preserves_uploaded_png_alpha(tmp_path, monkeypatch
assert image.getchannel("A").getextrema()[0] == 0
def _seed_base_avatar_dir(root):
    """Write the minimal ``base-avatar`` bundle the custom-avatar route copies from."""
    base = root / "base-avatar"
    base.mkdir()
    (base / "preview.png").write_bytes(_png_bytes())
    (base / "reference.png").write_bytes(_png_bytes())
    (base / "manifest.json").write_text(
        json.dumps(
            {
                "id": "base-avatar",
                "name": "Base Avatar",
                "model_type": "mock",
                "fps": 25,
                "sample_rate": 16000,
                "width": 8,
                "height": 8,
                "version": "1.0",
            }
        ),
        encoding="utf-8",
    )
    return base


def _custom_avatar_client(settings):
    """Return a TestClient for an app exposing only the avatars router with ``settings``."""
    app = FastAPI()
    app.state.settings = settings
    app.include_router(avatars.router)
    return TestClient(app)


def test_create_custom_avatar_does_not_remove_background_by_default(tmp_path, monkeypatch):
    """Without ``remove_background`` the matting provider must never be invoked."""
    _seed_base_avatar_dir(tmp_path)
    monkeypatch.setattr(avatars.mouth_metadata, "detect_mouth_landmarks", lambda frame: None)

    def fail_remove_background(*args, **kwargs):  # noqa: ANN002, ANN003
        raise AssertionError("matting provider should not run unless requested")

    monkeypatch.setattr(avatars, "remove_avatar_background", fail_remove_background)
    client = _custom_avatar_client(SimpleNamespace(avatars_dir=str(tmp_path)))

    response = client.post(
        "/avatars/custom",
        data={"base_avatar_id": "base-avatar", "name": "普通形象"},
        files={"image": ("avatar.png", _png_bytes(), "image/png")},
    )

    assert response.status_code == 200
    created = response.json()
    custom_dir = tmp_path / created["id"]
    manifest = json.loads((custom_dir / "manifest.json").read_text(encoding="utf-8"))
    assert created["matting_status"] == "opaque"
    assert manifest["metadata"]["matting_status"] == "opaque"
    assert "matting_provider" not in manifest["metadata"]
    assert not (custom_dir / "source" / "original.png").exists()


def test_create_custom_avatar_removes_background_when_requested(tmp_path, monkeypatch):
    """With ``remove_background=true`` the configured provider runs and provenance is recorded."""
    _seed_base_avatar_dir(tmp_path)
    monkeypatch.setattr(avatars.mouth_metadata, "detect_mouth_landmarks", lambda frame: None)
    calls: list[str] = []

    def fake_remove_background(image, *, provider_name, settings):
        calls.append(provider_name)
        result = image.convert("RGBA")
        # Punch one fully transparent pixel so the output counts as cut out.
        result.putpixel((0, 0), (*result.getpixel((0, 0))[:3], 0))
        return result, "fake-provider"

    monkeypatch.setattr(avatars, "remove_avatar_background", fake_remove_background)
    client = _custom_avatar_client(
        SimpleNamespace(
            avatars_dir=str(tmp_path),
            avatar_matting_provider="configured-provider",
            avatar_matting_device="cpu",
            avatar_matting_timeout_sec=30,
        )
    )

    response = client.post(
        "/avatars/custom",
        data={"base_avatar_id": "base-avatar", "name": "抠图形象", "remove_background": "true"},
        files={"image": ("avatar.png", _png_bytes(), "image/png")},
    )

    assert response.status_code == 200
    assert calls == ["configured-provider"]
    created = response.json()
    custom_dir = tmp_path / created["id"]
    manifest = json.loads((custom_dir / "manifest.json").read_text(encoding="utf-8"))
    assert created["matting_status"] == "transparent_ready"
    assert manifest["metadata"]["matting_status"] == "transparent_ready"
    assert manifest["metadata"]["matting_provider"] == "fake-provider"
    assert manifest["metadata"]["matting_source"] == "upload_auto"
    assert manifest["metadata"]["original_source_image"] == "source/original.png"
    assert (custom_dir / "source" / "original.png").is_file()
    # Open via a context manager so the file handle is released deterministically.
    with Image.open(custom_dir / "reference.png") as reference:
        assert reference.getchannel("A").getextrema()[0] == 0


def test_create_custom_avatar_reports_missing_matting_model(tmp_path, monkeypatch):
    """A ``MattingError`` surfaces as HTTP 400 and leaves no ``custom-*`` directory behind."""
    _seed_base_avatar_dir(tmp_path)
    monkeypatch.setattr(avatars.mouth_metadata, "detect_mouth_landmarks", lambda frame: None)

    def fail_missing_model(*args, **kwargs):  # noqa: ANN002, ANN003
        raise avatars.MattingError("未找到抠除背景模型 u2net.onnx。\n下载地址https://example.test/u2net.onnx")

    monkeypatch.setattr(avatars, "remove_avatar_background", fail_missing_model)
    client = _custom_avatar_client(SimpleNamespace(avatars_dir=str(tmp_path)))

    response = client.post(
        "/avatars/custom",
        data={"base_avatar_id": "base-avatar", "name": "缺模型形象", "remove_background": "true"},
        files={"image": ("avatar.png", _png_bytes(), "image/png")},
    )

    assert response.status_code == 400
    assert "未找到抠除背景模型" in response.json()["detail"]
    assert not any(path.name.startswith("custom-") for path in tmp_path.iterdir() if path.is_dir())
def test_quicktalk_model_root_falls_back_to_omnirt_model_root(tmp_path, monkeypatch):
monkeypatch.delenv("OPENTALKING_QUICKTALK_ASSET_ROOT", raising=False)
monkeypatch.delenv("OPENTALKING_QUICKTALK_MODEL_ROOT", raising=False)

View File

@@ -65,6 +65,21 @@ def test_scene_asset_store_rejects_spoofed_background_content(tmp_path: Path) ->
)
def test_scene_asset_store_seeds_default_backgrounds_once(tmp_path: Path) -> None:
    """A seeded store lists the default background first and lets it be deleted."""
    store = SceneAssetStore(tmp_path, seed_defaults=True)

    first = store.list_backgrounds()[0]

    assert first["id"] == "bg-default-data-wall"
    assert first["name"] == "数据玻璃幕墙"
    assert first["kind"] == "image"
    assert first["mime_type"] == "image/jpeg"
    assert store.background_file_path("bg-default-data-wall").is_file()

    # Once removed, the default must not show up again on the next listing.
    removed = store.delete_background("bg-default-data-wall")
    assert removed is True
    assert store.list_backgrounds() == []
def test_scene_asset_store_rejects_zero_avatar_scale(tmp_path: Path) -> None:
store = SceneAssetStore(tmp_path)
@@ -196,6 +211,20 @@ def test_scene_asset_api_uploads_lists_downloads_and_deletes_background(tmp_path
assert deleted.json()["deleted"] is True
def test_scene_asset_api_lists_default_backgrounds_on_fresh_workspace(tmp_path: Path) -> None:
    """On an empty workspace the API lists the default background and serves its file."""
    with _client(tmp_path) as client:
        listing = client.get("/scene-assets/backgrounds")
        assert listing.status_code == 200

        items = listing.json()["items"]
        default_item = items[0]
        assert default_item["id"] == "bg-default-data-wall"
        assert default_item["name"] == "数据玻璃幕墙"

        # The listed URL must resolve to the actual image payload.
        image_response = client.get(default_item["url"])
        assert image_response.status_code == 200
        assert image_response.headers["content-type"].startswith("image/jpeg")
def test_scene_asset_api_rejects_oversized_background_upload(tmp_path: Path) -> None:
with _client(tmp_path) as client:
upload = client.post(

View File

@@ -154,6 +154,81 @@ def test_video_creation_audio_upload_returns_export_video(tmp_path: Path, monkey
assert payload["export_video"]["download_url"].startswith("/exports/videos/")
def test_video_creation_route_passes_composition_config(tmp_path: Path, monkeypatch) -> None:
    """The decoded ``composition_config`` form field reaches the creator unchanged."""
    client, creators = _client(tmp_path, monkeypatch)
    composition = {
        "scene_composition_id": "scene-anchor-news",
        "background_id": "bg-newsroom",
        "background_color": "#ffffff",
        "avatar_fit": "contain",
        "avatar_anchor": "center",
        "avatar_scale": 1.25,
        "avatar_offset_x": 96,
        "avatar_offset_y": -32,
    }
    form_fields = {
        "model": "wav2lip",
        "avatar_id": "anchor",
        "audio_source": "upload",
        "title": "Composed take",
        "composition_config": json.dumps(composition),
    }
    audio_upload = {"audio_file": ("speech.wav", b"RIFFaudio", "audio/wav")}

    with client:
        response = client.post("/video-creation/jobs", data=form_fields, files=audio_upload)

        assert response.status_code == 200, response.text
        first_call = creators[0].calls[0]
        assert first_call[1]["composition_config"] == composition
def test_video_creation_route_rejects_invalid_composition_config(tmp_path: Path, monkeypatch) -> None:
    """Malformed JSON in ``composition_config`` is answered with a 400 and a fixed detail."""
    client, _creators = _client(tmp_path, monkeypatch)
    form_fields = {
        "model": "wav2lip",
        "avatar_id": "anchor",
        "audio_source": "upload",
        "title": "Broken composition",
        # A lone opening brace is not parseable JSON.
        "composition_config": "{",
    }
    audio_upload = {"audio_file": ("speech.wav", b"RIFFaudio", "audio/wav")}

    with client:
        response = client.post("/video-creation/jobs", data=form_fields, files=audio_upload)

        assert response.status_code == 400
        assert response.json()["detail"] == "composition_config must be valid JSON"
def test_write_video_only_preserves_bgr_frames_for_opencv_writer(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """``_write_video_only`` hands frames to the writer without reordering channels."""
    from opentalking import video_creation as video_creation_module

    written_frames: list[np.ndarray] = []

    class FakeWriter:
        """Stand-in for ``cv2.VideoWriter`` that records every frame it receives."""

        def isOpened(self) -> bool:
            return True

        def write(self, frame: np.ndarray) -> None:
            written_frames.append(np.asarray(frame).copy())

        def release(self) -> None:
            return None

    cv2_module = video_creation_module.cv2
    monkeypatch.setattr(cv2_module, "VideoWriter_fourcc", lambda *_args: 0)
    monkeypatch.setattr(cv2_module, "VideoWriter", lambda *_args, **_kwargs: FakeWriter())

    pixel = [200, 20, 10]
    frame = np.zeros((2, 2, 3), dtype=np.uint8)
    frame[:, :] = pixel

    video_creation_module._write_video_only(tmp_path / "out.mp4", [frame], 25)

    assert written_frames
    assert written_frames[0][0, 0].tolist() == pixel
def test_video_creation_quicktalk_default_backend_is_omnirt(monkeypatch: pytest.MonkeyPatch) -> None:
from opentalking.core.model_config import clear_model_config_cache
from opentalking.providers.synthesis.backends import resolve_model_backend
@@ -893,6 +968,133 @@ async def test_video_creation_service_renders_quicktalk_via_omnirt(
assert result["export_video"]["model"] == "quicktalk"
@pytest.mark.asyncio
async def test_video_creation_service_composites_generated_frames_over_scene_background(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that ``composition_config`` resizes output frames and shows the scene background.

    The service runs against fake OmniRT clients, a fake audio decoder, a fake
    muxer and a fake export writer; the frames passed to ``_write_video_only``
    are captured and inspected.
    """
    from opentalking import video_creation as video_creation_module
    from opentalking.scene_assets import SceneAssetStore
    from PIL import Image
    import io

    avatars = tmp_path / "avatars"
    exports = tmp_path / "exports"
    scene_assets = tmp_path / "scene-assets"
    _write_avatar(avatars)
    # Replace the avatar reference with a 4x4 RGBA image whose alpha is 0 everywhere.
    transparent_reference = Image.new("RGBA", (4, 4), (255, 0, 0, 0))
    transparent_reference.save(avatars / "anchor" / "reference.png")
    uploaded = tmp_path / "speech.wav"
    uploaded.write_bytes(b"RIFFaudio")
    # Register a solid RGB (10, 20, 200) PNG as the scene background.
    buffer = io.BytesIO()
    Image.new("RGB", (4, 4), (10, 20, 200)).save(buffer, format="PNG")
    background = SceneAssetStore(scene_assets).create_background(
        content=buffer.getvalue(),
        filename="blue.png",
        mime_type="image/png",
        name="Blue",
    )
    captured_frames: list[np.ndarray] = []

    class FakeWSClient:
        # Records its constructor arguments only; no connection is opened.
        def __init__(self, ws_url: str, *, extra_headers: dict[str, str] | None = None) -> None:
            self.ws_url = ws_url
            self.extra_headers = extra_headers or {}

    class FakeOmniRTClient:
        # Returns canned responses for every call the service makes.
        def __init__(self, _ws_client: FakeWSClient) -> None:
            self.fps = 25
            self.audio_chunk_samples = 4

        async def init_session(self, **_kwargs: object) -> dict[str, object]:
            return {"type": "init_ok"}

        async def prewarm(self) -> dict[str, object]:
            return {"type": "prewarm_skipped"}

        async def generate(self, _audio_pcm: np.ndarray) -> list[VideoFrameData]:
            # One 4x4 frame with channel 0 saturated.
            # NOTE(review): the name says "red", which holds only if the frame
            # data is RGB-ordered; confirm the channel order expected here.
            red = np.zeros((4, 4, 3), dtype=np.uint8)
            red[:, :, 0] = 255
            return [VideoFrameData(data=red, width=4, height=4, timestamp_ms=0.0)]

        async def close(self, send_close_msg: bool = True) -> None:
            return None

    async def fake_decode(_path: Path) -> np.ndarray:
        # Four samples, matching FakeOmniRTClient.audio_chunk_samples.
        return np.arange(4, dtype=np.int16)

    async def fake_mux(_ffmpeg_bin: str, _video_in: Path, _audio_in: Path, out_mp4: Path) -> None:
        out_mp4.write_bytes(b"mp4")

    def fake_write_video_only(path: Path, frames: list[np.ndarray], _fps: float) -> None:
        # Keep copies of the frames so the assertions below can inspect them.
        captured_frames.extend(np.asarray(frame).copy() for frame in frames)
        path.write_bytes(b"video")

    def fake_create_video_export(root: Path, **kwargs: object) -> dict[str, object]:
        return {
            "id": "export-composed",
            "kind": "video_creation",
            "title": kwargs["title"],
            "duration_sec": kwargs["duration_sec"],
            "size_bytes": len(kwargs["content"]),
            "mime_type": "video/mp4",
            "created_at": "2026-06-04T00:00:00Z",
            "path": str(root / "export-composed.mp4"),
            "session_id": kwargs["session_id"],
            "avatar_id": kwargs["avatar_id"],
            "model": kwargs["model"],
        }

    # raising=False: the two client attributes are patched even if the module does not define them.
    monkeypatch.setattr(video_creation_module, "FlashTalkWSClient", FakeWSClient, raising=False)
    monkeypatch.setattr(video_creation_module, "OmniRTAudio2VideoClient", FakeOmniRTClient, raising=False)
    monkeypatch.setattr(video_creation_module, "decode_audio_file_to_pcm_i16", fake_decode)
    monkeypatch.setattr(video_creation_module, "_write_video_only", fake_write_video_only)
    monkeypatch.setattr(video_creation_module, "_ffmpeg_mux", fake_mux)
    monkeypatch.setattr(
        video_creation_module,
        "resolve_model_backend",
        lambda model, _settings: SimpleNamespace(model=model, backend="omnirt", ws_url=""),
    )
    monkeypatch.setattr(video_creation_module, "create_video_export", fake_create_video_export)

    service = VideoCreationService(
        SimpleNamespace(
            avatars_dir=str(avatars),
            exports_dir=str(exports),
            scene_assets_dir=str(scene_assets),
            export_max_bytes=1024 * 1024,
            ffmpeg_bin="ffmpeg",
            omnirt_endpoint="http://127.0.0.1:9000",
            omnirt_audio2video_path_template="/v1/audio2video/{model}",
            omnirt_api_key="",
        )
    )

    result = await service.create_from_audio_file(
        model="wav2lip",
        avatar_id="anchor",
        upload_path=uploaded,
        title="Composed take",
        composition_config={
            "background_id": background["id"],
            "avatar_fit": "contain",
            "avatar_anchor": "center",
            "avatar_scale": 1.0,
            "avatar_offset_x": 0,
            "avatar_offset_y": 0,
            "output_width": 320,
            "output_height": 180,
        },
    )

    assert result["export_video"]["model"] == "wav2lip"
    assert captured_frames
    # Frames are (height, width, channels) at the requested 320x180 output size.
    assert captured_frames[0].shape == (180, 320, 3)
    # The top-left pixel equals the background colour (10, 20, 200) with the
    # channel order reversed — presumably BGR for the OpenCV writer; confirm.
    assert captured_frames[0][0, 0].tolist() == [200, 20, 10]
@pytest.mark.asyncio
async def test_video_creation_service_renders_musetalk_via_omnirt(
tmp_path: Path,

View File

@@ -1001,6 +1001,7 @@ export default function App() {
}
});
const [toasts, setToasts] = useState<ToastMessage[]>([]);
const toastTimersRef = useRef<Map<string, ReturnType<typeof window.setTimeout>>>(new Map());
const [recordingSaving, setRecordingSaving] = useState(false);
const [ftRecordPhase, setFtRecordPhase] = useState<"idle" | "recording" | "stopped">("idle");
const [ftRecordBusy, setFtRecordBusy] = useState(false);
@@ -1098,17 +1099,41 @@ export default function App() {
);
// Remove a toast immediately and cancel its pending auto-dismiss timer, if any.
const dismissToast = useCallback((id: string) => {
  const timers = toastTimersRef.current;
  const pending = timers.get(id);
  if (pending) {
    window.clearTimeout(pending);
  }
  timers.delete(id);
  setToasts((current) => current.filter((entry) => entry.id !== id));
}, []);
// Show a toast, keeping at most the two most recent earlier toasts alongside it.
// Error toasts stay until dismissed; every other tone auto-dismisses after
// 3600 ms via a timer registered in toastTimersRef so it can be paused or
// cancelled. The previous unconditional, unregistered timer is removed: it
// dismissed error toasts as well and could not be cancelled.
const notify = useCallback((message: string, tone: ToastTone = "info") => {
  const id = makeToastId();
  setToasts((prev) => [...prev.slice(-2), { id, tone, message }]);
  if (tone !== "error") {
    const timer = window.setTimeout(() => {
      toastTimersRef.current.delete(id);
      setToasts((prev) => prev.filter((toast) => toast.id !== id));
    }, 3600);
    toastTimersRef.current.set(id, timer);
  }
}, []);
// Cancel a toast's pending auto-dismiss timer without removing the toast.
const pauseToast = useCallback((id: string) => {
  const timers = toastTimersRef.current;
  const pending = timers.get(id);
  if (pending) {
    window.clearTimeout(pending);
    timers.delete(id);
  }
}, []);
// Schedule a 1800 ms auto-dismiss for a toast that currently has no timer.
// Does nothing for unknown ids, error toasts, or toasts already counting down.
const resumeToast = useCallback((id: string) => {
  const timers = toastTimersRef.current;
  if (timers.has(id)) return;
  const target = toasts.find((item) => item.id === id);
  if (!target) return;
  if (target.tone === "error") return;
  const handle = window.setTimeout(() => {
    toastTimersRef.current.delete(id);
    setToasts((current) => current.filter((item) => item.id !== id));
  }, 1800);
  timers.set(id, handle);
}, [toasts]);
const syncRuntimeConfigSelection = useCallback((next: RuntimeConfigResponse) => {
const nextAsrProvider = normalizeAsrProvider(next.stt.provider, "dashscope");
setAsrProvider(nextAsrProvider);
@@ -2328,7 +2353,32 @@ export default function App() {
}
}, [connection, fasterliveportraitConfig, model, notify]);
const handleCreateCustomAvatar = useCallback(async (file: File, name: string) => {
// Save the system prompt for the current avatar, then release the active
// session (if any), reset live state and reload the page after 900 ms.
// On failure a warning is logged and an error toast is shown; the saving flag
// is cleared in either case.
const handleSavePrompt = useCallback(async () => {
  setPromptSaving(true);
  try {
    await apiPost("/sessions/customize/prompt", {
      avatar_id: avatarId,
      llm_system_prompt: llmSystemPrompt,
    });
    // Release the running session before resetting local state.
    const sid = sessionIdRef.current;
    if (sid) await releaseSession(sid);
    resetLiveState(true);
    setConnection("idle");
    notify("System Prompt 已保存,页面即将刷新并在新会话生效。", "success");
    // Delay the reload by 900 ms; presumably so the success toast is visible first.
    window.setTimeout(() => window.location.reload(), 900);
  } catch (e) {
    console.warn("save prompt failed", e);
    notify("保存 Prompt 失败,请查看后端日志。", "error");
  } finally {
    setPromptSaving(false);
  }
}, [avatarId, llmSystemPrompt, notify, releaseSession, resetLiveState]);
const handleCreateCustomAvatar = useCallback(async (
file: File,
name: string,
options?: { removeBackground?: boolean },
) => {
const trimmedName = name.trim();
if (!trimmedName) {
notify("请先给形象起个名字。", "info");
@@ -2346,6 +2396,7 @@ export default function App() {
fd.set("name", trimmedName);
fd.set("model", model);
fd.set("image", file);
fd.set("remove_background", options?.removeBackground ? "true" : "false");
const created = await apiPostForm<AvatarSummary>("/avatars/custom", fd);
setAvatars((prev) => {
const filtered = prev.filter((avatar) => avatar.id !== created.id);
@@ -2358,9 +2409,12 @@ export default function App() {
resetLiveState(true);
setConnection("idle");
notify(`自定义形象「${created.name ?? trimmedName}」已加入形象库。`, "success");
return created;
} catch (e) {
console.warn("create custom avatar failed", e);
notify("创建自定义形象失败,请查看后端日志。", "error");
const detail = e instanceof ApiError ? e.detail : null;
notify(detail ? `创建失败:${detail}` : "创建自定义形象失败,请查看后端日志。", "error");
return null;
} finally {
setReferenceSaving(false);
}
@@ -2901,6 +2955,9 @@ export default function App() {
<VideoCreationWorkspace
avatars={avatars}
avatarId={avatarId}
sceneBackgrounds={sceneBackgrounds}
sceneCompositions={sceneCompositions}
selectedSceneIdsByAvatar={selectedSceneIdsByAvatar}
models={models}
onAvatarChange={handleAvatarChange}
onAvatarUploaded={handleVideoCloneAvatarUploaded}
@@ -3170,7 +3227,7 @@ export default function App() {
onPersonaImport={handlePersonaImport}
onAvatarChange={handleAvatarChange}
onStart={() => void handleStart()}
onCustomAvatarCreate={(file, name) => void handleCreateCustomAvatar(file, name)}
onCustomAvatarCreate={(file, name, options) => handleCreateCustomAvatar(file, name, options)}
onAvatarDelete={(target) => void handleDeleteAvatar(target)}
referenceSaving={referenceSaving}
/>
@@ -3304,7 +3361,7 @@ export default function App() {
</aside>
</div>
)}
<ToastStack toasts={toasts} onDismiss={dismissToast} />
<ToastStack toasts={toasts} onDismiss={dismissToast} onPause={pauseToast} onResume={resumeToast} />
</div>
);
}

View File

@@ -1040,6 +1040,10 @@ export function AssetLibraryWorkspace({
);
const avatarById = useMemo(() => new Map((avatars ?? []).map((avatar) => [avatar.id, avatar])), [avatars]);
const backgroundById = useMemo(
() => new Map(sceneBackgrounds.map((background) => [background.id, background])),
[sceneBackgrounds],
);
const sceneGroups = useMemo(() => {
const avatarGroups = (avatars ?? [])
.map((avatar) => ({
@@ -1182,6 +1186,9 @@ export function AssetLibraryWorkspace({
{scenes.map((scene) => {
const selected = selectedSceneIdsByAvatar[scene.avatar_id] === scene.id;
const sceneAvatar = avatarById.get(scene.avatar_id);
const sceneBackground = scene.background_id
? backgroundById.get(scene.background_id)
: null;
return (
<article
key={scene.id}
@@ -1192,8 +1199,12 @@ export function AssetLibraryWorkspace({
}`}
>
<p className="truncate text-sm font-semibold text-slate-950">{scene.name}</p>
<p className="mt-1 truncate text-xs text-slate-500">Avatar {sceneAvatar?.name ?? scene.avatar_id}</p>
<p className="mt-1 truncate text-xs text-slate-500">Background {scene.background_id ?? scene.background_color}</p>
<p className="mt-1 truncate text-xs text-slate-500">
{sceneAvatar?.name ?? scene.avatar_id}
</p>
<p className="mt-1 truncate text-xs text-slate-500">
{sceneBackground?.name ?? scene.background_id ?? scene.background_color}
</p>
<div className="mt-3 flex items-center gap-3">
<button
type="button"

View File

@@ -24,7 +24,11 @@ type AvatarSelectionStageProps = {
prewarmState?: "idle" | "preparing" | "ready" | "failed";
onAvatarChange: (id: string) => void;
onStart: () => void;
onCustomAvatarCreate: (file: File, name: string) => void;
onCustomAvatarCreate: (
file: File,
name: string,
options?: { removeBackground?: boolean },
) => Promise<AvatarSummary | null | void>;
onAvatarDelete?: (avatar: AvatarSummary) => void;
referenceSaving?: boolean;
personas: PersonaSummary[];
@@ -94,6 +98,9 @@ export function AvatarSelectionStage({
});
const [customFile, setCustomFile] = useState<File | null>(null);
const [customPreviewUrl, setCustomPreviewUrl] = useState<string | null>(null);
const [customRemoveBackground, setCustomRemoveBackground] = useState(false);
const [customUploadState, setCustomUploadState] = useState<"idle" | "processing" | "complete">("idle");
const [createdCustomAvatar, setCreatedCustomAvatar] = useState<AvatarSummary | null>(null);
const selectedPersona = personas.find((persona) => persona.id === selectedPersonaId) ?? null;
const configDisabled = loading || queued || prewarmState === "preparing";
const baseDisabled = loading || queued || prewarmState === "preparing" || !selectedAvatar || !modelConnected;
@@ -120,7 +127,14 @@ export function AvatarSelectionStage({
setCustomPreviewUrl(file ? URL.createObjectURL(file) : null);
};
const handleCustomUpload = () => {
// Close the custom-upload dialog and reset its progress state.
// Ignored while a save or a background-removal upload is still in flight.
const closeCustomUpload = () => {
  const busy = referenceSaving || customUploadState === "processing";
  if (busy) {
    return;
  }
  setCustomUploadOpen(false);
  setCustomUploadState("idle");
  setCreatedCustomAvatar(null);
};
const handleCustomUpload = async () => {
const name = customName.trim();
if (!customFile || !name) return;
try {
@@ -128,8 +142,19 @@ export function AvatarSelectionStage({
} catch {
/* ignore */
}
onCustomAvatarCreate(customFile, name);
setCustomUploadOpen(false);
setCreatedCustomAvatar(null);
setCustomUploadState(customRemoveBackground ? "processing" : "idle");
const created = await onCustomAvatarCreate(customFile, name, { removeBackground: customRemoveBackground });
if (created) {
setCreatedCustomAvatar(created);
if (customRemoveBackground) {
setCustomUploadState("complete");
} else {
setCustomUploadOpen(false);
}
} else {
setCustomUploadState("idle");
}
};
const handlePersonaFileChange = (event: ChangeEvent<HTMLInputElement>) => {
@@ -396,6 +421,7 @@ export function AvatarSelectionStage({
<button
type="button"
onClick={() => fileInputRef.current?.click()}
disabled={referenceSaving}
className="flex w-full items-center gap-3 rounded-lg border border-dashed border-cyan-300 bg-cyan-50 p-3 text-left transition hover:bg-cyan-100"
>
<span className="flex h-12 w-12 shrink-0 items-center justify-center overflow-hidden rounded-lg bg-white text-2xl font-light text-cyan-700">
@@ -412,23 +438,67 @@ export function AvatarSelectionStage({
<span className="mt-0.5 block text-xs text-slate-500"></span>
</span>
</button>
<label className="flex items-center gap-2 rounded-lg border border-slate-200 bg-slate-50 px-3 py-2.5">
<input
type="checkbox"
checked={customRemoveBackground}
onChange={(event) => setCustomRemoveBackground(event.target.checked)}
disabled={referenceSaving}
className="h-4 w-4 rounded border-slate-300 text-cyan-600 focus:ring-cyan-500"
/>
<span className="text-sm font-medium text-slate-700"></span>
</label>
{customUploadState === "processing" ? (
<div className="rounded-lg border border-cyan-200 bg-cyan-50 px-3 py-2.5">
<div className="flex items-center justify-between gap-3">
<span className="text-sm font-semibold text-cyan-800">...</span>
<span className="h-4 w-4 animate-spin rounded-full border-2 border-cyan-200 border-t-cyan-600" />
</div>
<div className="mt-2 h-1.5 overflow-hidden rounded-full bg-cyan-100">
<div className="h-full w-2/3 animate-pulse rounded-full bg-cyan-500" />
</div>
<p className="mt-2 text-xs text-cyan-700"></p>
</div>
) : null}
{customUploadState === "complete" && createdCustomAvatar ? (
<div className="rounded-lg border border-emerald-200 bg-emerald-50 p-3">
<div className="flex items-center gap-3">
<span className="flex h-16 w-16 shrink-0 items-center justify-center overflow-hidden rounded-lg bg-[linear-gradient(45deg,#e2e8f0_25%,transparent_25%),linear-gradient(-45deg,#e2e8f0_25%,transparent_25%),linear-gradient(45deg,transparent_75%,#e2e8f0_75%),linear-gradient(-45deg,transparent_75%,#e2e8f0_75%)] bg-[length:16px_16px] bg-[position:0_0,0_8px,8px_-8px,-8px_0]">
<img
src={buildApiUrl(`/avatars/${encodeURIComponent(createdCustomAvatar.id)}/preview`)}
alt={createdCustomAvatar.name ?? createdCustomAvatar.id}
className="h-full w-full object-contain"
/>
</span>
<span className="min-w-0">
<span className="block text-sm font-semibold text-emerald-900"></span>
<span className="mt-0.5 block truncate text-xs text-emerald-700">
{createdCustomAvatar.name ?? createdCustomAvatar.id}
</span>
</span>
</div>
</div>
) : null}
</div>
<div className="flex items-center justify-end gap-2 border-t border-slate-100 bg-slate-50 px-4 py-3">
<button
type="button"
onClick={() => setCustomUploadOpen(false)}
onClick={closeCustomUpload}
disabled={referenceSaving || customUploadState === "processing"}
className="rounded-lg border border-slate-200 bg-white px-3 py-2 text-sm font-semibold text-slate-700 transition hover:border-slate-300"
>
</button>
<button
type="button"
onClick={handleCustomUpload}
disabled={referenceSaving || !customFile || !customName.trim()}
className="rounded-lg bg-cyan-600 px-3 py-2 text-sm font-semibold text-white transition hover:bg-cyan-500 disabled:cursor-not-allowed disabled:opacity-60"
>
{referenceSaving ? "创建中..." : "保存形象"}
{customUploadState === "complete" ? "完成" : "取消"}
</button>
{customUploadState !== "complete" ? (
<button
type="button"
onClick={() => void handleCustomUpload()}
disabled={referenceSaving || !customFile || !customName.trim()}
className="rounded-lg bg-cyan-600 px-3 py-2 text-sm font-semibold text-white transition hover:bg-cyan-500 disabled:cursor-not-allowed disabled:opacity-60"
>
{referenceSaving && customRemoveBackground ? "正在抠除背景..." : referenceSaving ? "创建中..." : "保存形象"}
</button>
) : null}
</div>
</div>
</div>

View File

@@ -21,9 +21,11 @@ const DOT_CLASSES: Record<ToastTone, string> = {
// Props for ToastStack.
type ToastStackProps = {
// Toasts to render; the stack renders nothing when this is empty.
toasts: ToastMessage[];
// Called with the toast id when its dismiss button is clicked.
onDismiss: (id: string) => void;
// Called with the toast id when the pointer enters the toast.
// NOTE(review): presumably pauses an auto-dismiss timer owned by the caller; confirm there.
onPause: (id: string) => void;
// Called with the toast id when the pointer leaves the toast.
onResume: (id: string) => void;
};
export function ToastStack({ toasts, onDismiss }: ToastStackProps) {
export function ToastStack({ toasts, onDismiss, onPause, onResume }: ToastStackProps) {
if (toasts.length === 0) return null;
return (
@@ -33,9 +35,11 @@ export function ToastStack({ toasts, onDismiss }: ToastStackProps) {
key={toast.id}
className={`flex items-start gap-2 rounded-lg border px-3 py-2.5 text-sm shadow-lg shadow-slate-200/70 ${TONE_CLASSES[toast.tone]}`}
role="status"
onMouseEnter={() => onPause(toast.id)}
onMouseLeave={() => onResume(toast.id)}
>
<span className={`mt-1.5 h-2 w-2 shrink-0 rounded-full ${DOT_CLASSES[toast.tone]}`} />
<p className="min-w-0 flex-1 leading-relaxed">{toast.message}</p>
<p className="min-w-0 flex-1 whitespace-pre-line break-words leading-relaxed">{toast.message}</p>
<button
type="button"
onClick={() => onDismiss(toast.id)}

View File

@@ -10,6 +10,9 @@ import {
type AvatarSummary,
type ExportVideoItem,
type IndexTTSConfig,
type SceneBackgroundAsset,
type SceneComposition,
type VideoCreationCompositionConfig,
type VoiceCatalogItem,
} from "../lib/api";
import type { VoiceCloneApplication } from "../lib/voiceCloneApply";
@@ -19,12 +22,16 @@ import { buildTTSPreviewPayload, requestTTSPreview } from "../lib/ttsPreview";
export type VideoCreationAudioSource = "upload" | "tts_text" | "voice_clone";
type VideoCreationMode = "spoken_video" | "reference_video";
type VideoCreationOutputAspect = "16:9" | "9:16" | "1:1";
type VoiceOpt = { id: string; label: string; targetModel?: string | null };
type VideoCreationWorkspaceProps = {
avatars: AvatarSummary[];
avatarId: string;
sceneBackgrounds: SceneBackgroundAsset[];
sceneCompositions: SceneComposition[];
selectedSceneIdsByAvatar?: Record<string, string>;
models: string[];
onAvatarChange: (id: string) => void;
onAvatarUploaded: (avatar: AvatarSummary) => void;
@@ -68,6 +75,12 @@ const VIDEO_CREATION_MODEL_LABELS: Record<string, string> = {
quicktalk: "QuickTalk",
wav2lip: "Wav2Lip",
};
// Output presets keyed by aspect ratio. `width`/`height` are the pixel
// dimensions sent as `output_width`/`output_height` in the composition config;
// `previewClassName` is the Tailwind sizing class applied to the preview frame
// and to the result <video> element.
const VIDEO_CREATION_OUTPUT_SIZES = {
"16:9": { label: "16:9", width: 1280, height: 720, previewClassName: "aspect-video w-full" },
"9:16": { label: "9:16", width: 720, height: 1280, previewClassName: "aspect-[9/16] w-[min(100%,22rem)]" },
"1:1": { label: "1:1", width: 1080, height: 1080, previewClassName: "aspect-square w-[min(100%,34rem)]" },
} as const satisfies Record<VideoCreationOutputAspect, { label: string; width: number; height: number; previewClassName: string }>;
// Preset keys in declaration order; used to render the aspect-ratio buttons.
const VIDEO_CREATION_OUTPUT_ASPECTS = Object.keys(VIDEO_CREATION_OUTPUT_SIZES) as VideoCreationOutputAspect[];
const VIDEO_CREATION_SCRIPT_MAX_CHARS = 1000;
const FASTERLIVEPORTRAIT_ANIMATION_REGION_OPTIONS: { id: FasterLivePortraitConfig["animation_region"]; label: string }[] = [
{ id: "lip", label: "嘴部" },
@@ -245,9 +258,16 @@ function avatarNameFromFile(file: File): string {
return stem ? `视频创作 ${stem}` : "视频创作形象";
}
/** Resolve a scene background's `url` field through `buildApiUrl` so it can be used as an <img> source. */
function sceneBackgroundUrl(background: SceneBackgroundAsset): string {
return buildApiUrl(background.url);
}
export function VideoCreationWorkspace({
avatars,
avatarId,
sceneBackgrounds,
sceneCompositions,
selectedSceneIdsByAvatar = {},
models,
onAvatarChange,
onAvatarUploaded,
@@ -285,6 +305,9 @@ export function VideoCreationWorkspace({
const [indexttsConfig, setIndexttsConfig] = useState<IndexTTSConfig>(() => freshIndexTTSConfig());
const [indexttsEmotionAudioFile, setIndexttsEmotionAudioFile] = useState<File | null>(null);
const [activeIndexTTSPresetLabel, setActiveIndexTTSPresetLabel] = useState<string | null>(null);
const [videoBackgroundId, setVideoBackgroundId] = useState<string | null>(null);
const [videoAvatarAdjust, setVideoAvatarAdjust] = useState({ x: 0, y: 0, scale: 1 });
const [videoOutputAspect, setVideoOutputAspect] = useState<VideoCreationOutputAspect>("16:9");
const sourceUploadRef = useRef<HTMLInputElement>(null);
const ttsPreviewAudioRef = useRef<HTMLAudioElement | null>(null);
const ttsPreviewUrlRef = useRef<string | null>(null);
@@ -302,6 +325,59 @@ export function VideoCreationWorkspace({
const showIndexTTSControls = !isReferenceVideoMode && audioSource !== "upload" && INDEXTTS_PROVIDER_SET.has(ttsProvider);
const effectiveIndexTTSConfig = showIndexTTSControls ? buildIndexTTSQualityConfig(indexTTSRequestConfig(indexttsConfig)) : undefined;
const showIndexTTSEmotionStrength = indexttsConfig.emotion_mode !== "voice";
// Active scene composition for the selected avatar: the scene whose id is
// recorded in `selectedSceneIdsByAvatar` for this avatar if it belongs to the
// avatar, otherwise the avatar's first scene, otherwise null.
const selectedScene = useMemo(() => {
if (!selectedAvatar) return null;
const selectedSceneId = selectedSceneIdsByAvatar[selectedAvatar.id];
const avatarScenes = sceneCompositions.filter((scene) => scene.avatar_id === selectedAvatar.id);
return avatarScenes.find((scene) => scene.id === selectedSceneId) ?? avatarScenes[0] ?? null;
}, [sceneCompositions, selectedAvatar, selectedSceneIdsByAvatar]);
// Background asset currently chosen for this job; null when no id is set or
// the id does not match any known background.
const selectedVideoBackground = useMemo(
() => videoBackgroundId ? sceneBackgrounds.find((background) => background.id === videoBackgroundId) ?? null : null,
[sceneBackgrounds, videoBackgroundId],
);
// Placement defaults come from the active scene and fall back to
// center / contain / 1x when there is none.
const videoAvatarAnchor = selectedScene?.avatar_anchor ?? "center";
const videoAvatarFit = selectedScene?.avatar_fit ?? "contain";
const videoAvatarBaseScale = selectedScene?.avatar_scale ?? 1;
// Effective scale = scene base scale multiplied by the one-off slider value.
const videoAvatarDisplayScale = videoAvatarBaseScale * videoAvatarAdjust.scale;
const selectedVideoOutputSize = VIDEO_CREATION_OUTPUT_SIZES[videoOutputAspect];
// Position and size of the avatar layer inside the preview frame, expressed as
// percentages of the output canvas so the preview scales with its container.
const videoAvatarPreviewLayer = useMemo(() => {
const canvasW = selectedVideoOutputSize.width;
const canvasH = selectedVideoOutputSize.height;
// Missing or zero avatar dimensions fall back to the canvas size; clamp to >= 1 to avoid dividing by zero.
const avatarW = Math.max(1, Number(selectedAvatar?.width || canvasW));
const avatarH = Math.max(1, Number(selectedAvatar?.height || canvasH));
// "contain" fits the whole avatar inside the canvas; "cover" fills the canvas.
const containScale = Math.min(canvasW / avatarW, canvasH / avatarH);
const coverScale = Math.max(canvasW / avatarW, canvasH / avatarH);
const fitScale = videoAvatarFit === "cover" ? coverScale : containScale;
const layerW = Math.max(1, avatarW * fitScale * videoAvatarDisplayScale);
const layerH = Math.max(1, avatarH * fitScale * videoAvatarDisplayScale);
// Horizontal origin: pinned to the left/right edge for those anchors, centered otherwise.
const originX = videoAvatarAnchor === "left"
? 0
: videoAvatarAnchor === "right"
? canvasW - layerW
: (canvasW - layerW) / 2;
// Vertical origin: pinned to the bottom edge for "bottom", centered otherwise.
const originY = videoAvatarAnchor === "bottom" ? canvasH - layerH : (canvasH - layerH) / 2;
return {
// The slider offsets are added in output-canvas pixels before converting to percentages.
leftPct: ((originX + videoAvatarAdjust.x) / canvasW) * 100,
topPct: ((originY + videoAvatarAdjust.y) / canvasH) * 100,
widthPct: (layerW / canvasW) * 100,
heightPct: (layerH / canvasH) * 100,
};
}, [selectedAvatar?.height, selectedAvatar?.width, selectedVideoOutputSize.height, selectedVideoOutputSize.width, videoAvatarAdjust.x, videoAvatarAdjust.y, videoAvatarAnchor, videoAvatarDisplayScale, videoAvatarFit]);
// One-off composition settings passed to the generation request as
// `compositionConfig`. Null when no background is selected, so the offsets,
// scale and output size below are only sent together with a background.
const compositionConfig = useMemo<VideoCreationCompositionConfig | null>(() => {
if (!videoBackgroundId) return null;
return {
scene_composition_id: selectedScene?.id ?? null,
background_id: videoBackgroundId,
background_color: selectedScene?.background_color ?? "#ffffff",
avatar_fit: videoAvatarFit,
avatar_anchor: videoAvatarAnchor,
// Combined scale (scene base scale x slider), not the raw slider value.
avatar_scale: videoAvatarDisplayScale,
avatar_offset_x: videoAvatarAdjust.x,
avatar_offset_y: videoAvatarAdjust.y,
output_width: selectedVideoOutputSize.width,
output_height: selectedVideoOutputSize.height,
};
// videoAvatarAdjust.scale is listed although the body only reads it through videoAvatarDisplayScale.
}, [selectedScene?.background_color, selectedScene?.id, selectedVideoOutputSize.height, selectedVideoOutputSize.width, videoAvatarAdjust.scale, videoAvatarAdjust.x, videoAvatarAdjust.y, videoAvatarAnchor, videoAvatarDisplayScale, videoAvatarFit, videoBackgroundId]);
const updateFasterLivePortraitNumber = useCallback((
key: Exclude<keyof FasterLivePortraitConfig, "animation_region" | "flag_stitching" | "flag_pasteback" | "flag_relative_motion" | "flag_normalize_lip" | "flag_lip_retargeting">,
@@ -351,6 +427,11 @@ export function VideoCreationWorkspace({
};
}, []);
// When the selected avatar or its active scene (id or background) changes,
// reseed the background from the scene and discard the one-off
// position/scale adjustments.
useEffect(() => {
setVideoBackgroundId(selectedScene?.background_id ?? null);
setVideoAvatarAdjust({ x: 0, y: 0, scale: 1 });
}, [selectedAvatar?.id, selectedScene?.id, selectedScene?.background_id]);
const handleSourceAsset = useCallback(async (file: File | null) => {
if (!file || !selectedAvatar) return;
const isVideo = file.type.startsWith("video/");
@@ -468,6 +549,7 @@ export function VideoCreationWorkspace({
title,
audioSource: "reference_video",
durationSec: referenceDurationSec,
compositionConfig,
});
setResult(response.export_video);
onExportCreated?.(response.export_video);
@@ -487,6 +569,7 @@ export function VideoCreationWorkspace({
fasterliveportraitConfig: effectiveModel === "fasterliveportrait" ? fasterliveportraitConfig : undefined,
indexttsConfig: effectiveIndexTTSConfig,
indexttsEmotionAudioFile,
compositionConfig,
});
setResult(response.export_video);
onExportCreated?.(response.export_video);
@@ -498,11 +581,11 @@ export function VideoCreationWorkspace({
} finally {
setGenerating(false);
}
}, [audioFile, audioSource, edgeVoice, effectiveIndexTTSConfig, effectiveModel, fasterliveportraitConfig, indexttsConfig.emotion_mode, indexttsEmotionAudioFile, isReferenceVideoMode, models, onExportCreated, onNotify, qwenModel, qwenVoice, referenceDurationSec, selectedAvatar, showIndexTTSControls, text, title, ttsProvider]);
}, [audioFile, audioSource, compositionConfig, edgeVoice, effectiveIndexTTSConfig, effectiveModel, fasterliveportraitConfig, indexttsConfig.emotion_mode, indexttsEmotionAudioFile, isReferenceVideoMode, models, onExportCreated, onNotify, qwenModel, qwenVoice, referenceDurationSec, selectedAvatar, showIndexTTSControls, text, title, ttsProvider]);
return (
<main className="flex min-h-0 flex-1 flex-col bg-slate-100 p-4">
<div className="grid min-h-0 flex-1 gap-4 xl:grid-cols-[20rem_minmax(0,1fr)_22rem]">
<div className="grid min-h-0 flex-1 gap-4 xl:grid-cols-[18rem_minmax(28rem,1fr)_minmax(32rem,42rem)]">
<section className="min-h-0 overflow-y-auto rounded-lg border border-slate-200 bg-white p-4 shadow-sm">
<div className="flex items-center justify-between gap-3">
<div>
@@ -969,27 +1052,168 @@ export function VideoCreationWorkspace({
</button>
{result ? <span className="text-sm font-medium text-emerald-700"></span> : null}
</div>
{result ? (
<div data-testid="video-creation-result-panel" className="mt-6 rounded-lg border border-slate-200 bg-slate-50 p-4">
<div className="flex flex-wrap items-start justify-between gap-3">
<div>
<p className="text-xs font-medium text-slate-500">Result</p>
<h2 className="mt-1 text-base font-semibold text-slate-950"></h2>
</div>
<div className="flex flex-wrap gap-2">
<a href={buildApiDownloadUrl(result.download_url)} download className="rounded-lg bg-cyan-600 px-3 py-1.5 text-xs font-semibold text-white hover:bg-cyan-500"></a>
<button type="button" onClick={onGoAssetLibrary} className="rounded-lg border border-slate-200 bg-white px-3 py-1.5 text-xs font-semibold text-slate-700 hover:border-cyan-200 hover:text-cyan-700"></button>
</div>
</div>
<video src={buildApiDownloadUrl(result.download_url)} className={`mt-3 mx-auto rounded-lg bg-slate-950 object-contain ${selectedVideoOutputSize.previewClassName}`} controls preload="metadata" />
<div className="mt-3 rounded-lg bg-white p-3 text-xs text-slate-600">
<p className="font-semibold text-slate-800">{result.title}</p>
<p className="mt-1 break-all font-mono text-[11px]">{result.path}</p>
</div>
</div>
) : null}
</div>
</section>
<aside className="flex min-h-0 flex-col rounded-lg border border-slate-200 bg-white p-4 shadow-sm">
<p className="text-xs font-medium text-slate-500">Result</p>
<h2 className="mt-1 text-base font-semibold text-slate-950"></h2>
{result ? (
<div className="mt-4 space-y-3">
<video src={buildApiDownloadUrl(result.download_url)} className="aspect-video w-full rounded-lg bg-slate-950 object-contain" controls preload="metadata" />
<div className="rounded-lg bg-slate-50 p-3 text-xs text-slate-600">
<p className="font-semibold text-slate-800">{result.title}</p>
<p className="mt-1 break-all font-mono text-[11px]">{result.path}</p>
<aside className="flex min-h-0 flex-col overflow-y-auto rounded-lg border border-slate-200 bg-white p-4 shadow-sm">
<p className="text-xs font-medium text-slate-500">Composition</p>
<h2 className="mt-1 text-base font-semibold text-slate-950"></h2>
<div className="mt-4 shrink-0 overflow-hidden rounded-lg border border-slate-200 bg-slate-950 p-3">
<div className="mb-2 flex items-center justify-between gap-3">
<div>
<p className="text-xs font-semibold text-white/55"></p>
<h3 className="text-sm font-semibold text-white"></h3>
</div>
<div className="flex flex-wrap gap-2">
<a href={buildApiDownloadUrl(result.download_url)} download className="rounded-lg bg-cyan-600 px-3 py-1.5 text-xs font-semibold text-white hover:bg-cyan-500"></a>
<button type="button" onClick={onGoAssetLibrary} className="rounded-lg border border-slate-200 bg-white px-3 py-1.5 text-xs font-semibold text-slate-700 hover:border-cyan-200 hover:text-cyan-700"></button>
<span className="rounded-md border border-white/15 bg-white/10 px-2 py-0.5 text-[11px] font-semibold text-white/75">
{selectedVideoOutputSize.width}x{selectedVideoOutputSize.height}
</span>
</div>
<div
className={`relative mx-auto overflow-hidden rounded-md border border-white/10 bg-white ${selectedVideoOutputSize.previewClassName}`}
style={{ backgroundColor: selectedScene?.background_color ?? "#f8fafc" }}
>
{selectedVideoBackground?.kind === "image" ? (
<img src={sceneBackgroundUrl(selectedVideoBackground)} alt={selectedVideoBackground.name} className="absolute inset-0 h-full w-full object-cover" />
) : null}
{selectedVideoBackground?.kind === "video" ? (
<div className="absolute inset-0 flex items-center justify-center bg-slate-900 px-4 text-center text-xs font-medium text-white/80">
</div>
) : null}
{!selectedVideoBackground ? (
<div className="absolute inset-0 bg-white" />
) : null}
{selectedAvatar ? (
<div
className="absolute"
style={{
left: `${videoAvatarPreviewLayer.leftPct}%`,
top: `${videoAvatarPreviewLayer.topPct}%`,
width: `${videoAvatarPreviewLayer.widthPct}%`,
height: `${videoAvatarPreviewLayer.heightPct}%`,
}}
>
<img
src={buildApiUrl(`/avatars/${encodeURIComponent(selectedAvatar.id)}/preview`)}
alt={selectedAvatar.name ?? selectedAvatar.id}
className="absolute inset-0 h-full w-full object-fill"
/>
</div>
) : null}
<div className="pointer-events-none absolute inset-x-5 bottom-5 rounded border border-white/35 bg-slate-950/35 px-3 py-1 text-center text-xs font-semibold text-white/80">
</div>
</div>
) : (
<div className="mt-4 flex min-h-[18rem] items-center justify-center rounded-lg border border-dashed border-slate-300 bg-slate-50 text-sm font-medium text-slate-500"></div>
)}
</div>
<div data-testid="video-creation-composition-controls" className="mt-3 space-y-3 rounded-lg border border-slate-200 bg-slate-50 p-3">
<div>
<p className="mb-1.5 text-xs font-semibold text-slate-700"></p>
<div className="grid grid-cols-3 gap-2">
{VIDEO_CREATION_OUTPUT_ASPECTS.map((aspect) => {
const option = VIDEO_CREATION_OUTPUT_SIZES[aspect];
const active = aspect === videoOutputAspect;
return (
<button
key={aspect}
type="button"
onClick={() => setVideoOutputAspect(aspect)}
className={`rounded-md border px-2 py-1.5 text-xs font-semibold transition-colors ${
active
? "border-cyan-500 bg-cyan-50 text-cyan-700"
: "border-slate-200 bg-white text-slate-600 hover:border-cyan-200 hover:text-cyan-700"
}`}
>
{option.label}
</button>
);
})}
</div>
</div>
<label className="block text-xs font-semibold text-slate-700">
<select
value={videoBackgroundId ?? ""}
onChange={(event) => setVideoBackgroundId(event.target.value || null)}
className="mt-1 w-full rounded-md border border-slate-200 bg-white px-2 py-1.5 text-xs font-medium text-slate-700"
>
<option value="">使</option>
{sceneBackgrounds.map((background) => (
<option key={background.id} value={background.id}>{background.name}</option>
))}
</select>
</label>
<label className="block text-xs font-medium text-slate-600">
<span className="mb-1 flex items-center justify-between gap-2">
<span></span>
<span className="tabular-nums">{videoAvatarAdjust.x}px</span>
</span>
<input
type="range"
min="-800"
max="800"
step="4"
value={videoAvatarAdjust.x}
onChange={(event) => setVideoAvatarAdjust((current) => ({ ...current, x: Number(event.target.value) }))}
className="w-full accent-cyan-600"
/>
</label>
<label className="block text-xs font-medium text-slate-600">
<span className="mb-1 flex items-center justify-between gap-2">
<span></span>
<span className="tabular-nums">{videoAvatarAdjust.y}px</span>
</span>
<input
type="range"
min="-600"
max="600"
step="4"
value={videoAvatarAdjust.y}
onChange={(event) => setVideoAvatarAdjust((current) => ({ ...current, y: Number(event.target.value) }))}
className="w-full accent-cyan-600"
/>
</label>
<label className="block text-xs font-medium text-slate-600">
<span className="mb-1 flex items-center justify-between gap-2">
<span></span>
<span className="tabular-nums">{videoAvatarDisplayScale.toFixed(2)}x</span>
</span>
<input
type="range"
min="0.2"
max="3"
step="0.02"
value={videoAvatarAdjust.scale}
onChange={(event) => setVideoAvatarAdjust((current) => ({ ...current, scale: Number(event.target.value) }))}
className="w-full accent-cyan-600"
/>
</label>
<button
type="button"
onClick={() => setVideoAvatarAdjust({ x: 0, y: 0, scale: 1 })}
className="w-full rounded-lg border border-slate-200 bg-white px-3 py-1.5 text-xs font-semibold text-slate-700 transition hover:border-cyan-200 hover:text-cyan-700"
>
</button>
</div>
</aside>
</div>

View File

@@ -332,6 +332,19 @@ export type VideoCreationJobResponse = {
export_video: ExportVideoItem;
};
/**
 * Per-job composition settings for video creation. `createVideoCreationJob`
 * serializes this object as JSON into the `composition_config` form field.
 * All fields are optional.
 */
export type VideoCreationCompositionConfig = {
// Id of the scene composition the settings were derived from, if any.
scene_composition_id?: string | null;
// Id of the scene background asset to composite behind the avatar.
background_id?: string | null;
// Background color string (the workspace sends values such as "#ffffff").
background_color?: string;
avatar_fit?: "contain" | "cover";
avatar_anchor?: "center" | "bottom" | "left" | "right";
// Scale multiplier applied to the fitted avatar.
avatar_scale?: number;
// Offsets from the anchored position; the workspace UI labels them in px.
avatar_offset_x?: number;
avatar_offset_y?: number;
// Output frame size; the workspace sends its preset pixel dimensions.
output_width?: number;
output_height?: number;
};
export type CreateVideoCreationJobInput = {
model: string;
avatarId: string;
@@ -346,6 +359,7 @@ export type CreateVideoCreationJobInput = {
fasterliveportraitConfig?: Record<string, unknown>;
indexttsConfig?: IndexTTSConfig;
indexttsEmotionAudioFile?: File | null;
compositionConfig?: VideoCreationCompositionConfig | null;
};
export async function createVideoCreationJob(input: CreateVideoCreationJobInput): Promise<VideoCreationJobResponse> {
@@ -373,6 +387,9 @@ export async function createVideoCreationJob(input: CreateVideoCreationJobInput)
if (input.indexttsEmotionAudioFile) {
form.set("indextts_emotion_audio_file", input.indexttsEmotionAudioFile);
}
if (input.compositionConfig) {
form.set("composition_config", JSON.stringify(input.compositionConfig));
}
return apiPostForm<VideoCreationJobResponse>("/video-creation/jobs", form);
}

View File

@@ -155,10 +155,16 @@ mode. The single-process unified mode (`opentalking-unified`) ignores all entrie
| `OPENTALKING_WORKER_URL` | `http://127.0.0.1:9001` | URL through which the API reaches the Worker. |
| `OPENTALKING_TORCH_DEVICE` | `cpu` | Device used for orchestration-side audio and frame post-processing. |
| `OPENTALKING_AVATARS_DIR` | `./examples/avatars` | Avatar bundle root directory. |
| `OPENTALKING_AVATAR_MATTING_PROVIDER` | `rembg` | Optional matting provider for custom avatar uploads; called only when the upload dialog option is enabled. |
| `OPENTALKING_AVATAR_MATTING_DEVICE` | `cpu` | Reserved device setting for matting providers. |
| `OPENTALKING_AVATAR_MATTING_MODEL_PATH` | empty | Local `u2net.onnx` model file path for the `rembg` provider. |
| `OPENTALKING_AVATAR_MATTING_TIMEOUT_SEC` | `60` | Reserved timeout setting for matting providers. |
| `OPENTALKING_VOICES_DIR` | `./var/voices` | Storage for cloned voices. |
| `OPENTALKING_SQLITE_PATH` | `./data/opentalking.sqlite3` | Local metadata database file. |
| `OPENTALKING_CORS_ORIGINS` | `http://localhost:5173,http://127.0.0.1:5173` | Comma-separated list of permitted frontend origins. |
Custom avatar uploads do not remove the background by default. PNG uploads with an existing alpha channel are detected automatically as transparent-ready. To enable the local `rembg` provider, install the optional dependency first: `uv pip install --python .venv/bin/python '.[avatar-matting]'`. The `rembg` provider does not download models at runtime; download `u2net.onnx` ahead of time and point `OPENTALKING_AVATAR_MATTING_MODEL_PATH` at that file.
## 4. Advanced tuning
The variables in this section are intended for fine-grained control over specific

View File

@@ -0,0 +1,170 @@
# Video Creation Composition Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Add a pre-generation scene composition preview to Video Creation and use the one-off composition settings when exporting generated videos.
**Architecture:** The frontend derives a selected avatar scene from App-level scene assets and sends a one-off `composition_config` with video creation jobs. The backend validates the config and post-processes generated frames over an image background before writing the video.
**Tech Stack:** React + TypeScript frontend, FastAPI multipart route, Python video creation service, OpenCV/NumPy/Pillow-style frame processing.
## Global Constraints
- Video Creation composition adjustments only affect the current generation job.
- Do not write changes back to Scene Assets in this version.
- If no avatar scene/background exists, keep current Video Creation behavior.
- Reject video backgrounds for Video Creation in this first version.
- Local runtime is unavailable; commit locally, sync via git bundle, and run verification on `8.92.9.220:/home/ly/opentalking`.
---
### Task 1: Backend Composition Config Parsing
**Files:**
- Modify: `apps/api/routes/video_creation.py`
- Test: `apps/api/tests/test_video_creation.py`
**Interfaces:**
- Produces: `_parse_video_composition_config(raw: str | None) -> dict[str, object] | None`
- Produces: optional `composition_config` argument passed to `VideoCreationService.create_from_audio_file`, `create_from_tts_text`, and `create_reference_video`
- [ ] **Step 1: Write failing route tests**
Add tests that post `composition_config` JSON to `/video-creation/jobs` and assert the fake service receives a dict. Add an invalid JSON test expecting HTTP 400.
- [ ] **Step 2: Run route tests to verify they fail**
Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composition_config" -q`
- [ ] **Step 3: Implement parser and route forwarding**
Add a FastAPI `Form(default=None)` field named `composition_config`, parse JSON, require a JSON object, and pass the parsed dict to service calls.
- [ ] **Step 4: Run route tests to verify they pass**
Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composition_config" -q`
### Task 2: Backend Frame Compositing
**Files:**
- Modify: `opentalking/video_creation.py`
- Test: `apps/api/tests/test_video_creation.py`
**Interfaces:**
- Consumes: `composition_config: Mapping[str, object] | None`
- Produces: `_normalize_video_composition_config(settings: object, config: Mapping[str, object] | None) -> dict[str, object] | None`
- Produces: `_apply_video_composition(frames: list[np.ndarray], *, config: Mapping[str, object] | None) -> list[np.ndarray]`
- [ ] **Step 1: Write failing service tests**
Add a test that creates a temporary scene background image, sends composition config to `VideoCreationService.create_from_audio_file`, monkeypatches `_write_video_only`, and asserts written frames contain the background color behind a transparent RGBA generated frame.
- [ ] **Step 2: Run service test to verify it fails**
Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composites_generated_frames" -q`
- [ ] **Step 3: Implement minimal composition**
In `_create_from_pcm`, normalize `composition_config` before rendering, apply it before `_write_video_only`, support image backgrounds only, and raise `ValueError("video backgrounds are not supported for video creation")` for video backgrounds.
- [ ] **Step 4: Run service tests to verify they pass**
Run on server after sync: `uv run pytest apps/api/tests/test_video_creation.py -k "composition" -q`
### Task 3: Frontend API and App Data Flow
**Files:**
- Modify: `apps/web/src/lib/api.ts`
- Modify: `apps/web/src/App.tsx`
- Modify: `apps/web/src/components/VideoCreationWorkspace.tsx`
- Test: `tests/unit/test_local_audio_frontend.py`
**Interfaces:**
- Produces: `VideoCreationCompositionConfig` TypeScript type
- Produces: `compositionConfig?: VideoCreationCompositionConfig | null` on `CreateVideoCreationJobInput`
- Consumes: `sceneBackgrounds`, `sceneCompositions`, `selectedSceneIdsByAvatar` props in `VideoCreationWorkspace`
- [ ] **Step 1: Write failing frontend text tests**
Add assertions that `VideoCreationWorkspace` receives scene props from `App.tsx`, defines `compositionConfig`, renders `生成前预览`, and appends `composition_config` in `createVideoCreationJob`.
- [ ] **Step 2: Run frontend text tests to verify they fail**
Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation" -q`
- [ ] **Step 3: Implement API and prop plumbing**
Add the TypeScript composition type, JSON form field, and pass App scene state into `VideoCreationWorkspace`.
- [ ] **Step 4: Run frontend text tests to verify they pass**
Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation" -q`
### Task 4: Frontend Preview and Controls
**Files:**
- Modify: `apps/web/src/components/VideoCreationWorkspace.tsx`
- Test: `tests/unit/test_local_audio_frontend.py`
**Interfaces:**
- Consumes: scene props and `buildApiUrl`
- Produces: one-off local state `{ backgroundId, backgroundColor, avatarFit, avatarAnchor, avatarScale, avatarOffsetX, avatarOffsetY }`
- [ ] **Step 1: Write failing UI string tests**
Assert `VideoCreationWorkspace.tsx` contains `生成前预览`, `本次生成`, `水平位置`, `垂直位置`, `人物缩放`, and `compositionConfig`.
- [ ] **Step 2: Run tests to verify they fail**
Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation_composition" -q`
- [ ] **Step 3: Implement preview and controls**
Show the selected background image when present, show the selected avatar preview above it, provide sliders for X/Y/scale, reset controls, and build `compositionConfig` for generation. Keep fallback copy for no background.
- [ ] **Step 4: Run tests to verify they pass**
Run on server after sync: `uv run pytest tests/unit/test_local_audio_frontend.py -k "video_creation_composition" -q`
### Task 5: Server Sync and Verification
**Files:**
- No code files; sync and run commands.
**Interfaces:**
- Consumes: local commits on `codex/video-creation-composition`
- Produces: matching server git history and verification output
- [ ] **Step 1: Commit local changes**
Run: `git add docs apps opentalking tests && git commit -m "Add video creation scene composition"`
- [ ] **Step 2: Create and upload git bundle**
Run locally: `git bundle create /tmp/video-creation-composition.bundle HEAD`
Upload to server (export the password as `SSHPASS` first; never commit credentials to the repository): `sshpass -e scp /tmp/video-creation-composition.bundle ly@8.92.9.220:/tmp/video-creation-composition.bundle`
- [ ] **Step 3: Apply bundle on server**
Run on server: `cd /home/ly/opentalking && git fetch /tmp/video-creation-composition.bundle codex/video-creation-composition:codex/video-creation-composition && git switch codex/video-creation-composition`
- [ ] **Step 4: Run verification**
Run on server:
```bash
cd /home/ly/opentalking
uv run mypy opentalking/core opentalking/events opentalking/avatar apps/api apps/unified apps/cli --ignore-missing-imports
uv run pytest apps/api/tests/test_video_creation.py tests/unit/test_local_audio_frontend.py -q
```
- [ ] **Step 5: Start service for manual review**
Run on server:
```bash
cd /home/ly/opentalking
bash scripts/quickstart/stop_all.sh || true
bash scripts/start_unified.sh --mock --api-port 8211 --web-port 5281
```

View File

@@ -0,0 +1,108 @@
# Video Creation Composition Design
## Goal
Bring scene backgrounds and avatar placement into the Video Creation workflow so offline generated videos can match the visual composition users preview before generation.
## Product Decision
Video Creation uses a pre-generation composition preview, not an immersive conversation mode. The page is an offline production workspace: users choose an avatar, script or audio, model settings, and then confirm the final frame composition before clicking generate.
Composition adjustments in Video Creation are one-off job settings. They must not update the active realtime conversation scene or mutate Scene Asset compositions unless the user explicitly uses a future save action.
## User Experience
When a user selects an avatar on the Video Creation page:
- If the avatar has an active scene composition, the preview uses that scene's background, background color, avatar fit, anchor, and scale.
- If the avatar has no active scene composition, Video Creation keeps the existing default avatar-only behavior.
- The user can adjust horizontal position, vertical position, and scale for this generation.
- The preview shows background plus avatar in an output-frame container before generation.
- Reset returns the one-off transform to `x: 0`, `y: 0`, `scale: 1`.
The first version does not add a write-back action to Scene Assets. It only sends composition data with the video creation job.
## Frontend Architecture
`App.tsx` already owns scene backgrounds, scene compositions, and selected scene ids by avatar. Video Creation should receive those values as props and derive the current avatar's active scene with the same rule as realtime conversation:
1. Use `selectedSceneIdsByAvatar[avatarId]` if it points to a scene for the selected avatar.
2. Otherwise use the first composition whose `avatar_id` matches the selected avatar.
3. Otherwise use no scene.
`VideoCreationWorkspace.tsx` adds:
- Props for `sceneBackgrounds`, `sceneCompositions`, and `selectedSceneIdsByAvatar`.
- Local state for `videoAvatarAdjust: { x: number; y: number; scale: number }`.
- A composition preview panel using the selected scene and background data.
- Background selection for this job only.
- X, Y, and scale controls for this job only.
- A `videoComposition` payload passed to `createVideoCreationJob`.
The UI stays work-focused and data-dense. The preview is prominent, but controls remain visible because this is a production page rather than an immersive live demo page.
## API Contract
`createVideoCreationJob` accepts an optional multipart field named `composition_config`. The field is JSON:
```json
{
"scene_composition_id": "scene-example",
"background_id": "bg-example",
"background_color": "#ffffff",
"avatar_fit": "contain",
"avatar_anchor": "center",
"avatar_scale": 1.1,
"avatar_offset_x": 80,
"avatar_offset_y": -24
}
```
Every field is optional. To have any effect, however, a composition must include at least a `background_id` or a non-empty scene-derived `background_color`. If the `composition_config` field is absent, the current video generation behavior remains unchanged.
Validation rules:
- `background_id` must resolve under the configured scene assets directory when present.
- `avatar_fit` must be `contain` or `cover`.
- `avatar_anchor` must be `center`, `bottom`, `left`, or `right`.
- `avatar_scale` must be between `0.1` and `4.0`.
- Offsets are pixel values and must be between `-2000` and `2000`.
- Video backgrounds are not composited in the first version; the backend rejects a video background for Video Creation with a clear message.
## Backend Architecture
`VideoCreationService` accepts `composition_config` on audio upload, TTS text, and reference video generation.
The first version applies composition after model frame generation and before writing `video_only.mp4`:
1. Generate avatar frames as today.
2. If no composition config or no image background is provided, write frames unchanged.
3. If an image background is provided, resize/crop it to the generated frame size.
4. Place each generated frame over the background according to fit, anchor, base scale, and one-off offsets.
5. Use alpha blending if generated frames contain an alpha channel; otherwise paste the RGB frame as an opaque layer.
6. Continue muxing audio and export metadata as today.
This keeps model-specific audio-to-video logic untouched and makes composition a reusable post-processing step.
## Testing
Backend tests cover:
- API route parses `composition_config` and passes it to `VideoCreationService`.
- Invalid JSON is rejected with HTTP 400.
- Service rejects unknown or video backgrounds for Video Creation.
- Service composites generated RGBA frames over an image background with scale and offsets.
- Existing jobs without composition still behave as before.
Frontend text-level tests cover:
- Video Creation receives scene data from `App.tsx`.
- `VideoCreationWorkspace.tsx` exposes the composition preview and controls.
- `createVideoCreationJob` sends `composition_config`.
## Non-Goals
- No immersive mode on Video Creation in this first version.
- No automatic write-back to Scene Assets.
- No video-background compositing in Video Creation yet.
- No subtitle rendering into generated videos in this first version.

View File

@@ -145,10 +145,16 @@ FlashHead 使用专属 WebSocket 协议,不经过 OmniRT。
| `OPENTALKING_WORKER_URL` | `http://127.0.0.1:9001` | API 访问 Worker 时使用的 URL。 |
| `OPENTALKING_TORCH_DEVICE` | `cpu` | 编排侧音频与帧后处理使用的设备。 |
| `OPENTALKING_AVATARS_DIR` | `./examples/avatars` | Avatar bundle 根目录。 |
| `OPENTALKING_AVATAR_MATTING_PROVIDER` | `rembg` | 自定义形象上传时的可选抠图 provider,仅在上传弹窗勾选后调用。 |
| `OPENTALKING_AVATAR_MATTING_DEVICE` | `cpu` | 预留给抠图 provider 的设备配置。 |
| `OPENTALKING_AVATAR_MATTING_MODEL_PATH` | 空 | `rembg` provider 的本地 `u2net.onnx` 模型文件路径。 |
| `OPENTALKING_AVATAR_MATTING_TIMEOUT_SEC` | `60` | 预留给抠图 provider 的超时配置。 |
| `OPENTALKING_VOICES_DIR` | `./var/voices` | 声音复刻存储目录。 |
| `OPENTALKING_SQLITE_PATH` | `./data/opentalking.sqlite3` | 本地元数据数据库文件。 |
| `OPENTALKING_CORS_ORIGINS` | `http://localhost:5173,http://127.0.0.1:5173` | 允许的前端 origin,逗号分隔。 |
自定义形象上传默认不会抠除背景;PNG 自带透明通道时会自动识别为可透明合成。若要启用本地 `rembg` provider,先安装可选依赖:`uv pip install --python .venv/bin/python '.[avatar-matting]'`。`rembg` 不会在运行时自动下载模型,请预先下载 `u2net.onnx`,并将 `OPENTALKING_AVATAR_MATTING_MODEL_PATH` 指向该文件。
## 4. 进阶调优 {#4}
本节变量针对特定后端的细粒度控制。完整列表参见 `.env.example`。下列为代表性条目。

Binary file not shown.

After

Width:  |  Height:  |  Size: 309 KiB

View File

@@ -0,0 +1,39 @@
from __future__ import annotations
from PIL import Image
from .alpha import image_has_transparency
from .base import AvatarMattingProvider, MattingError
from .rembg_provider import RembgMattingProvider
# Registry of avatar matting providers: maps a lowercase provider name to the
# class that is instantiated (with no arguments) when that name is requested.
_PROVIDER_FACTORIES = {
"rembg": RembgMattingProvider,
}
def resolve_avatar_matting_provider(provider_name: str) -> AvatarMattingProvider:
    """Instantiate the matting provider registered under ``provider_name``.

    The name is trimmed and lower-cased before lookup; a falsy name selects
    ``"rembg"``.

    Raises:
        MattingError: If nothing is registered under the normalized name. The
            message lists the supported provider names.
    """
    normalized = (provider_name or "rembg").strip().lower()
    provider_cls = _PROVIDER_FACTORIES.get(normalized)
    if provider_cls is not None:
        return provider_cls()
    supported = ", ".join(sorted(_PROVIDER_FACTORIES))
    raise MattingError(f"unsupported avatar matting provider: {provider_name!r}; supported: {supported}")
def remove_avatar_background(
    image: Image.Image,
    *,
    provider_name: str = "rembg",
    settings: object | None = None,
) -> tuple[Image.Image, str]:
    """Remove the background of ``image`` with the named matting provider.

    Args:
        image: Source avatar image.
        provider_name: Registered provider to use.
        settings: Opaque settings object forwarded to the provider.

    Returns:
        A ``(result_image, provider_name)`` pair, where the name is the
        ``name`` attribute of the provider that produced the result.

    Raises:
        MattingError: If the provider is unknown (provider failures are
            whatever the provider itself raises).
    """
    matting_provider = resolve_avatar_matting_provider(provider_name)
    cutout = matting_provider.remove_background(image, settings=settings)
    return cutout, matting_provider.name
# Public names of this package, i.e. what ``from <package> import *`` exposes.
__all__ = [
"AvatarMattingProvider",
"MattingError",
"image_has_transparency",
"remove_avatar_background",
"resolve_avatar_matting_provider",
]

View File

@@ -0,0 +1,12 @@
from __future__ import annotations
from PIL import Image
def image_has_transparency(image: Image.Image) -> bool:
if "A" not in image.getbands():
return False
low, high = image.getchannel("A").getextrema()
if not isinstance(low, int | float) or not isinstance(high, int | float):
return False
return low < 255 or high < 255

View File

@@ -0,0 +1,16 @@
from __future__ import annotations
from typing import Protocol
from PIL import Image
class MattingError(RuntimeError):
    """Signals that an avatar matting provider could not finish its work."""
class AvatarMattingProvider(Protocol):
    """Structural interface for avatar background-removal providers."""

    # Identifier of the provider.
    name: str

    def remove_background(self, image: Image.Image, *, settings: object | None = None) -> Image.Image:
        """Produce a version of ``image`` whose background is transparent."""
        ...

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
import os
from contextlib import contextmanager
from io import BytesIO
from pathlib import Path
from typing import Iterator
from PIL import Image
from .base import MattingError
# MD5 digest recorded for the u2net.onnx model file. NOTE(review): nothing in
# the code visible here verifies the configured file against it — confirm
# whether that is intentional.
REMBG_U2NET_MODEL_MD5 = "60024c5c889badc19c04ad937298a77b"
# Download location quoted in the "model not found" message shown to users.
REMBG_U2NET_MODEL_URL = "https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx"
def _configured_model_path(settings: object | None) -> Path | None:
value = str(getattr(settings, "avatar_matting_model_path", "") or "").strip()
return Path(value).expanduser() if value else None
def _missing_model_message() -> str:
    """Build the user-facing hint shown when the u2net.onnx model is unavailable.

    The text names the model, gives the download URL, and points at the
    ``OPENTALKING_AVATAR_MATTING_MODEL_PATH`` setting.
    """
    parts = (
        "未找到抠除背景模型 u2net.onnx。\n",
        f"请下载模型:{REMBG_U2NET_MODEL_URL}\n",
        "然后在 .env 中配置 OPENTALKING_AVATAR_MATTING_MODEL_PATH。",
    )
    return "".join(parts)
def _validate_model_path(settings: object | None) -> Path:
    """Return the resolved path of the configured u2net.onnx model file.

    Raises:
        MattingError: If no path is configured, the path is not an existing
            file, or the file is not named ``u2net.onnx``.
    """
    candidate = _configured_model_path(settings)
    if candidate is not None and candidate.is_file():
        if candidate.name == "u2net.onnx":
            return candidate.resolve()
        raise MattingError("OPENTALKING_AVATAR_MATTING_MODEL_PATH 必须指向 u2net.onnx 文件。")
    raise MattingError(_missing_model_message())
@contextmanager
def _u2net_home_for_model(model_path: Path) -> Iterator[None]:
previous = os.environ.get("U2NET_HOME")
os.environ["U2NET_HOME"] = str(model_path.parent)
try:
yield
finally:
if previous is None:
os.environ.pop("U2NET_HOME", None)
else:
os.environ["U2NET_HOME"] = previous
class RembgMattingProvider:
    """Avatar matting provider that delegates to the ``rembg`` package."""

    name = "rembg"

    def remove_background(self, image: Image.Image, *, settings: object | None = None) -> Image.Image:
        """Return ``image`` as RGBA with its background removed by rembg.

        The configured model path is validated before ``rembg`` is imported,
        so a missing model is reported even when rembg is not installed.

        Raises:
            MattingError: If the model path is invalid, ``rembg`` cannot be
                imported, or rembg / decoding of its output fails.
        """
        model_path = _validate_model_path(settings)
        try:
            from rembg import remove
        except ImportError as exc:
            raise MattingError(
                "rembg is not installed; install the avatar matting extra or choose another provider"
            ) from exc

        # rembg is fed PNG bytes; encoding happens outside the guarded section.
        encoded = BytesIO()
        image.convert("RGBA").save(encoded, format="PNG")
        png_bytes = encoded.getvalue()
        try:
            with _u2net_home_for_model(model_path):
                matted_bytes = remove(png_bytes)
            cutout = Image.open(BytesIO(matted_bytes))
            # Force decoding now so a corrupt payload fails inside this try block.
            cutout.load()
        except Exception as exc:  # noqa: BLE001
            raise MattingError(f"rembg failed: {exc}") from exc
        return cutout.convert("RGBA")

View File

@@ -24,6 +24,12 @@ def _flatten_config(raw: dict[str, Any] | None) -> dict[str, Any]:
"models_dir": "models_dir",
"worker_url": "worker_url",
},
"avatar": {
"matting_provider": "avatar_matting_provider",
"matting_device": "avatar_matting_device",
"matting_model_path": "avatar_matting_model_path",
"matting_timeout_sec": "avatar_matting_timeout_sec",
},
"flashtalk": {
"ws_url": "flashtalk_ws_url",
"ckpt_dir": "flashtalk_ckpt_dir",
@@ -355,6 +361,10 @@ class Settings(BaseSettings):
export_max_bytes: int = 1024 * 1024 * 1024
video_creation_audio_max_bytes: int = 50 * 1024 * 1024
video_creation_fasterliveportrait_preroll_ms: int = 400
avatar_matting_provider: str = "rembg"
avatar_matting_device: str = "cpu"
avatar_matting_model_path: str = ""
avatar_matting_timeout_sec: int = 60
flashtalk_ws_url: str = ""
flashtalk_ckpt_dir: str = "./models/SoulX-FlashTalk-14B"

View File

@@ -5,6 +5,7 @@ import re
import shutil
import uuid
from datetime import datetime, timezone
from importlib import resources
from pathlib import Path
from typing import Any
@@ -23,6 +24,15 @@ EXT_BY_MIME = {
VALID_AVATAR_FITS = {"contain", "cover"}
VALID_AVATAR_ANCHORS = {"center", "bottom", "left", "right"}
VALID_SUBTITLE_STYLES = {"none", "compact", "lower-third"}
# Built-in scene backgrounds that SceneAssetStore can seed into a store.
# Each entry carries the background id, display name, original filename,
# MIME type, and the package-relative resource path of the image file.
DEFAULT_BACKGROUNDS = (
{
"id": "bg-default-data-wall",
"name": "数据玻璃幕墙",
"filename": "default-data-wall.jpg",
"mime_type": "image/jpeg",
"resource": "assets/scene_backgrounds/default-data-wall.jpg",
},
)
def sniff_background_mime(content: bytes) -> str | None:
@@ -68,17 +78,60 @@ def _write_json(path: Path, payload: Any) -> None:
class SceneAssetStore:
def __init__(self, root: Path) -> None:
def __init__(self, root: Path, *, seed_defaults: bool = False) -> None:
self.root = root.expanduser().resolve()
self.backgrounds_dir = self.root / "backgrounds"
self.compositions_dir = self.root / "compositions"
self.background_index_path = self.backgrounds_dir / "index.json"
self.composition_index_path = self.compositions_dir / "index.json"
self.seed_defaults = seed_defaults
self.background_seed_marker_path = self.backgrounds_dir / ".defaults_seeded"
def list_backgrounds(self) -> list[dict[str, object]]:
    """Return the indexed backgrounds, seeding bundled defaults first if enabled."""
    self._seed_default_backgrounds()
    backgrounds = self._load_backgrounds()
    return backgrounds
def _load_backgrounds(self) -> list[dict[str, object]]:
    """Read the background index file, keeping only entries that are dicts.

    Unlike ``list_backgrounds`` this never triggers default seeding.
    """
    entries = _read_json(self.background_index_path, [])
    backgrounds: list[dict[str, object]] = []
    for entry in entries:
        if isinstance(entry, dict):
            backgrounds.append(entry)
    return backgrounds
def _seed_default_backgrounds(self) -> None:
if not self.seed_defaults or self.background_seed_marker_path.exists():
return
items = self._load_backgrounds()
existing_ids = {str(item.get("id") or "") for item in items}
seeded: list[dict[str, object]] = []
now = _now()
for default in DEFAULT_BACKGROUNDS:
background_id = str(default["id"])
if background_id in existing_ids:
continue
resource_path = resources.files("opentalking").joinpath(str(default["resource"]))
try:
content = resource_path.read_bytes()
except FileNotFoundError:
continue
ext = EXT_BY_MIME[str(default["mime_type"])]
media_path = self.backgrounds_dir / background_id / f"source{ext}"
media_path.parent.mkdir(parents=True, exist_ok=True)
media_path.write_bytes(content)
seeded.append(
{
"id": background_id,
"name": str(default["name"]),
"kind": "image",
"mime_type": str(default["mime_type"]),
"filename": str(default["filename"]),
"size_bytes": len(content),
"url": f"/scene-assets/backgrounds/{background_id}/file",
"created_at": now,
}
)
if seeded:
_write_json(self.background_index_path, [*items, *seeded])
self.background_seed_marker_path.parent.mkdir(parents=True, exist_ok=True)
self.background_seed_marker_path.write_text(now + "\n", encoding="utf-8")
def create_background(self, *, content: bytes, filename: str, mime_type: str, name: str) -> dict[str, object]:
normalized_mime = (mime_type or "").split(";")[0].strip().lower()
if not content:
@@ -102,7 +155,7 @@ class SceneAssetStore:
"url": f"/scene-assets/backgrounds/{background_id}/file",
"created_at": _now(),
}
items = [entry for entry in self.list_backgrounds() if entry.get("id") != background_id]
items = [entry for entry in self._load_backgrounds() if entry.get("id") != background_id]
items.insert(0, item)
_write_json(self.background_index_path, items)
return item
@@ -126,7 +179,7 @@ class SceneAssetStore:
def delete_background(self, background_id: str) -> bool:
if not re.fullmatch(r"bg-[\w\u4e00-\u9fff-]+", background_id or ""):
return False
items = self.list_backgrounds()
items = self._load_backgrounds()
next_items = [item for item in items if item.get("id") != background_id]
if len(next_items) == len(items):
return False

View File

@@ -25,6 +25,7 @@ from opentalking.providers.synthesis.backends import resolve_model_backend
from opentalking.providers.synthesis.flashtalk.ws_client import FlashTalkWSClient
from opentalking.providers.synthesis.omnirt import auth_headers, resolve_synthesis_ws_url
from opentalking.providers.tts.factory import build_tts_adapter
from opentalking.scene_assets import SceneAssetStore
log = logging.getLogger(__name__)
@@ -89,6 +90,204 @@ def _validate_reference_duration(settings: object, duration_sec: int | None) ->
return value
def _coerce_composition_float(
payload: Mapping[str, object],
key: str,
default: float,
*,
min_value: float,
max_value: float,
) -> float:
raw = payload.get(key)
if raw in (None, ""):
return default
if not isinstance(raw, str | int | float):
raise ValueError(f"{key} must be a number")
try:
value = float(raw)
except (TypeError, ValueError) as exc:
raise ValueError(f"{key} must be a number") from exc
if value < min_value or value > max_value:
raise ValueError(f"{key} must be between {min_value:g} and {max_value:g}")
return value
def _coerce_composition_int(
payload: Mapping[str, object],
key: str,
default: int,
*,
min_value: int,
max_value: int,
) -> int:
raw = payload.get(key)
if raw in (None, ""):
value = default
elif isinstance(raw, str | int | float):
try:
value = int(raw)
except (TypeError, ValueError) as exc:
raise ValueError(f"{key} must be an integer") from exc
else:
raise ValueError(f"{key} must be an integer")
if value < min_value or value > max_value:
raise ValueError(f"{key} must be between {min_value:g} and {max_value:g}")
return value + (value % 2)
def _normalize_video_composition_config(
settings: object,
avatar_path: Path,
config: Mapping[str, object] | None,
) -> dict[str, object] | None:
if not config:
return None
background_id = str(config.get("background_id") or "").strip()
if not background_id:
return None
store = SceneAssetStore(_settings_path(settings, "scene_assets_dir", "./data/scene-assets"))
background = next((item for item in store.list_backgrounds() if item.get("id") == background_id), None)
if background is None:
raise ValueError("background_id not found")
if str(background.get("kind") or "") == "video":
raise ValueError("video backgrounds are not supported for video creation")
background_path = store.background_file_path(background_id)
if background_path is None:
raise FileNotFoundError("background file not found")
avatar_fit = str(config.get("avatar_fit") or "contain").strip()
avatar_anchor = str(config.get("avatar_anchor") or "center").strip()
if avatar_fit not in {"contain", "cover"}:
raise ValueError("invalid avatar_fit")
if avatar_anchor not in {"center", "bottom", "left", "right"}:
raise ValueError("invalid avatar_anchor")
return {
"background_path": background_path,
"avatar_mask_path": _reference_image_path(avatar_path),
"avatar_fit": avatar_fit,
"avatar_anchor": avatar_anchor,
"avatar_scale": _coerce_composition_float(config, "avatar_scale", 1.0, min_value=0.1, max_value=4.0),
"avatar_offset_x": _coerce_composition_float(config, "avatar_offset_x", 0.0, min_value=-2000.0, max_value=2000.0),
"avatar_offset_y": _coerce_composition_float(config, "avatar_offset_y", 0.0, min_value=-2000.0, max_value=2000.0),
"output_width": _coerce_composition_int(config, "output_width", 1280, min_value=320, max_value=3840),
"output_height": _coerce_composition_int(config, "output_height", 720, min_value=180, max_value=2160),
}
def _resize_cover(image: np.ndarray, width: int, height: int) -> np.ndarray:
    """Scale ``image`` to cover a ``width`` x ``height`` area, then center-crop.

    The image is resized uniformly by the larger of the two axis ratios, and
    the result is cropped around its center and returned as a contiguous array.
    """
    source_h, source_w = image.shape[:2]
    factor = max(float(width) / float(source_w), float(height) / float(source_h))
    scaled_w = max(1, int(round(source_w * factor)))
    scaled_h = max(1, int(round(source_h * factor)))
    # Area interpolation when shrinking, linear when enlarging.
    interpolation = cv2.INTER_LINEAR
    if factor < 1:
        interpolation = cv2.INTER_AREA
    scaled = cv2.resize(image, (scaled_w, scaled_h), interpolation=interpolation)
    x0 = max(0, (scaled_w - width) // 2)
    y0 = max(0, (scaled_h - height) // 2)
    cropped = scaled[y0:y0 + height, x0:x0 + width]
    return np.ascontiguousarray(cropped)
def _avatar_anchor_origin(anchor: str, canvas_w: int, canvas_h: int, layer_w: int, layer_h: int) -> tuple[int, int]:
if anchor == "bottom":
return (canvas_w - layer_w) // 2, canvas_h - layer_h
if anchor == "left":
return 0, (canvas_h - layer_h) // 2
if anchor == "right":
return canvas_w - layer_w, (canvas_h - layer_h) // 2
return (canvas_w - layer_w) // 2, (canvas_h - layer_h) // 2
def _load_avatar_alpha_mask(path: object) -> np.ndarray | None:
image = cv2.imread(str(path), cv2.IMREAD_UNCHANGED)
if image is None or image.ndim != 3 or image.shape[2] < 4:
return None
return image[:, :, 3].astype(np.float32) / 255.0
def _composite_avatar_layer(
background: np.ndarray,
frame: np.ndarray,
*,
avatar_fit: str,
avatar_anchor: str,
avatar_scale: float,
avatar_offset_x: float,
avatar_offset_y: float,
fallback_alpha: np.ndarray | None = None,
) -> np.ndarray:
canvas_h, canvas_w = background.shape[:2]
layer = np.asarray(frame, dtype=np.uint8)
if layer.ndim != 3 or layer.shape[2] < 3:
return background
bgr = layer[:, :, :3]
if layer.shape[2] >= 4:
alpha = layer[:, :, 3].astype(np.float32) / 255.0
elif fallback_alpha is not None:
alpha = fallback_alpha
if alpha.shape[:2] != bgr.shape[:2]:
alpha = cv2.resize(alpha, (bgr.shape[1], bgr.shape[0]), interpolation=cv2.INTER_AREA).astype(np.float32)
else:
alpha = np.ones(layer.shape[:2], dtype=np.float32)
fit_scale = min(float(canvas_w) / float(bgr.shape[1]), float(canvas_h) / float(bgr.shape[0]))
if avatar_fit == "cover":
fit_scale = max(float(canvas_w) / float(bgr.shape[1]), float(canvas_h) / float(bgr.shape[0]))
scale = max(0.01, fit_scale * float(avatar_scale))
layer_w = max(1, int(round(bgr.shape[1] * scale)))
layer_h = max(1, int(round(bgr.shape[0] * scale)))
bgr_resized = cv2.resize(bgr, (layer_w, layer_h), interpolation=cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR)
alpha_resized = cv2.resize(alpha, (layer_w, layer_h), interpolation=cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR)
origin_x, origin_y = _avatar_anchor_origin(avatar_anchor, canvas_w, canvas_h, layer_w, layer_h)
left = int(round(origin_x + avatar_offset_x))
top = int(round(origin_y + avatar_offset_y))
dst_left = max(0, left)
dst_top = max(0, top)
dst_right = min(canvas_w, left + layer_w)
dst_bottom = min(canvas_h, top + layer_h)
if dst_left >= dst_right or dst_top >= dst_bottom:
return background
src_left = dst_left - left
src_top = dst_top - top
src_right = src_left + (dst_right - dst_left)
src_bottom = src_top + (dst_bottom - dst_top)
out = background.copy()
fg = bgr_resized[src_top:src_bottom, src_left:src_right].astype(np.float32)
mask = alpha_resized[src_top:src_bottom, src_left:src_right].astype(np.float32)[:, :, None]
bg = out[dst_top:dst_bottom, dst_left:dst_right].astype(np.float32)
out[dst_top:dst_bottom, dst_left:dst_right] = np.clip((fg * mask) + (bg * (1.0 - mask)), 0, 255).astype(np.uint8)
return out
def _apply_video_composition(
frames: list[np.ndarray],
*,
config: Mapping[str, object] | None,
) -> list[np.ndarray]:
if not frames or not config:
return frames
first = np.asarray(frames[0])
frame_height, frame_width = first.shape[:2]
width = _coerce_composition_int(config, "output_width", int(frame_width), min_value=320, max_value=3840)
height = _coerce_composition_int(config, "output_height", int(frame_height), min_value=180, max_value=2160)
background_raw = cv2.imread(str(config["background_path"]), cv2.IMREAD_COLOR)
if background_raw is None:
raise FileNotFoundError("background file not found")
background = _resize_cover(background_raw, int(width), int(height))
fallback_alpha = _load_avatar_alpha_mask(config.get("avatar_mask_path"))
avatar_scale = _coerce_composition_float(config, "avatar_scale", 1.0, min_value=0.1, max_value=4.0)
avatar_offset_x = _coerce_composition_float(config, "avatar_offset_x", 0.0, min_value=-2000.0, max_value=2000.0)
avatar_offset_y = _coerce_composition_float(config, "avatar_offset_y", 0.0, min_value=-2000.0, max_value=2000.0)
return [
_composite_avatar_layer(
background,
frame,
avatar_fit=str(config.get("avatar_fit") or "contain"),
avatar_anchor=str(config.get("avatar_anchor") or "center"),
avatar_scale=avatar_scale,
avatar_offset_x=avatar_offset_x,
avatar_offset_y=avatar_offset_y,
fallback_alpha=fallback_alpha,
)
for frame in frames
]
def _build_reference_driver_pcm(total_samples: int, *, level: float = 480.0) -> np.ndarray:
samples = max(0, int(total_samples))
if samples == 0:
@@ -550,7 +749,8 @@ def _frame_array(frame: VideoFrameData | Any) -> np.ndarray | None:
arr = np.asarray(data)
if arr.ndim != 3 or arr.shape[2] < 3:
return None
return np.ascontiguousarray(arr[:, :, :3].astype(np.uint8, copy=False))
channels = 4 if arr.shape[2] >= 4 else 3
return np.ascontiguousarray(arr[:, :, :channels].astype(np.uint8, copy=False))
def _write_wav(path: Path, pcm: np.ndarray, sample_rate: int = 16000) -> None:
@@ -584,6 +784,8 @@ def _write_video_only(path: Path, frames: list[np.ndarray], fps: float) -> None:
if arr.shape[:2] != (height, width):
resized = cv2.resize(arr, (width, height), interpolation=cv2.INTER_AREA)
arr = np.asarray(resized, dtype=np.uint8)
if arr.ndim == 3 and arr.shape[2] >= 4:
arr = arr[:, :, :3]
writer.write(arr)
finally:
writer.release()
@@ -631,6 +833,7 @@ class VideoCreationService:
title: str,
mime_type: str | None = None,
fasterliveportrait_config: Mapping[str, object] | None = None,
composition_config: Mapping[str, object] | None = None,
) -> dict[str, Any]:
pcm = await decode_audio_file_to_pcm_i16(upload_path)
if pcm.size == 0:
@@ -642,6 +845,7 @@ class VideoCreationService:
title=title,
source="upload",
fasterliveportrait_config=fasterliveportrait_config,
composition_config=composition_config,
)
async def create_from_tts_text(
@@ -657,6 +861,7 @@ class VideoCreationService:
source: str = "tts_text",
fasterliveportrait_config: Mapping[str, object] | None = None,
indextts_config: Mapping[str, object] | None = None,
composition_config: Mapping[str, object] | None = None,
) -> dict[str, Any]:
text_value = text.strip()
if not text_value:
@@ -694,6 +899,7 @@ class VideoCreationService:
title=title,
source=source,
fasterliveportrait_config=fasterliveportrait_config,
composition_config=composition_config,
)
async def create_reference_video(
@@ -703,6 +909,7 @@ class VideoCreationService:
avatar_id: str,
duration_sec: int | None,
title: str,
composition_config: Mapping[str, object] | None = None,
) -> dict[str, Any]:
model_value = _normalize_model(model)
if model_value != "flashtalk":
@@ -720,6 +927,7 @@ class VideoCreationService:
pcm=pcm,
title=title,
source="reference_video",
composition_config=composition_config,
)
async def _resample_pcm(self, pcm: np.ndarray, sample_rate: int) -> np.ndarray:
@@ -763,9 +971,11 @@ class VideoCreationService:
title: str,
source: str,
fasterliveportrait_config: Mapping[str, object] | None = None,
composition_config: Mapping[str, object] | None = None,
) -> dict[str, Any]:
model_value = _normalize_model(model)
avatar_path = _avatar_dir(self.settings, avatar_id)
normalized_composition_config = _normalize_video_composition_config(self.settings, avatar_path, composition_config)
job_id = uuid.uuid4().hex
work_dir = _settings_path(self.settings, "exports_dir", "./data/exports") / "video_creation_jobs" / job_id
work_dir.mkdir(parents=True, exist_ok=False)
@@ -832,6 +1042,7 @@ class VideoCreationService:
target_frames = max(1, int(round(float(pcm.size) * fps / float(sample_rate))))
if len(frames) > target_frames:
frames = frames[:target_frames]
frames = _apply_video_composition(frames, config=normalized_composition_config)
video_only = work_dir / "video_only.mp4"
_write_video_only(video_only, frames, fps)

View File

@@ -38,6 +38,7 @@ dependencies = [
"lightrag-hku>=1.4.9",
"mem0ai>=0.1.115",
"huggingface-hub[cli]<1.0",
"rembg>=2.0.69",
]
[project.optional-dependencies]
@@ -149,7 +150,13 @@ opentalking-persona = "apps.cli.persona:main"
include-package-data = true
[tool.setuptools.package-data]
opentalking = ["assets/voices/system/*/*.json", "assets/voices/system/*/*.txt", "assets/voices/system/*/*.wav", "assets/reference_drivers/*.wav"]
opentalking = [
"assets/voices/system/*/*.json",
"assets/voices/system/*/*.txt",
"assets/voices/system/*/*.wav",
"assets/reference_drivers/*.wav",
"assets/scene_backgrounds/*.jpg",
]
[tool.setuptools.packages.find]
where = ["."]

View File

@@ -85,6 +85,16 @@ def test_asset_library_groups_scene_compositions_by_avatar() -> None:
assert "sceneCompositions.filter((scene) => scene.avatar_id === avatar.id)" in source
def test_asset_library_scene_cards_show_friendly_avatar_and_background_names() -> None:
    """Scene cards must label avatar and background by display name, not raw ids."""
    source = Path("apps/web/src/components/AssetLibraryWorkspace.tsx").read_text(encoding="utf-8")
    expected_snippets = (
        "backgroundById",
        "数字人形象:{sceneAvatar?.name ?? scene.avatar_id}",
        "背景:{sceneBackground?.name ?? scene.background_id ?? scene.background_color}",
    )
    # Older English labels that must no longer be present.
    removed_snippets = (
        "Avatar {sceneAvatar?.name",
        "Background {scene.background_id",
    )
    for snippet in expected_snippets:
        assert snippet in source
    for snippet in removed_snippets:
        assert snippet not in source
def test_scene_delete_actions_use_error_handled_handlers() -> None:
source = Path("apps/web/src/components/AssetLibraryWorkspace.tsx").read_text(encoding="utf-8")

View File

@@ -0,0 +1,60 @@
from __future__ import annotations
from types import SimpleNamespace
import os
from PIL import Image
from opentalking.avatar.matting.rembg_provider import RembgMattingProvider
def test_rembg_provider_requires_configured_model_path(tmp_path, monkeypatch):
    """A configured but non-existent model file must fail with the download hint."""
    absent_model = tmp_path / "u2net.onnx"
    settings = SimpleNamespace(avatar_matting_model_path=str(absent_model))
    blank = Image.new("RGB", (4, 4), (255, 255, 255))

    message = None
    try:
        RembgMattingProvider().remove_background(blank, settings=settings)
    except Exception as exc:  # noqa: BLE001
        message = str(exc)
    if message is None:  # pragma: no cover - assertion guard
        raise AssertionError("expected missing model to fail before rembg runs")

    required_fragments = (
        "未找到抠除背景模型 u2net.onnx",
        "OPENTALKING_AVATAR_MATTING_MODEL_PATH",
        "https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx",
    )
    for fragment in required_fragments:
        assert fragment in message
    # The hint must not mention the checksum or the local path.
    assert "MD5" not in message
    assert str(absent_model) not in message
def test_rembg_provider_uses_configured_model_directory(tmp_path, monkeypatch):
    """U2NET_HOME must point at the model's directory during the call and be restored after."""
    import io

    model_path = tmp_path / "u2net.onnx"
    model_path.write_bytes(b"fake-model")
    received_bodies: list[bytes] = []
    observed_homes: list[str | None] = []

    def fake_remove(body: bytes) -> bytes:
        # Record the input and the environment seen at call time, then
        # answer with a fully transparent 4x4 PNG.
        received_bodies.append(body)
        observed_homes.append(os.environ.get("U2NET_HOME"))
        transparent = Image.new("RGBA", (4, 4), (1, 2, 3, 0))
        sink = io.BytesIO()
        transparent.save(sink, format="PNG")
        return sink.getvalue()

    monkeypatch.setenv("U2NET_HOME", "/tmp/original-u2net-home")
    monkeypatch.setattr("rembg.remove", fake_remove)

    settings = SimpleNamespace(avatar_matting_model_path=str(model_path))
    source_image = Image.new("RGB", (4, 4), (255, 255, 255))
    result = RembgMattingProvider().remove_background(source_image, settings=settings)

    assert received_bodies
    assert observed_homes == [str(tmp_path)]
    assert result.mode == "RGBA"
    assert result.getchannel("A").getextrema() == (0, 0)
    assert os.environ.get("U2NET_HOME") == "/tmp/original-u2net-home"

View File

@@ -197,6 +197,32 @@ def test_video_clone_allows_uploading_source_avatar():
assert "onAvatarUploaded={handleVideoCloneAvatarUploaded}" in app
def test_custom_avatar_upload_can_request_background_removal():
    """Text-level check that the upload dialog, App and toast stack wire up background removal."""
    app = (WEB / "App.tsx").read_text(encoding="utf-8")
    stage = (WEB / "components" / "AvatarSelectionStage.tsx").read_text(encoding="utf-8")

    stage_snippets = (
        "上传时抠除背景",
        "customRemoveBackground",
        "customUploadState",
        "正在抠除背景...",
        "抠图完成",
        "createdCustomAvatar",
        "buildApiUrl(`/avatars/${encodeURIComponent(createdCustomAvatar.id)}/preview`)",
        "removeBackground: customRemoveBackground",
        "await onCustomAvatarCreate",
    )
    for snippet in stage_snippets:
        assert snippet in stage

    app_upload_snippets = (
        'fd.set("remove_background", options?.removeBackground ? "true" : "false")',
        "return created",
        "创建失败:",
        "e instanceof ApiError ? e.detail : null",
    )
    for snippet in app_upload_snippets:
        assert snippet in app

    toast = (WEB / "components" / "ToastStack.tsx").read_text(encoding="utf-8")
    assert "whitespace-pre-line break-words" in toast

    app_toast_snippets = (
        "tone !== \"error\"",
        "pauseToast",
        "resumeToast",
    )
    for snippet in app_toast_snippets:
        assert snippet in app

    toast_hover_snippets = (
        "onMouseEnter={() => onPause(toast.id)}",
        "onMouseLeave={() => onResume(toast.id)}",
    )
    for snippet in toast_hover_snippets:
        assert snippet in toast
def test_video_clone_lip_retargeting_disables_relative_motion():
clone = (WEB / "components" / "VideoCloneWorkspace.tsx").read_text(encoding="utf-8")
@@ -477,6 +503,58 @@ def test_video_creation_workspace_wires_offline_generation_flow():
assert "去资产库查看" in workspace
def test_video_creation_workspace_supports_one_off_scene_composition():
    """Verify the frontend sources contain the scene-composition wiring.

    This is a static source check over ``App.tsx``, ``lib/api.ts`` and
    ``components/VideoCreationWorkspace.tsx`` under ``WEB``. Each snippet is
    asserted to be present (or, where flagged ``False``, absent), and three
    markers in the workspace source must appear in a fixed relative order.
    """
    app = (WEB / "App.tsx").read_text(encoding="utf-8")
    api = (WEB / "lib" / "api.ts").read_text(encoding="utf-8")
    workspace = (WEB / "components" / "VideoCreationWorkspace.tsx").read_text(encoding="utf-8")

    # Props that App must contain.
    app_snippets = (
        "sceneBackgrounds={sceneBackgrounds}",
        "sceneCompositions={sceneCompositions}",
        "selectedSceneIdsByAvatar={selectedSceneIdsByAvatar}",
    )
    for snippet in app_snippets:
        assert snippet in app

    # Composition config type and form field that the API module must contain.
    api_snippets = (
        "export type VideoCreationCompositionConfig",
        "compositionConfig?: VideoCreationCompositionConfig | null",
        'form.set("composition_config", JSON.stringify(input.compositionConfig))',
    )
    for snippet in api_snippets:
        assert snippet in api

    # (snippet, must_be_present) pairs, evaluated in this exact order.
    workspace_checks = (
        ("sceneBackgrounds: SceneBackgroundAsset[]", True),
        ("sceneCompositions: SceneComposition[]", True),
        ("selectedSceneIdsByAvatar?: Record<string, string>", True),
        ("生成前预览", True),
        ("本次生成", True),
        ("水平位置", True),
        ("垂直位置", True),
        ("人物缩放", True),
        ("compositionConfig", True),
        ("VIDEO_CREATION_OUTPUT_SIZES", True),
        ('"16:9"', True),
        ('"9:16"', True),
        ('"1:1"', True),
        ("videoOutputAspect", True),
        ("selectedVideoOutputSize", True),
        ("videoAvatarPreviewLayer", True),
        ("left: `${videoAvatarPreviewLayer.leftPct}%`", True),
        ("top: `${videoAvatarPreviewLayer.topPct}%`", True),
        ("width: `${videoAvatarPreviewLayer.widthPct}%`", True),
        ("height: `${videoAvatarPreviewLayer.heightPct}%`", True),
        ("translate(${videoAvatarAdjust.x}px", False),
        ("output_width: selectedVideoOutputSize.width", True),
        ("output_height: selectedVideoOutputSize.height", True),
        ('data-testid="video-creation-result-panel"', True),
        ('data-testid="video-creation-composition-controls"', True),
        (
            "flex min-h-0 flex-col overflow-y-auto rounded-lg border border-slate-200 bg-white p-4 shadow-sm",
            True,
        ),
        ("mt-3 space-y-3 rounded-lg border border-slate-200 bg-slate-50 p-3", True),
        ("mt-3 min-h-0 flex-1 space-y-3 overflow-y-auto", False),
        ("mt-4 shrink-0 overflow-hidden", True),
        ("aspect-video w-full", True),
        ("aspect-[9/16]", True),
        ("aspect-square", True),
        ("aspectRatio: selectedVideoOutputSize.aspectRatio", False),
        ("xl:grid-cols-[18rem_minmax(28rem,1fr)_minmax(32rem,42rem)]", True),
        ("画面预览", True),
        ("输出画幅", True),
        ("h-[clamp(18rem,42vh,30rem)]", False),
    )
    for snippet, must_be_present in workspace_checks:
        if must_be_present:
            assert snippet in workspace
        else:
            assert snippet not in workspace

    # Source order: result panel, then "构图设置", then "生成前预览".
    result_panel_pos = workspace.index('data-testid="video-creation-result-panel"')
    composition_settings_pos = workspace.index("构图设置")
    assert result_panel_pos < composition_settings_pos
    pre_generation_preview_pos = workspace.index("生成前预览")
    assert composition_settings_pos < pre_generation_preview_pos
def test_frontend_export_controls_include_audio_renderer_models():
app = (WEB / "App.tsx").read_text(encoding="utf-8")
renderers_block = app[app.index("const SERVER_AUDIO_RENDERERS"):app.index("function isFlashRenderer")]