marcosremar2 Claude Opus 4.5 committed on
Commit af11910 · 1 Parent(s): bc451c3

fix: correct facial alignment issues and add API server


- Fix frame shape bug in inference.py line 216 (use ori_frame instead of frame)
- Adjust upper_boundary_ratio from 0.5 to 0.4 for better facial blending
- Add MuseTalk API server with multiple versions (see the usage sketch below)
- Add inference configs and helper scripts
- Update .gitignore for conda environments

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
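A minimal usage sketch for the new API server (defined in musetalk_api_server.py below). It assumes the server is already running on localhost:8000 with models loaded; the media paths are illustrative files on the server's filesystem, not part of this commit's guarantees.

import requests

BASE_URL = "http://localhost:8000"

# Check that models are loaded before submitting work.
print(requests.get(f"{BASE_URL}/health").json())

# Generate a lip-synced video from files that already exist on the server.
payload = {
    "video_path": "data/video/yongen.mp4",
    "audio_path": "data/audio/professor_pt.wav",
    "output_path": "results/professor_test.mp4",
    "fps": 25,
    "use_cache": True,
}
resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["timings"])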

.gitignore CHANGED
@@ -15,4 +15,12 @@ ffmprobe*
15
  ffplay*
16
  debug
17
  exp_out
18
- .gradio
15
  ffplay*
16
  debug
17
  exp_out
18
+ .gradio
19
+
20
+ # Conda environment (Lightning AI persistent)
21
+ .conda_env/
22
+ miniconda/
23
+ venv/
24
+
25
+ # Temporary installation files
26
+ =*
activate.sh ADDED
@@ -0,0 +1,19 @@
1
+ #!/bin/bash
2
+ # Script to activate the MuseTalk environment
3
+
4
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5
+
6
+ # Activate the local conda environment
7
+ source "$SCRIPT_DIR/miniconda/bin/activate" musetalk
8
+
9
+ # Configure the HuggingFace token (set your HF_TOKEN variable or edit it here)
10
+ # export HF_TOKEN="your_token_here"
11
+
12
+ # Configure FFMPEG if needed
13
+ # export FFMPEG_PATH="$SCRIPT_DIR/ffmpeg"
14
+
15
+ cd "${SCRIPT_DIR}"
16
+ echo "✅ Ambiente MuseTalk ativado!"
17
+ echo "Diretório: ${SCRIPT_DIR}"
18
+ echo "Python: $(python --version)"
19
+ echo "PyTorch: $(python -c 'import torch; print(torch.__version__)')"
avatar_pipeline.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Multimodal Avatar Pipeline
3
+ Audio Input -> Whisper -> LLM -> XTTS -> MuseTalk
4
+
5
+ This creates a complete avatar that can understand spoken Portuguese
6
+ and respond with lip-synced video.
7
+ """
8
+
9
+ import os
10
+ import requests
11
+ import tempfile
12
+ import soundfile as sf
13
+
14
+ os.environ["HF_HOME"] = "/workspace/MuseTalk/.cache/huggingface"
15
+ os.environ["COQUI_TOS_AGREED"] = "1"
16
+
17
+ from faster_whisper import WhisperModel
18
+ from llama_cpp import Llama
19
+ from TTS.api import TTS
20
+
21
+
22
+ class MultimodalAvatar:
23
+ def __init__(
24
+ self,
25
+ whisper_model: str = "tiny",
26
+ llm_model_path: str = "models/llm/qwen2.5-0.5b-instruct-q4_k_m.gguf",
27
+ reference_audio: str = "data/audio/mariana_ref.wav",
28
+ avatar_id: str = "mariana_hd",
29
+ musetalk_url: str = "http://localhost:8000",
30
+ system_prompt: str = None
31
+ ):
32
+ print("Initializing Multimodal Avatar Pipeline...")
33
+
34
+ # Whisper for speech-to-text
35
+ print(" Loading Whisper...")
36
+ self.whisper = WhisperModel(whisper_model, device="cpu", compute_type="int8")
37
+
38
+ # LLM for understanding and response
39
+ print(" Loading LLM...")
40
+ self.llm = Llama(
41
+ model_path=llm_model_path,
42
+ n_ctx=2048,
43
+ n_threads=4,
44
+ verbose=False
45
+ )
46
+
47
+ # XTTS for text-to-speech
48
+ print(" Loading XTTS...")
49
+ self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
50
+
51
+ self.reference_audio = reference_audio
52
+ self.avatar_id = avatar_id
53
+ self.musetalk_url = musetalk_url
54
+
55
+ self.system_prompt = system_prompt or """Você é Mariana, uma assistente virtual brasileira.
56
+ Você é simpática, prestativa e sempre responde em português brasileiro.
57
+ Suas respostas são claras, concisas e naturais, como se estivesse conversando.
58
+ Evite respostas muito longas - prefira 2-3 frases no máximo."""
59
+
60
+ self.conversation_history = []
61
+ print("Avatar ready!")
62
+
63
+ def transcribe(self, audio_path: str) -> str:
64
+ """Transcribe audio to text using Whisper"""
65
+ segments, info = self.whisper.transcribe(audio_path, language="pt")
66
+ text = " ".join([segment.text for segment in segments]).strip()
67
+ return text
68
+
69
+ def think(self, user_message: str) -> str:
70
+ """Generate response using LLM"""
71
+ self.conversation_history.append({"role": "user", "content": user_message})
72
+
73
+ messages = [{"role": "system", "content": self.system_prompt}]
74
+ messages.extend(self.conversation_history[-10:]) # Keep last 10 messages
75
+
76
+ response = self.llm.create_chat_completion(
77
+ messages=messages,
78
+ max_tokens=200,
79
+ temperature=0.7,
80
+ stop=["<|im_end|>", "<|endoftext|>"]
81
+ )
82
+
83
+ assistant_message = response['choices'][0]['message']['content'].strip()
84
+ self.conversation_history.append({"role": "assistant", "content": assistant_message})
85
+
86
+ return assistant_message
87
+
88
+ def speak(self, text: str, output_path: str) -> str:
89
+ """Convert text to speech using XTTS"""
90
+ self.tts.tts_to_file(
91
+ text=text,
92
+ speaker_wav=self.reference_audio,
93
+ language="pt",
94
+ file_path=output_path
95
+ )
96
+ return output_path
97
+
98
+ def animate(self, audio_path: str, output_path: str) -> str:
99
+ """Generate lip-sync video using MuseTalk"""
100
+ with open(audio_path, 'rb') as f:
101
+ response = requests.post(
102
+ f"{self.musetalk_url}/inference",
103
+ files={"audio": f},
104
+ data={"avatar_id": self.avatar_id},
105
+ timeout=300
106
+ )
107
+
108
+ if response.status_code == 200:
109
+ with open(output_path, 'wb') as f:
110
+ f.write(response.content)
111
+ return output_path
112
+ else:
113
+ raise Exception(f"MuseTalk error: {response.text}")
114
+
115
+ def respond(self, audio_input: str, output_video: str) -> dict:
116
+ """
117
+ Complete pipeline: audio input -> transcribe -> think -> speak -> animate
118
+
119
+ Returns dict with all intermediate results
120
+ """
121
+ print("\n=== Processing Request ===")
122
+
123
+ # Step 1: Transcribe
124
+ print("1. Transcribing audio...")
125
+ user_text = self.transcribe(audio_input)
126
+ print(f" User said: {user_text}")
127
+
128
+ # Step 2: Think
129
+ print("2. Generating response...")
130
+ response_text = self.think(user_text)
131
+ print(f" Response: {response_text}")
132
+
133
+ # Step 3: Speak
134
+ print("3. Synthesizing speech...")
135
+ audio_output = output_video.replace('.mp4', '.wav')
136
+ self.speak(response_text, audio_output)
137
+
138
+ # Get audio duration
139
+ data, sr = sf.read(audio_output)
140
+ audio_duration = len(data) / sr
141
+ print(f" Audio duration: {audio_duration:.2f}s")
142
+
143
+ # Step 4: Animate
144
+ print("4. Generating lip-sync video...")
145
+ self.animate(audio_output, output_video)
146
+ print(f" Video saved: {output_video}")
147
+
148
+ return {
149
+ "user_text": user_text,
150
+ "response_text": response_text,
151
+ "audio_path": audio_output,
152
+ "video_path": output_video,
153
+ "audio_duration": audio_duration
154
+ }
155
+
156
+ def respond_to_text(self, user_text: str, output_video: str) -> dict:
157
+ """
158
+ Pipeline for text input: think -> speak -> animate
159
+ """
160
+ print("\n=== Processing Text Request ===")
161
+ print(f" User: {user_text}")
162
+
163
+ # Step 1: Think
164
+ print("1. Generating response...")
165
+ response_text = self.think(user_text)
166
+ print(f" Response: {response_text}")
167
+
168
+ # Step 2: Speak
169
+ print("2. Synthesizing speech...")
170
+ audio_output = output_video.replace('.mp4', '.wav')
171
+ self.speak(response_text, audio_output)
172
+
173
+ data, sr = sf.read(audio_output)
174
+ audio_duration = len(data) / sr
175
+ print(f" Audio duration: {audio_duration:.2f}s")
176
+
177
+ # Step 3: Animate
178
+ print("3. Generating lip-sync video...")
179
+ self.animate(audio_output, output_video)
180
+ print(f" Video saved: {output_video}")
181
+
182
+ return {
183
+ "user_text": user_text,
184
+ "response_text": response_text,
185
+ "audio_path": audio_output,
186
+ "video_path": output_video,
187
+ "audio_duration": audio_duration
188
+ }
189
+
190
+
191
+ if __name__ == "__main__":
192
+ # Initialize the avatar
193
+ avatar = MultimodalAvatar()
194
+
195
+ # Test with text input
196
+ result = avatar.respond_to_text(
197
+ user_text="Olá Mariana! Me conte sobre você.",
198
+ output_video="results/avatar_test.mp4"
199
+ )
200
+
201
+ print("\n=== Result ===")
202
+ print(f"User: {result['user_text']}")
203
+ print(f"Mariana: {result['response_text']}")
204
+ print(f"Video: {result['video_path']} ({result['audio_duration']:.1f}s)")
configs/inference/hello_world.yaml ADDED
@@ -0,0 +1,3 @@
1
+ task_hello_world:
2
+ video_path: "data/video/video_hd_1min_25fps.mp4"
3
+ audio_path: "data/audio/hello_world.wav"
configs/inference/professor_test.yaml ADDED
@@ -0,0 +1,4 @@
1
+ task_0:
2
+ video_path: "data/video/yongen.mp4"
3
+ audio_path: "data/audio/professor_pt.wav"
4
+ bbox_shift: 0
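For reference, a minimal sketch of how these task configs are structured, read here with OmegaConf (the same library the API servers below import); the keys and paths come from the YAML above, and the loop itself is illustrative rather than the project's own inference code.

from omegaconf import OmegaConf

# Load a task config and iterate its entries.
cfg = OmegaConf.load("configs/inference/professor_test.yaml")
for task_id, task in cfg.items():
    # Each task pairs a driving video with an audio track; bbox_shift is optional.
    print(task_id, task.video_path, task.audio_path, task.get("bbox_shift", 0))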
musetalk/utils/blending.py CHANGED
@@ -32,7 +32,7 @@ def face_seg(image, mode="raw", fp=None):
32
  return seg_image
33
 
34
 
35
- def get_image(image, face, face_box, upper_boundary_ratio=0.5, expand=1.5, mode="raw", fp=None):
36
  """
37
  将裁剪的面部图像粘贴回原始图像,并进行一些处理。
38
 
@@ -109,7 +109,7 @@ def get_image_blending(image, face, face_box, mask_array, crop_box):
109
  return body[:,:,::-1]
110
 
111
 
112
- def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5, expand=1.5, fp=None, mode="raw"):
113
  body = Image.fromarray(image[:,:,::-1])
114
 
115
  x, y, x1, y1 = face_box
 
32
  return seg_image
33
 
34
 
35
+ def get_image(image, face, face_box, upper_boundary_ratio=0.4, expand=1.5, mode="raw", fp=None):
36
  """
37
  将裁剪的面部图像粘贴回原始图像,并进行一些处理。
38
 
 
109
  return body[:,:,::-1]
110
 
111
 
112
+ def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.4, expand=1.5, fp=None, mode="raw"):
113
  body = Image.fromarray(image[:,:,::-1])
114
 
115
  x, y, x1, y1 = face_box
musetalk_api_server.py ADDED
@@ -0,0 +1,551 @@
1
+ """
2
+ MuseTalk HTTP API Server
3
+ Keeps models loaded in GPU memory for fast inference.
4
+ """
5
+ import os
6
+ import cv2
7
+ import copy
8
+ import torch
9
+ import glob
10
+ import shutil
11
+ import pickle
12
+ import numpy as np
13
+ import subprocess
14
+ import tempfile
15
+ import hashlib
16
+ import time
17
+ from pathlib import Path
18
+ from typing import Optional
19
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
20
+ from fastapi.responses import FileResponse, JSONResponse
21
+ from fastapi.middleware.cors import CORSMiddleware
22
+ from pydantic import BaseModel
23
+ from tqdm import tqdm
24
+ from omegaconf import OmegaConf
25
+ from transformers import WhisperModel
26
+ import uvicorn
27
+
28
+ # MuseTalk imports
29
+ from musetalk.utils.blending import get_image
30
+ from musetalk.utils.face_parsing import FaceParsing
31
+ from musetalk.utils.audio_processor import AudioProcessor
32
+ from musetalk.utils.utils import get_file_type, datagen, load_all_model
33
+ from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder
34
+
35
+
36
+ class MuseTalkServer:
37
+ """Singleton server that keeps models loaded in GPU memory."""
38
+
39
+ def __init__(self):
40
+ self.device = None
41
+ self.vae = None
42
+ self.unet = None
43
+ self.pe = None
44
+ self.whisper = None
45
+ self.audio_processor = None
46
+ self.fp = None
47
+ self.timesteps = None
48
+ self.weight_dtype = None
49
+ self.is_loaded = False
50
+
51
+ # Cache directories
52
+ self.cache_dir = Path("./cache")
53
+ self.cache_dir.mkdir(exist_ok=True)
54
+ self.landmarks_cache = self.cache_dir / "landmarks"
55
+ self.latents_cache = self.cache_dir / "latents"
56
+ self.whisper_cache = self.cache_dir / "whisper_features"
57
+ self.landmarks_cache.mkdir(exist_ok=True)
58
+ self.latents_cache.mkdir(exist_ok=True)
59
+ self.whisper_cache.mkdir(exist_ok=True)
60
+
61
+ # Config
62
+ self.fps = 25
63
+ self.batch_size = 8
64
+ self.use_float16 = True
65
+ self.version = "v15"
66
+ self.extra_margin = 10
67
+ self.parsing_mode = "jaw"
68
+ self.left_cheek_width = 90
69
+ self.right_cheek_width = 90
70
+ self.audio_padding_left = 2
71
+ self.audio_padding_right = 2
72
+
73
+ def load_models(
74
+ self,
75
+ gpu_id: int = 0,
76
+ unet_model_path: str = "./models/musetalkV15/unet.pth",
77
+ unet_config: str = "./models/musetalk/config.json",
78
+ vae_type: str = "sd-vae",
79
+ whisper_dir: str = "./models/whisper",
80
+ use_float16: bool = True,
81
+ version: str = "v15"
82
+ ):
83
+ """Load all models into GPU memory."""
84
+ if self.is_loaded:
85
+ print("Models already loaded!")
86
+ return
87
+
88
+ print("=" * 50)
89
+ print("Loading MuseTalk models into GPU memory...")
90
+ print("=" * 50)
91
+
92
+ start_time = time.time()
93
+
94
+ # Set device
95
+ self.device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
96
+ print(f"Using device: {self.device}")
97
+
98
+ # Load model weights
99
+ print("Loading VAE, UNet, PE...")
100
+ self.vae, self.unet, self.pe = load_all_model(
101
+ unet_model_path=unet_model_path,
102
+ vae_type=vae_type,
103
+ unet_config=unet_config,
104
+ device=self.device
105
+ )
106
+ self.timesteps = torch.tensor([0], device=self.device)
107
+
108
+ # Convert to float16 if enabled
109
+ self.use_float16 = use_float16
110
+ if use_float16:
111
+ print("Converting to float16...")
112
+ self.pe = self.pe.half()
113
+ self.vae.vae = self.vae.vae.half()
114
+ self.unet.model = self.unet.model.half()
115
+
116
+ # Move to device
117
+ self.pe = self.pe.to(self.device)
118
+ self.vae.vae = self.vae.vae.to(self.device)
119
+ self.unet.model = self.unet.model.to(self.device)
120
+
121
+ # Initialize audio processor and Whisper
122
+ print("Loading Whisper model...")
123
+ self.audio_processor = AudioProcessor(feature_extractor_path=whisper_dir)
124
+ self.weight_dtype = self.unet.model.dtype
125
+ self.whisper = WhisperModel.from_pretrained(whisper_dir)
126
+ self.whisper = self.whisper.to(device=self.device, dtype=self.weight_dtype).eval()
127
+ self.whisper.requires_grad_(False)
128
+
129
+ # Initialize face parser
130
+ self.version = version
131
+ if version == "v15":
132
+ self.fp = FaceParsing(
133
+ left_cheek_width=self.left_cheek_width,
134
+ right_cheek_width=self.right_cheek_width
135
+ )
136
+ else:
137
+ self.fp = FaceParsing()
138
+
139
+ self.is_loaded = True
140
+ load_time = time.time() - start_time
141
+ print(f"Models loaded in {load_time:.2f}s")
142
+ print("=" * 50)
143
+ print("Server ready for inference!")
144
+ print("=" * 50)
145
+
146
+ def _get_file_hash(self, file_path: str) -> str:
147
+ """Get MD5 hash of a file for caching."""
148
+ hash_md5 = hashlib.md5()
149
+ with open(file_path, "rb") as f:
150
+ for chunk in iter(lambda: f.read(4096), b""):
151
+ hash_md5.update(chunk)
152
+ return hash_md5.hexdigest()[:16]
153
+
154
+ def _get_cached_landmarks(self, video_hash: str, bbox_shift: int):
155
+ """Get cached landmarks if available."""
156
+ # Disabled due to tensor comparison issues
157
+ return None
158
+
159
+ def _save_landmarks_cache(self, video_hash: str, bbox_shift: int, coord_list, frame_list):
160
+ """Save landmarks to cache."""
161
+ cache_file = self.landmarks_cache / f"{video_hash}_shift{bbox_shift}.pkl"
162
+ with open(cache_file, 'wb') as f:
163
+ pickle.dump((coord_list, frame_list), f)
164
+
165
+ def _get_cached_latents(self, video_hash: str):
166
+ """Get cached VAE latents if available."""
167
+ # Disabled due to tensor comparison issues
168
+ return None
169
+
170
+ def _save_latents_cache(self, video_hash: str, latent_list):
171
+ """Save VAE latents to cache."""
172
+ cache_file = self.latents_cache / f"{video_hash}.pkl"
173
+ with open(cache_file, 'wb') as f:
174
+ pickle.dump(latent_list, f)
175
+
176
+ def _get_cached_whisper(self, audio_hash: str):
177
+ """Get cached Whisper features if available."""
178
+ # Disabled due to tensor comparison issues
179
+ return None
180
+
181
+ def _save_whisper_cache(self, audio_hash: str, whisper_data):
182
+ """Save Whisper features to cache."""
183
+ cache_file = self.whisper_cache / f"{audio_hash}.pkl"
184
+ with open(cache_file, 'wb') as f:
185
+ pickle.dump(whisper_data, f)
186
+
187
+ @torch.no_grad()
188
+ def generate(
189
+ self,
190
+ video_path: str,
191
+ audio_path: str,
192
+ output_path: str,
193
+ fps: Optional[int] = None,
194
+ use_cache: bool = True
195
+ ) -> dict:
196
+ """
197
+ Generate lip-synced video.
198
+
199
+ Returns dict with timing info.
200
+ """
201
+ if not self.is_loaded:
202
+ raise RuntimeError("Models not loaded! Call load_models() first.")
203
+
204
+ fps = fps or self.fps
205
+ timings = {"total": 0}
206
+ total_start = time.time()
207
+
208
+ # Get file hashes for caching
209
+ video_hash = self._get_file_hash(video_path)
210
+ audio_hash = self._get_file_hash(audio_path)
211
+
212
+ # Create temp directory
213
+ temp_dir = tempfile.mkdtemp()
214
+
215
+ try:
216
+ # 1. Extract frames
217
+ t0 = time.time()
218
+ input_basename = Path(video_path).stem
219
+ save_dir_full = os.path.join(temp_dir, "frames")
220
+ os.makedirs(save_dir_full, exist_ok=True)
221
+
222
+ if get_file_type(video_path) == "video":
223
+ cmd = f"ffmpeg -v fatal -i {video_path} -vf fps={fps} -start_number 0 {save_dir_full}/%08d.png"
224
+ os.system(cmd)
225
+ input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
226
+ elif get_file_type(video_path) == "image":
227
+ input_img_list = [video_path]
228
+ else:
229
+ raise ValueError(f"Unsupported video type: {video_path}")
230
+
231
+ timings["frame_extraction"] = time.time() - t0
232
+
233
+ # 2. Extract audio features (with caching)
234
+ t0 = time.time()
235
+ cached_whisper = self._get_cached_whisper(audio_hash) if use_cache else None
236
+
237
+ if cached_whisper:
238
+ whisper_chunks = cached_whisper
239
+ timings["whisper_source"] = "cache"
240
+ else:
241
+ whisper_input_features, librosa_length = self.audio_processor.get_audio_feature(audio_path)
242
+ whisper_chunks = self.audio_processor.get_whisper_chunk(
243
+ whisper_input_features,
244
+ self.device,
245
+ self.weight_dtype,
246
+ self.whisper,
247
+ librosa_length,
248
+ fps=fps,
249
+ audio_padding_length_left=self.audio_padding_left,
250
+ audio_padding_length_right=self.audio_padding_right,
251
+ )
252
+ if use_cache:
253
+ self._save_whisper_cache(audio_hash, whisper_chunks)
254
+ timings["whisper_source"] = "computed"
255
+
256
+ timings["whisper_features"] = time.time() - t0
257
+
258
+ # 3. Get landmarks (with caching)
259
+ t0 = time.time()
260
+ bbox_shift = 0  # same default for v15 and earlier versions
261
+ cache_key = f"{video_hash}_{fps}"
262
+
263
+ cached_landmarks = self._get_cached_landmarks(cache_key, bbox_shift) if use_cache else None
264
+
265
+ if cached_landmarks:
266
+ coord_list, frame_list = cached_landmarks
267
+ timings["landmarks_source"] = "cache"
268
+ else:
269
+ coord_list, frame_list = get_landmark_and_bbox(input_img_list, bbox_shift)
270
+ if use_cache:
271
+ self._save_landmarks_cache(cache_key, bbox_shift, coord_list, frame_list)
272
+ timings["landmarks_source"] = "computed"
273
+
274
+ timings["landmarks"] = time.time() - t0
275
+
276
+ # 4. Compute VAE latents (with caching)
277
+ t0 = time.time()
278
+ latent_cache_key = f"{video_hash}_{fps}_{self.version}"
279
+ cached_latents = self._get_cached_latents(latent_cache_key) if use_cache else None
280
+
281
+ if cached_latents:
282
+ input_latent_list = cached_latents
283
+ timings["latents_source"] = "cache"
284
+ else:
285
+ input_latent_list = []
286
+ for bbox, frame in zip(coord_list, frame_list):
287
+ if isinstance(bbox, (list, tuple)) and list(bbox) == list(coord_placeholder):
288
+ continue
289
+ x1, y1, x2, y2 = bbox
290
+ if self.version == "v15":
291
+ y2 = y2 + self.extra_margin
292
+ y2 = min(y2, frame.shape[0])
293
+ crop_frame = frame[y1:y2, x1:x2]
294
+ crop_frame = cv2.resize(crop_frame, (256, 256), interpolation=cv2.INTER_LANCZOS4)
295
+ latents = self.vae.get_latents_for_unet(crop_frame)
296
+ input_latent_list.append(latents)
297
+
298
+ if use_cache:
299
+ self._save_latents_cache(latent_cache_key, input_latent_list)
300
+ timings["latents_source"] = "computed"
301
+
302
+ timings["vae_encoding"] = time.time() - t0
303
+
304
+ # 5. Prepare cycled lists
305
+ frame_list_cycle = frame_list + frame_list[::-1]
306
+ coord_list_cycle = coord_list + coord_list[::-1]
307
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
308
+
309
+ # 6. UNet inference
310
+ t0 = time.time()
311
+ video_num = len(whisper_chunks)
312
+ gen = datagen(
313
+ whisper_chunks=whisper_chunks,
314
+ vae_encode_latents=input_latent_list_cycle,
315
+ batch_size=self.batch_size,
316
+ delay_frame=0,
317
+ device=self.device,
318
+ )
319
+
320
+ res_frame_list = []
321
+ for whisper_batch, latent_batch in gen:
322
+ audio_feature_batch = self.pe(whisper_batch)
323
+ latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
324
+ pred_latents = self.unet.model(
325
+ latent_batch, self.timesteps,
326
+ encoder_hidden_states=audio_feature_batch
327
+ ).sample
328
+ recon = self.vae.decode_latents(pred_latents)
329
+ for res_frame in recon:
330
+ res_frame_list.append(res_frame)
331
+
332
+ timings["unet_inference"] = time.time() - t0
333
+
334
+ # 7. Face blending
335
+ t0 = time.time()
336
+ result_img_path = os.path.join(temp_dir, "results")
337
+ os.makedirs(result_img_path, exist_ok=True)
338
+
339
+ for i, res_frame in enumerate(res_frame_list):
340
+ bbox = coord_list_cycle[i % len(coord_list_cycle)]
341
+ ori_frame = copy.deepcopy(frame_list_cycle[i % len(frame_list_cycle)])
342
+ x1, y1, x2, y2 = bbox
343
+ if self.version == "v15":
344
+ y2 = y2 + self.extra_margin
345
+ y2 = min(y2, ori_frame.shape[0])
346
+ try:
347
+ res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
348
+ except:
349
+ continue
350
+
351
+ if self.version == "v15":
352
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2],
353
+ mode=self.parsing_mode, fp=self.fp)
354
+ else:
355
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], fp=self.fp)
356
+
357
+ cv2.imwrite(f"{result_img_path}/{str(i).zfill(8)}.png", combine_frame)
358
+
359
+ timings["face_blending"] = time.time() - t0
360
+
361
+ # 8. Encode video
362
+ t0 = time.time()
363
+ temp_vid = os.path.join(temp_dir, "temp.mp4")
364
+ cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_path}/%08d.png -vcodec libx264 -vf format=yuv420p -crf 18 {temp_vid}"
365
+ os.system(cmd_img2video)
366
+
367
+ cmd_combine = f"ffmpeg -y -v warning -i {audio_path} -i {temp_vid} {output_path}"
368
+ os.system(cmd_combine)
369
+
370
+ timings["video_encoding"] = time.time() - t0
371
+
372
+ finally:
373
+ # Cleanup
374
+ shutil.rmtree(temp_dir, ignore_errors=True)
375
+
376
+ timings["total"] = time.time() - total_start
377
+ timings["frames_generated"] = len(res_frame_list)
378
+
379
+ return timings
380
+
381
+
382
+ # Global server instance
383
+ server = MuseTalkServer()
384
+
385
+ # FastAPI app
386
+ app = FastAPI(
387
+ title="MuseTalk API",
388
+ description="HTTP API for MuseTalk lip-sync generation",
389
+ version="1.0.0"
390
+ )
391
+
392
+ # CORS middleware
393
+ app.add_middleware(
394
+ CORSMiddleware,
395
+ allow_origins=["*"],
396
+ allow_credentials=True,
397
+ allow_methods=["*"],
398
+ allow_headers=["*"],
399
+ )
400
+
401
+
402
+ @app.on_event("startup")
403
+ async def startup_event():
404
+ """Load models on server startup."""
405
+ server.load_models()
406
+
407
+
408
+ @app.get("/health")
409
+ async def health_check():
410
+ """Check if server is ready."""
411
+ return {
412
+ "status": "ok" if server.is_loaded else "loading",
413
+ "models_loaded": server.is_loaded,
414
+ "device": str(server.device) if server.device else None
415
+ }
416
+
417
+
418
+ @app.get("/cache/stats")
419
+ async def cache_stats():
420
+ """Get cache statistics."""
421
+ landmarks_count = len(list(server.landmarks_cache.glob("*.pkl")))
422
+ latents_count = len(list(server.latents_cache.glob("*.pkl")))
423
+ whisper_count = len(list(server.whisper_cache.glob("*.pkl")))
424
+
425
+ return {
426
+ "landmarks_cached": landmarks_count,
427
+ "latents_cached": latents_count,
428
+ "whisper_features_cached": whisper_count
429
+ }
430
+
431
+
432
+ @app.post("/cache/clear")
433
+ async def clear_cache():
434
+ """Clear all caches."""
435
+ for cache_dir in [server.landmarks_cache, server.latents_cache, server.whisper_cache]:
436
+ for f in cache_dir.glob("*.pkl"):
437
+ f.unlink()
438
+ return {"status": "cleared"}
439
+
440
+
441
+ class GenerateRequest(BaseModel):
442
+ video_path: str
443
+ audio_path: str
444
+ output_path: str
445
+ fps: Optional[int] = 25
446
+ use_cache: bool = True
447
+
448
+
449
+ @app.post("/generate")
450
+ async def generate_from_paths(request: GenerateRequest):
451
+ """
452
+ Generate lip-synced video from file paths.
453
+
454
+ Use this when files are already on the server.
455
+ """
456
+ if not server.is_loaded:
457
+ raise HTTPException(status_code=503, detail="Models not loaded yet")
458
+
459
+ if not os.path.exists(request.video_path):
460
+ raise HTTPException(status_code=404, detail=f"Video not found: {request.video_path}")
461
+ if not os.path.exists(request.audio_path):
462
+ raise HTTPException(status_code=404, detail=f"Audio not found: {request.audio_path}")
463
+
464
+ try:
465
+ timings = server.generate(
466
+ video_path=request.video_path,
467
+ audio_path=request.audio_path,
468
+ output_path=request.output_path,
469
+ fps=request.fps,
470
+ use_cache=request.use_cache
471
+ )
472
+ return {
473
+ "status": "success",
474
+ "output_path": request.output_path,
475
+ "timings": timings
476
+ }
477
+ except Exception as e:
478
+ raise HTTPException(status_code=500, detail=str(e))
479
+
480
+
481
+ @app.post("/generate/upload")
482
+ async def generate_from_upload(
483
+ video: UploadFile = File(...),
484
+ audio: UploadFile = File(...),
485
+ fps: int = Form(25),
486
+ use_cache: bool = Form(True)
487
+ ):
488
+ """
489
+ Generate lip-synced video from uploaded files.
490
+
491
+ Returns the generated video file.
492
+ """
493
+ if not server.is_loaded:
494
+ raise HTTPException(status_code=503, detail="Models not loaded yet")
495
+
496
+ # Save uploaded files
497
+ temp_dir = tempfile.mkdtemp()
498
+ try:
499
+ video_path = os.path.join(temp_dir, video.filename)
500
+ audio_path = os.path.join(temp_dir, audio.filename)
501
+ output_path = os.path.join(temp_dir, "output.mp4")
502
+
503
+ with open(video_path, "wb") as f:
504
+ f.write(await video.read())
505
+ with open(audio_path, "wb") as f:
506
+ f.write(await audio.read())
507
+
508
+ timings = server.generate(
509
+ video_path=video_path,
510
+ audio_path=audio_path,
511
+ output_path=output_path,
512
+ fps=fps,
513
+ use_cache=use_cache
514
+ )
515
+
516
+ # Return the video file
517
+ return FileResponse(
518
+ output_path,
519
+ media_type="video/mp4",
520
+ filename="result.mp4",
521
+ headers={"X-Timings": str(timings)}
522
+ )
523
+ except Exception as e:
524
+ shutil.rmtree(temp_dir, ignore_errors=True)
525
+ raise HTTPException(status_code=500, detail=str(e))
526
+
527
+
528
+ if __name__ == "__main__":
529
+ import argparse
530
+
531
+ parser = argparse.ArgumentParser(description="MuseTalk API Server")
532
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind")
533
+ parser.add_argument("--port", type=int, default=8000, help="Port to bind")
534
+ parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID")
535
+ parser.add_argument("--unet_model_path", type=str, default="./models/musetalkV15/unet.pth")
536
+ parser.add_argument("--unet_config", type=str, default="./models/musetalk/config.json")
537
+ parser.add_argument("--whisper_dir", type=str, default="./models/whisper")
538
+ parser.add_argument("--no_float16", action="store_true", help="Disable float16")
539
+ args = parser.parse_args()
540
+
541
+ # Pre-configure server
542
+ server.load_models(
543
+ gpu_id=args.gpu_id,
544
+ unet_model_path=args.unet_model_path,
545
+ unet_config=args.unet_config,
546
+ whisper_dir=args.whisper_dir,
547
+ use_float16=not args.no_float16
548
+ )
549
+
550
+ # Start server
551
+ uvicorn.run(app, host=args.host, port=args.port)
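A hedged client-side sketch for the /generate/upload endpoint defined above: it assumes the server is running on localhost:8000 with models loaded, and the sample file names are illustrative.

import requests

# Upload a driving video and an audio clip; the server returns the rendered MP4.
with open("data/video/yongen.mp4", "rb") as vf, open("data/audio/professor_pt.wav", "rb") as af:
    resp = requests.post(
        "http://localhost:8000/generate/upload",
        files={"video": vf, "audio": af},
        data={"fps": 25, "use_cache": "true"},
        timeout=600,
    )
resp.raise_for_status()
with open("result.mp4", "wb") as out:
    out.write(resp.content)
print(resp.headers.get("X-Timings"))  # per-stage timing summary set by the server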
musetalk_api_server_v2.py ADDED
@@ -0,0 +1,445 @@
1
+ """
2
+ MuseTalk HTTP API Server v2
3
+ Optimized for repeated use of the same avatar.
4
+ """
5
+ import os
6
+ import cv2
7
+ import copy
8
+ import torch
9
+ import glob
10
+ import shutil
11
+ import pickle
12
+ import numpy as np
13
+ import subprocess
14
+ import tempfile
15
+ import hashlib
16
+ import time
17
+ from pathlib import Path
18
+ from typing import Optional
19
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
20
+ from fastapi.responses import FileResponse, JSONResponse
21
+ from fastapi.middleware.cors import CORSMiddleware
22
+ from pydantic import BaseModel
23
+ from tqdm import tqdm
24
+ from omegaconf import OmegaConf
25
+ from transformers import WhisperModel
26
+ import uvicorn
27
+
28
+ # MuseTalk imports
29
+ from musetalk.utils.blending import get_image
30
+ from musetalk.utils.face_parsing import FaceParsing
31
+ from musetalk.utils.audio_processor import AudioProcessor
32
+ from musetalk.utils.utils import get_file_type, datagen, load_all_model
33
+ from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder
34
+
35
+
36
+ class MuseTalkServerV2:
37
+ """Server optimized for pre-processed avatars."""
38
+
39
+ def __init__(self):
40
+ self.device = None
41
+ self.vae = None
42
+ self.unet = None
43
+ self.pe = None
44
+ self.whisper = None
45
+ self.audio_processor = None
46
+ self.fp = None
47
+ self.timesteps = None
48
+ self.weight_dtype = None
49
+ self.is_loaded = False
50
+
51
+ # Avatar cache (in-memory)
52
+ self.loaded_avatars = {}
53
+ self.avatar_dir = Path("./avatars")
54
+
55
+ # Config
56
+ self.fps = 25
57
+ self.batch_size = 8
58
+ self.use_float16 = True
59
+ self.version = "v15"
60
+ self.extra_margin = 10
61
+ self.parsing_mode = "jaw"
62
+ self.left_cheek_width = 90
63
+ self.right_cheek_width = 90
64
+ self.audio_padding_left = 2
65
+ self.audio_padding_right = 2
66
+
67
+ def load_models(
68
+ self,
69
+ gpu_id: int = 0,
70
+ unet_model_path: str = "./models/musetalkV15/unet.pth",
71
+ unet_config: str = "./models/musetalk/config.json",
72
+ vae_type: str = "sd-vae",
73
+ whisper_dir: str = "./models/whisper",
74
+ use_float16: bool = True,
75
+ version: str = "v15"
76
+ ):
77
+ if self.is_loaded:
78
+ print("Models already loaded!")
79
+ return
80
+
81
+ print("=" * 50)
82
+ print("Loading MuseTalk models into GPU memory...")
83
+ print("=" * 50)
84
+
85
+ start_time = time.time()
86
+ self.device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
87
+ print(f"Using device: {self.device}")
88
+
89
+ print("Loading VAE, UNet, PE...")
90
+ self.vae, self.unet, self.pe = load_all_model(
91
+ unet_model_path=unet_model_path,
92
+ vae_type=vae_type,
93
+ unet_config=unet_config,
94
+ device=self.device
95
+ )
96
+ self.timesteps = torch.tensor([0], device=self.device)
97
+
98
+ self.use_float16 = use_float16
99
+ if use_float16:
100
+ print("Converting to float16...")
101
+ self.pe = self.pe.half()
102
+ self.vae.vae = self.vae.vae.half()
103
+ self.unet.model = self.unet.model.half()
104
+
105
+ self.pe = self.pe.to(self.device)
106
+ self.vae.vae = self.vae.vae.to(self.device)
107
+ self.unet.model = self.unet.model.to(self.device)
108
+
109
+ print("Loading Whisper model...")
110
+ self.audio_processor = AudioProcessor(feature_extractor_path=whisper_dir)
111
+ self.weight_dtype = self.unet.model.dtype
112
+ self.whisper = WhisperModel.from_pretrained(whisper_dir)
113
+ self.whisper = self.whisper.to(device=self.device, dtype=self.weight_dtype).eval()
114
+ self.whisper.requires_grad_(False)
115
+
116
+ self.version = version
117
+ if version == "v15":
118
+ self.fp = FaceParsing(
119
+ left_cheek_width=self.left_cheek_width,
120
+ right_cheek_width=self.right_cheek_width
121
+ )
122
+ else:
123
+ self.fp = FaceParsing()
124
+
125
+ self.is_loaded = True
126
+ print(f"Models loaded in {time.time() - start_time:.2f}s")
127
+ print("=" * 50)
128
+
129
+ def load_avatar(self, avatar_name: str) -> dict:
130
+ """Load a preprocessed avatar into memory."""
131
+ if avatar_name in self.loaded_avatars:
132
+ return self.loaded_avatars[avatar_name]
133
+
134
+ avatar_path = self.avatar_dir / avatar_name
135
+ if not avatar_path.exists():
136
+ raise FileNotFoundError(f"Avatar not found: {avatar_name}")
137
+
138
+ print(f"Loading avatar '{avatar_name}' into memory...")
139
+ t0 = time.time()
140
+
141
+ avatar_data = {}
142
+
143
+ # Load metadata
144
+ with open(avatar_path / "metadata.pkl", 'rb') as f:
145
+ avatar_data['metadata'] = pickle.load(f)
146
+
147
+ # Load coords
148
+ with open(avatar_path / "coords.pkl", 'rb') as f:
149
+ avatar_data['coord_list'] = pickle.load(f)
150
+
151
+ # Load frames
152
+ with open(avatar_path / "frames.pkl", 'rb') as f:
153
+ avatar_data['frame_list'] = pickle.load(f)
154
+
155
+ # Load latents and convert to GPU tensors
156
+ with open(avatar_path / "latents.pkl", 'rb') as f:
157
+ latents_np = pickle.load(f)
158
+ avatar_data['latent_list'] = [
159
+ torch.from_numpy(l).to(self.device) for l in latents_np
160
+ ]
161
+
162
+ # Load crop info
163
+ with open(avatar_path / "crop_info.pkl", 'rb') as f:
164
+ avatar_data['crop_info'] = pickle.load(f)
165
+
166
+ # Load parsing data (optional)
167
+ parsing_path = avatar_path / "parsing.pkl"
168
+ if parsing_path.exists():
169
+ with open(parsing_path, 'rb') as f:
170
+ avatar_data['parsing_data'] = pickle.load(f)
171
+
172
+ self.loaded_avatars[avatar_name] = avatar_data
173
+ print(f"Avatar loaded in {time.time() - t0:.2f}s")
174
+
175
+ return avatar_data
176
+
177
+ def unload_avatar(self, avatar_name: str):
178
+ """Unload avatar from memory."""
179
+ if avatar_name in self.loaded_avatars:
180
+ del self.loaded_avatars[avatar_name]
181
+ torch.cuda.empty_cache()
182
+
183
+ @torch.no_grad()
184
+ def generate_with_avatar(
185
+ self,
186
+ avatar_name: str,
187
+ audio_path: str,
188
+ output_path: str,
189
+ fps: Optional[int] = None
190
+ ) -> dict:
191
+ """Generate video using pre-processed avatar. Much faster!"""
192
+ if not self.is_loaded:
193
+ raise RuntimeError("Models not loaded!")
194
+
195
+ fps = fps or self.fps
196
+ timings = {}
197
+ total_start = time.time()
198
+
199
+ # Load avatar (cached in memory)
200
+ t0 = time.time()
201
+ avatar = self.load_avatar(avatar_name)
202
+ timings["avatar_load"] = time.time() - t0
203
+
204
+ coord_list = avatar['coord_list']
205
+ frame_list = avatar['frame_list']
206
+ input_latent_list = avatar['latent_list']
207
+
208
+ temp_dir = tempfile.mkdtemp()
209
+
210
+ try:
211
+ # 1. Extract audio features (only audio-dependent step that's heavy)
212
+ t0 = time.time()
213
+ whisper_input_features, librosa_length = self.audio_processor.get_audio_feature(audio_path)
214
+ whisper_chunks = self.audio_processor.get_whisper_chunk(
215
+ whisper_input_features,
216
+ self.device,
217
+ self.weight_dtype,
218
+ self.whisper,
219
+ librosa_length,
220
+ fps=fps,
221
+ audio_padding_length_left=self.audio_padding_left,
222
+ audio_padding_length_right=self.audio_padding_right,
223
+ )
224
+ timings["whisper_features"] = time.time() - t0
225
+
226
+ # 2. Prepare cycled lists
227
+ frame_list_cycle = frame_list + frame_list[::-1]
228
+ coord_list_cycle = coord_list + coord_list[::-1]
229
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
230
+
231
+ # 3. UNet inference
232
+ t0 = time.time()
233
+ gen = datagen(
234
+ whisper_chunks=whisper_chunks,
235
+ vae_encode_latents=input_latent_list_cycle,
236
+ batch_size=self.batch_size,
237
+ delay_frame=0,
238
+ device=self.device,
239
+ )
240
+
241
+ res_frame_list = []
242
+ for whisper_batch, latent_batch in gen:
243
+ audio_feature_batch = self.pe(whisper_batch)
244
+ latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
245
+ pred_latents = self.unet.model(
246
+ latent_batch, self.timesteps,
247
+ encoder_hidden_states=audio_feature_batch
248
+ ).sample
249
+ recon = self.vae.decode_latents(pred_latents)
250
+ for res_frame in recon:
251
+ res_frame_list.append(res_frame)
252
+
253
+ timings["unet_inference"] = time.time() - t0
254
+
255
+ # 4. Face blending
256
+ t0 = time.time()
257
+ result_img_path = os.path.join(temp_dir, "results")
258
+ os.makedirs(result_img_path, exist_ok=True)
259
+
260
+ for i, res_frame in enumerate(res_frame_list):
261
+ bbox = coord_list_cycle[i % len(coord_list_cycle)]
262
+ ori_frame = copy.deepcopy(frame_list_cycle[i % len(frame_list_cycle)])
263
+ x1, y1, x2, y2 = bbox
264
+
265
+ if self.version == "v15":
266
+ y2 = y2 + self.extra_margin
267
+ y2 = min(y2, ori_frame.shape[0])
268
+
269
+ try:
270
+ res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
271
+ except:
272
+ continue
273
+
274
+ if self.version == "v15":
275
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2],
276
+ mode=self.parsing_mode, fp=self.fp)
277
+ else:
278
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], fp=self.fp)
279
+
280
+ cv2.imwrite(f"{result_img_path}/{str(i).zfill(8)}.png", combine_frame)
281
+
282
+ timings["face_blending"] = time.time() - t0
283
+
284
+ # 5. Encode video
285
+ t0 = time.time()
286
+ temp_vid = os.path.join(temp_dir, "temp.mp4")
287
+ cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_path}/%08d.png -vcodec libx264 -vf format=yuv420p -crf 18 {temp_vid}"
288
+ os.system(cmd_img2video)
289
+
290
+ cmd_combine = f"ffmpeg -y -v warning -i {audio_path} -i {temp_vid} {output_path}"
291
+ os.system(cmd_combine)
292
+
293
+ timings["video_encoding"] = time.time() - t0
294
+
295
+ finally:
296
+ shutil.rmtree(temp_dir, ignore_errors=True)
297
+
298
+ timings["total"] = time.time() - total_start
299
+ timings["frames_generated"] = len(res_frame_list)
300
+
301
+ return timings
302
+
303
+
304
+ # Global server instance
305
+ server = MuseTalkServerV2()
306
+
307
+ # FastAPI app
308
+ app = FastAPI(
309
+ title="MuseTalk API v2",
310
+ description="Optimized API for repeated avatar usage",
311
+ version="2.0.0"
312
+ )
313
+
314
+ app.add_middleware(
315
+ CORSMiddleware,
316
+ allow_origins=["*"],
317
+ allow_credentials=True,
318
+ allow_methods=["*"],
319
+ allow_headers=["*"],
320
+ )
321
+
322
+
323
+ @app.on_event("startup")
324
+ async def startup_event():
325
+ server.load_models()
326
+
327
+
328
+ @app.get("/health")
329
+ async def health_check():
330
+ return {
331
+ "status": "ok" if server.is_loaded else "loading",
332
+ "models_loaded": server.is_loaded,
333
+ "device": str(server.device) if server.device else None,
334
+ "loaded_avatars": list(server.loaded_avatars.keys())
335
+ }
336
+
337
+
338
+ @app.get("/avatars")
339
+ async def list_avatars():
340
+ """List all available preprocessed avatars."""
341
+ avatars = []
342
+ for p in server.avatar_dir.iterdir():
343
+ if p.is_dir() and (p / "metadata.pkl").exists():
344
+ with open(p / "metadata.pkl", 'rb') as f:
345
+ metadata = pickle.load(f)
346
+ metadata['loaded'] = p.name in server.loaded_avatars
347
+ avatars.append(metadata)
348
+ return {"avatars": avatars}
349
+
350
+
351
+ @app.post("/avatars/{avatar_name}/load")
352
+ async def load_avatar(avatar_name: str):
353
+ """Pre-load an avatar into GPU memory."""
354
+ try:
355
+ server.load_avatar(avatar_name)
356
+ return {"status": "loaded", "avatar_name": avatar_name}
357
+ except FileNotFoundError as e:
358
+ raise HTTPException(status_code=404, detail=str(e))
359
+
360
+
361
+ @app.post("/avatars/{avatar_name}/unload")
362
+ async def unload_avatar(avatar_name: str):
363
+ """Unload an avatar from memory."""
364
+ server.unload_avatar(avatar_name)
365
+ return {"status": "unloaded", "avatar_name": avatar_name}
366
+
367
+
368
+ class GenerateWithAvatarRequest(BaseModel):
369
+ avatar_name: str
370
+ audio_path: str
371
+ output_path: str
372
+ fps: Optional[int] = 25
373
+
374
+
375
+ @app.post("/generate/avatar")
376
+ async def generate_with_avatar(request: GenerateWithAvatarRequest):
377
+ """Generate video using pre-processed avatar. FAST!"""
378
+ if not server.is_loaded:
379
+ raise HTTPException(status_code=503, detail="Models not loaded")
380
+
381
+ if not os.path.exists(request.audio_path):
382
+ raise HTTPException(status_code=404, detail=f"Audio not found: {request.audio_path}")
383
+
384
+ try:
385
+ timings = server.generate_with_avatar(
386
+ avatar_name=request.avatar_name,
387
+ audio_path=request.audio_path,
388
+ output_path=request.output_path,
389
+ fps=request.fps
390
+ )
391
+ return {
392
+ "status": "success",
393
+ "output_path": request.output_path,
394
+ "timings": timings
395
+ }
396
+ except FileNotFoundError as e:
397
+ raise HTTPException(status_code=404, detail=str(e))
398
+ except Exception as e:
399
+ raise HTTPException(status_code=500, detail=str(e))
400
+
401
+
402
+ @app.post("/generate/avatar/upload")
403
+ async def generate_with_avatar_upload(
404
+ avatar_name: str = Form(...),
405
+ audio: UploadFile = File(...),
406
+ fps: int = Form(25)
407
+ ):
408
+ """Generate video from uploaded audio using pre-processed avatar."""
409
+ if not server.is_loaded:
410
+ raise HTTPException(status_code=503, detail="Models not loaded")
411
+
412
+ temp_dir = tempfile.mkdtemp()
413
+ try:
414
+ audio_path = os.path.join(temp_dir, audio.filename)
415
+ output_path = os.path.join(temp_dir, "output.mp4")
416
+
417
+ with open(audio_path, "wb") as f:
418
+ f.write(await audio.read())
419
+
420
+ timings = server.generate_with_avatar(
421
+ avatar_name=avatar_name,
422
+ audio_path=audio_path,
423
+ output_path=output_path,
424
+ fps=fps
425
+ )
426
+
427
+ return FileResponse(
428
+ output_path,
429
+ media_type="video/mp4",
430
+ filename="result.mp4",
431
+ headers={"X-Timings": str(timings)}
432
+ )
433
+ except Exception as e:
434
+ shutil.rmtree(temp_dir, ignore_errors=True)
435
+ raise HTTPException(status_code=500, detail=str(e))
436
+
437
+
438
+ if __name__ == "__main__":
439
+ import argparse
440
+ parser = argparse.ArgumentParser()
441
+ parser.add_argument("--host", type=str, default="0.0.0.0")
442
+ parser.add_argument("--port", type=int, default=8000)
443
+ args = parser.parse_args()
444
+
445
+ uvicorn.run(app, host=args.host, port=args.port)
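A similar sketch for the v2 preprocessed-avatar flow: it assumes a server on localhost:8000 and an existing ./avatars/mariana_hd folder (the avatar name is only an example, borrowed from avatar_pipeline.py).

import requests

BASE_URL = "http://localhost:8000"

# Optionally pre-load the avatar; /generate/avatar would also load it lazily.
requests.post(f"{BASE_URL}/avatars/mariana_hd/load").raise_for_status()

# Generate using audio already present on the server.
resp = requests.post(
    f"{BASE_URL}/generate/avatar",
    json={
        "avatar_name": "mariana_hd",
        "audio_path": "data/audio/professor_pt.wav",
        "output_path": "results/mariana_reply.mp4",
        "fps": 25,
    },
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["timings"])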
musetalk_api_server_v3.py ADDED
@@ -0,0 +1,651 @@
1
+ """
2
+ MuseTalk HTTP API Server v3
3
+ Ultra-optimized with:
4
+ 1. Parallel face blending (thread-pool workers)
5
+ 2. NVENC hardware video encoding
6
+ 3. Batch audio processing
7
+ """
8
+ import os
9
+ import cv2
10
+ import copy
11
+ import torch
12
+ import glob
13
+ import shutil
14
+ import pickle
15
+ import numpy as np
16
+ import subprocess
17
+ import tempfile
18
+ import hashlib
19
+ import time
20
+ import asyncio
21
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
22
+ from pathlib import Path
23
+ from typing import Optional, List
24
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
25
+ from fastapi.responses import FileResponse, JSONResponse
26
+ from fastapi.middleware.cors import CORSMiddleware
27
+ from pydantic import BaseModel
28
+ from tqdm import tqdm
29
+ from omegaconf import OmegaConf
30
+ from transformers import WhisperModel
31
+ import uvicorn
32
+ import multiprocessing as mp
33
+
34
+ # MuseTalk imports
35
+ from musetalk.utils.blending import get_image
36
+ from musetalk.utils.face_parsing import FaceParsing
37
+ from musetalk.utils.audio_processor import AudioProcessor
38
+ from musetalk.utils.utils import get_file_type, datagen, load_all_model
39
+ from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder
40
+
41
+
42
+ def blend_single_frame(args):
43
+ """Worker function for parallel face blending."""
44
+ i, res_frame, bbox, ori_frame, extra_margin, version, parsing_mode, fp_config = args
45
+
46
+ x1, y1, x2, y2 = bbox
47
+ if version == "v15":
48
+ y2 = y2 + extra_margin
49
+ y2 = min(y2, ori_frame.shape[0])
50
+
51
+ try:
52
+ res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
53
+ except:
54
+ return i, None
55
+
56
+ # Create FaceParsing instance for this worker
57
+ fp = FaceParsing(
58
+ left_cheek_width=fp_config['left_cheek_width'],
59
+ right_cheek_width=fp_config['right_cheek_width']
60
+ )
61
+
62
+ if version == "v15":
63
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2],
64
+ mode=parsing_mode, fp=fp)
65
+ else:
66
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], fp=fp)
67
+
68
+ return i, combine_frame
69
+
70
+
71
+ class MuseTalkServerV3:
72
+ """Ultra-optimized server."""
73
+
74
+ def __init__(self):
75
+ self.device = None
76
+ self.vae = None
77
+ self.unet = None
78
+ self.pe = None
79
+ self.whisper = None
80
+ self.audio_processor = None
81
+ self.fp = None
82
+ self.timesteps = None
83
+ self.weight_dtype = None
84
+ self.is_loaded = False
85
+
86
+ # Avatar cache
87
+ self.loaded_avatars = {}
88
+ self.avatar_dir = Path("./avatars")
89
+
90
+ # Config
91
+ self.fps = 25
92
+ self.batch_size = 8
93
+ self.use_float16 = True
94
+ self.version = "v15"
95
+ self.extra_margin = 10
96
+ self.parsing_mode = "jaw"
97
+ self.left_cheek_width = 90
98
+ self.right_cheek_width = 90
99
+ self.audio_padding_left = 2
100
+ self.audio_padding_right = 2
101
+
102
+ # Thread pool for parallel blending
103
+ self.num_workers = min(8, mp.cpu_count())
104
+ self.thread_pool = ThreadPoolExecutor(max_workers=self.num_workers)
105
+
106
+ # NVENC settings
107
+ self.use_nvenc = True
108
+ self.nvenc_preset = "p4" # p1(fastest) to p7(best quality)
109
+ self.crf = 23
110
+
111
+ def load_models(
112
+ self,
113
+ gpu_id: int = 0,
114
+ unet_model_path: str = "./models/musetalkV15/unet.pth",
115
+ unet_config: str = "./models/musetalk/config.json",
116
+ vae_type: str = "sd-vae",
117
+ whisper_dir: str = "./models/whisper",
118
+ use_float16: bool = True,
119
+ version: str = "v15"
120
+ ):
121
+ if self.is_loaded:
122
+ print("Models already loaded!")
123
+ return
124
+
125
+ print("=" * 50)
126
+ print("Loading MuseTalk models (v3 Ultra-Optimized)...")
127
+ print("=" * 50)
128
+
129
+ start_time = time.time()
130
+ self.device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
131
+ print(f"Using device: {self.device}")
132
+ print(f"Parallel workers: {self.num_workers}")
133
+ print(f"NVENC encoding: {self.use_nvenc}")
134
+
135
+ print("Loading VAE, UNet, PE...")
136
+ self.vae, self.unet, self.pe = load_all_model(
137
+ unet_model_path=unet_model_path,
138
+ vae_type=vae_type,
139
+ unet_config=unet_config,
140
+ device=self.device
141
+ )
142
+ self.timesteps = torch.tensor([0], device=self.device)
143
+
144
+ self.use_float16 = use_float16
145
+ if use_float16:
146
+ print("Converting to float16...")
147
+ self.pe = self.pe.half()
148
+ self.vae.vae = self.vae.vae.half()
149
+ self.unet.model = self.unet.model.half()
150
+
151
+ self.pe = self.pe.to(self.device)
152
+ self.vae.vae = self.vae.vae.to(self.device)
153
+ self.unet.model = self.unet.model.to(self.device)
154
+
155
+ print("Loading Whisper model...")
156
+ self.audio_processor = AudioProcessor(feature_extractor_path=whisper_dir)
157
+ self.weight_dtype = self.unet.model.dtype
158
+ self.whisper = WhisperModel.from_pretrained(whisper_dir)
159
+ self.whisper = self.whisper.to(device=self.device, dtype=self.weight_dtype).eval()
160
+ self.whisper.requires_grad_(False)
161
+
162
+ self.version = version
163
+ if version == "v15":
164
+ self.fp = FaceParsing(
165
+ left_cheek_width=self.left_cheek_width,
166
+ right_cheek_width=self.right_cheek_width
167
+ )
168
+ else:
169
+ self.fp = FaceParsing()
170
+
171
+ self.is_loaded = True
172
+ print(f"Models loaded in {time.time() - start_time:.2f}s")
173
+ print("=" * 50)
174
+
175
+ def load_avatar(self, avatar_name: str) -> dict:
176
+ if avatar_name in self.loaded_avatars:
177
+ return self.loaded_avatars[avatar_name]
178
+
179
+ avatar_path = self.avatar_dir / avatar_name
180
+ if not avatar_path.exists():
181
+ raise FileNotFoundError(f"Avatar not found: {avatar_name}")
182
+
183
+ print(f"Loading avatar '{avatar_name}' into memory...")
184
+ t0 = time.time()
185
+
186
+ avatar_data = {}
187
+
188
+ with open(avatar_path / "metadata.pkl", 'rb') as f:
189
+ avatar_data['metadata'] = pickle.load(f)
190
+
191
+ with open(avatar_path / "coords.pkl", 'rb') as f:
192
+ avatar_data['coord_list'] = pickle.load(f)
193
+
194
+ with open(avatar_path / "frames.pkl", 'rb') as f:
195
+ avatar_data['frame_list'] = pickle.load(f)
196
+
197
+ with open(avatar_path / "latents.pkl", 'rb') as f:
198
+ latents_np = pickle.load(f)
199
+ avatar_data['latent_list'] = [
200
+ torch.from_numpy(l).to(self.device) for l in latents_np
201
+ ]
202
+
203
+ with open(avatar_path / "crop_info.pkl", 'rb') as f:
204
+ avatar_data['crop_info'] = pickle.load(f)
205
+
206
+ self.loaded_avatars[avatar_name] = avatar_data
207
+ print(f"Avatar loaded in {time.time() - t0:.2f}s")
208
+
209
+ return avatar_data
210
+
211
+ def unload_avatar(self, avatar_name: str):
212
+ if avatar_name in self.loaded_avatars:
213
+ del self.loaded_avatars[avatar_name]
214
+ torch.cuda.empty_cache()
215
+
216
+ def _encode_video_nvenc(self, frames_dir: str, audio_path: str, output_path: str, fps: int) -> float:
217
+ """Encode video using NVENC hardware acceleration."""
218
+ t0 = time.time()
219
+ temp_vid = frames_dir.replace('/results', '/temp.mp4')
220
+
221
+ if self.use_nvenc:
222
+ # NVENC H.264 encoding (much faster)
223
+ cmd_img2video = (
224
+ f"ffmpeg -y -v warning -r {fps} -f image2 -i {frames_dir}/%08d.png "
225
+ f"-c:v h264_nvenc -preset {self.nvenc_preset} -cq {self.crf} "
226
+ f"-pix_fmt yuv420p {temp_vid}"
227
+ )
228
+ else:
229
+ # Fallback to CPU encoding
230
+ cmd_img2video = (
231
+ f"ffmpeg -y -v warning -r {fps} -f image2 -i {frames_dir}/%08d.png "
232
+ f"-vcodec libx264 -vf format=yuv420p -crf 18 {temp_vid}"
233
+ )
234
+
235
+ os.system(cmd_img2video)
236
+
237
+ # Add audio
238
+ cmd_combine = f"ffmpeg -y -v warning -i {audio_path} -i {temp_vid} -c:v copy -c:a aac {output_path}"
239
+ os.system(cmd_combine)
240
+
241
+ # Cleanup temp video
242
+ if os.path.exists(temp_vid):
243
+ os.remove(temp_vid)
244
+
245
+ return time.time() - t0
246
+
247
+ def _parallel_face_blending(self, res_frame_list, coord_list_cycle, frame_list_cycle, result_img_path) -> float:
248
+ """Parallel face blending using thread pool."""
249
+ t0 = time.time()
250
+
251
+ fp_config = {
252
+ 'left_cheek_width': self.left_cheek_width,
253
+ 'right_cheek_width': self.right_cheek_width
254
+ }
255
+
256
+ # Prepare all tasks
257
+ tasks = []
258
+ for i, res_frame in enumerate(res_frame_list):
259
+ bbox = coord_list_cycle[i % len(coord_list_cycle)]
260
+ ori_frame = copy.deepcopy(frame_list_cycle[i % len(frame_list_cycle)])
261
+ tasks.append((
262
+ i, res_frame, bbox, ori_frame,
263
+ self.extra_margin, self.version, self.parsing_mode, fp_config
264
+ ))
265
+
266
+ # Process in parallel
267
+ results = list(self.thread_pool.map(blend_single_frame, tasks))
268
+
269
+ # Sort and save results
270
+ results.sort(key=lambda x: x[0])
271
+ for i, combine_frame in results:
272
+ if combine_frame is not None:
273
+ cv2.imwrite(f"{result_img_path}/{str(i).zfill(8)}.png", combine_frame)
274
+
275
+ return time.time() - t0
276
+
277
+ @torch.no_grad()
278
+ def generate_with_avatar(
279
+ self,
280
+ avatar_name: str,
281
+ audio_path: str,
282
+ output_path: str,
283
+ fps: Optional[int] = None,
284
+ use_parallel_blending: bool = True
285
+ ) -> dict:
286
+ """Generate video using pre-processed avatar with all optimizations."""
287
+ if not self.is_loaded:
288
+ raise RuntimeError("Models not loaded!")
289
+
290
+ fps = fps or self.fps
291
+ timings = {}
292
+ total_start = time.time()
293
+
294
+ # Load avatar
295
+ t0 = time.time()
296
+ avatar = self.load_avatar(avatar_name)
297
+ timings["avatar_load"] = time.time() - t0
298
+
299
+ coord_list = avatar['coord_list']
300
+ frame_list = avatar['frame_list']
301
+ input_latent_list = avatar['latent_list']
302
+
303
+ temp_dir = tempfile.mkdtemp()
304
+
305
+ try:
306
+ # 1. Extract audio features
307
+ t0 = time.time()
308
+ whisper_input_features, librosa_length = self.audio_processor.get_audio_feature(audio_path)
309
+ whisper_chunks = self.audio_processor.get_whisper_chunk(
310
+ whisper_input_features,
311
+ self.device,
312
+ self.weight_dtype,
313
+ self.whisper,
314
+ librosa_length,
315
+ fps=fps,
316
+ audio_padding_length_left=self.audio_padding_left,
317
+ audio_padding_length_right=self.audio_padding_right,
318
+ )
319
+ timings["whisper_features"] = time.time() - t0
320
+
321
+ # 2. Prepare cycled lists
322
+ frame_list_cycle = frame_list + frame_list[::-1]
323
+ coord_list_cycle = coord_list + coord_list[::-1]
324
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
325
+
326
+ # 3. UNet inference
327
+ t0 = time.time()
328
+ gen = datagen(
329
+ whisper_chunks=whisper_chunks,
330
+ vae_encode_latents=input_latent_list_cycle,
331
+ batch_size=self.batch_size,
332
+ delay_frame=0,
333
+ device=self.device,
334
+ )
335
+
336
+ res_frame_list = []
337
+ for whisper_batch, latent_batch in gen:
338
+ audio_feature_batch = self.pe(whisper_batch)
339
+ latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
340
+ pred_latents = self.unet.model(
341
+ latent_batch, self.timesteps,
342
+ encoder_hidden_states=audio_feature_batch
343
+ ).sample
344
+ recon = self.vae.decode_latents(pred_latents)
345
+ for res_frame in recon:
346
+ res_frame_list.append(res_frame)
347
+
348
+ timings["unet_inference"] = time.time() - t0
349
+
350
+ # 4. Face blending (parallel or sequential)
351
+ result_img_path = os.path.join(temp_dir, "results")
352
+ os.makedirs(result_img_path, exist_ok=True)
353
+
354
+ if use_parallel_blending:
355
+ timings["face_blending"] = self._parallel_face_blending(
356
+ res_frame_list, coord_list_cycle, frame_list_cycle, result_img_path
357
+ )
358
+ timings["blending_mode"] = "parallel"
359
+ else:
360
+ t0 = time.time()
361
+ for i, res_frame in enumerate(res_frame_list):
362
+ bbox = coord_list_cycle[i % len(coord_list_cycle)]
363
+ ori_frame = copy.deepcopy(frame_list_cycle[i % len(frame_list_cycle)])
364
+ x1, y1, x2, y2 = bbox
365
+
366
+ if self.version == "v15":
367
+ y2 = y2 + self.extra_margin
368
+ y2 = min(y2, ori_frame.shape[0])
369
+
370
+ try:
371
+ res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
372
+ except Exception:
373
+ continue
374
+
375
+ if self.version == "v15":
376
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2],
377
+ mode=self.parsing_mode, fp=self.fp)
378
+ else:
379
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], fp=self.fp)
380
+
381
+ cv2.imwrite(f"{result_img_path}/{str(i).zfill(8)}.png", combine_frame)
382
+ timings["face_blending"] = time.time() - t0
383
+ timings["blending_mode"] = "sequential"
384
+
385
+ # 5. Video encoding (NVENC)
386
+ timings["video_encoding"] = self._encode_video_nvenc(
387
+ result_img_path, audio_path, output_path, fps
388
+ )
389
+ timings["encoding_mode"] = "nvenc" if self.use_nvenc else "cpu"
390
+
391
+ finally:
392
+ shutil.rmtree(temp_dir, ignore_errors=True)
393
+
394
+ timings["total"] = time.time() - total_start
395
+ timings["frames_generated"] = len(res_frame_list)
396
+
397
+ return timings
398
+
399
+ @torch.no_grad()
400
+ def generate_batch(
401
+ self,
402
+ avatar_name: str,
403
+ audio_paths: List[str],
404
+ output_dir: str,
405
+ fps: Optional[int] = None
406
+ ) -> dict:
407
+ """Generate multiple videos from multiple audios efficiently."""
408
+ if not self.is_loaded:
409
+ raise RuntimeError("Models not loaded!")
410
+
411
+ fps = fps or self.fps
412
+ batch_timings = {"videos": [], "total": 0}
413
+ total_start = time.time()
414
+
415
+ # Load avatar once
416
+ t0 = time.time()
417
+ avatar = self.load_avatar(avatar_name)
418
+ batch_timings["avatar_load"] = time.time() - t0
419
+
420
+ coord_list = avatar['coord_list']
421
+ frame_list = avatar['frame_list']
422
+ input_latent_list = avatar['latent_list']
423
+
424
+ # Prepare cycled lists once
425
+ frame_list_cycle = frame_list + frame_list[::-1]
426
+ coord_list_cycle = coord_list + coord_list[::-1]
427
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
428
+
429
+ os.makedirs(output_dir, exist_ok=True)
430
+
431
+ for idx, audio_path in enumerate(audio_paths):
432
+ video_start = time.time()
433
+ timings = {}
434
+
435
+ audio_name = Path(audio_path).stem
436
+ output_path = os.path.join(output_dir, f"{audio_name}.mp4")
437
+
438
+ temp_dir = tempfile.mkdtemp()
439
+
440
+ try:
441
+ # 1. Extract audio features
442
+ t0 = time.time()
443
+ whisper_input_features, librosa_length = self.audio_processor.get_audio_feature(audio_path)
444
+ whisper_chunks = self.audio_processor.get_whisper_chunk(
445
+ whisper_input_features,
446
+ self.device,
447
+ self.weight_dtype,
448
+ self.whisper,
449
+ librosa_length,
450
+ fps=fps,
451
+ audio_padding_length_left=self.audio_padding_left,
452
+ audio_padding_length_right=self.audio_padding_right,
453
+ )
454
+ timings["whisper_features"] = time.time() - t0
455
+
456
+ # 2. UNet inference
457
+ t0 = time.time()
458
+ gen = datagen(
459
+ whisper_chunks=whisper_chunks,
460
+ vae_encode_latents=input_latent_list_cycle,
461
+ batch_size=self.batch_size,
462
+ delay_frame=0,
463
+ device=self.device,
464
+ )
465
+
466
+ res_frame_list = []
467
+ for whisper_batch, latent_batch in gen:
468
+ audio_feature_batch = self.pe(whisper_batch)
469
+ latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
470
+ pred_latents = self.unet.model(
471
+ latent_batch, self.timesteps,
472
+ encoder_hidden_states=audio_feature_batch
473
+ ).sample
474
+ recon = self.vae.decode_latents(pred_latents)
475
+ for res_frame in recon:
476
+ res_frame_list.append(res_frame)
477
+
478
+ timings["unet_inference"] = time.time() - t0
479
+
480
+ # 3. Face blending (parallel)
481
+ result_img_path = os.path.join(temp_dir, "results")
482
+ os.makedirs(result_img_path, exist_ok=True)
483
+ timings["face_blending"] = self._parallel_face_blending(
484
+ res_frame_list, coord_list_cycle, frame_list_cycle, result_img_path
485
+ )
486
+
487
+ # 4. Video encoding (NVENC)
488
+ timings["video_encoding"] = self._encode_video_nvenc(
489
+ result_img_path, audio_path, output_path, fps
490
+ )
491
+
492
+ finally:
493
+ shutil.rmtree(temp_dir, ignore_errors=True)
494
+
495
+ timings["total"] = time.time() - video_start
496
+ timings["frames_generated"] = len(res_frame_list)
497
+ timings["output_path"] = output_path
498
+ timings["audio_path"] = audio_path
499
+
500
+ batch_timings["videos"].append(timings)
501
+ print(f" [{idx+1}/{len(audio_paths)}] {audio_name}: {timings['total']:.2f}s")
502
+
503
+ batch_timings["total"] = time.time() - total_start
504
+ batch_timings["num_videos"] = len(audio_paths)
505
+ batch_timings["avg_per_video"] = batch_timings["total"] / len(audio_paths) if audio_paths else 0
506
+
507
+ return batch_timings
508
+
509
+
510
+ # Global server
511
+ server = MuseTalkServerV3()
512
+
513
+ # FastAPI app
514
+ app = FastAPI(
515
+ title="MuseTalk API v3",
516
+ description="Ultra-optimized API with parallel blending, NVENC, and batch processing",
517
+ version="3.0.0"
518
+ )
519
+
520
+ app.add_middleware(
521
+ CORSMiddleware,
522
+ allow_origins=["*"],
523
+ allow_credentials=True,
524
+ allow_methods=["*"],
525
+ allow_headers=["*"],
526
+ )
527
+
528
+
529
+ @app.on_event("startup")
530
+ async def startup_event():
531
+ server.load_models()
532
+
533
+
534
+ @app.get("/health")
535
+ async def health_check():
536
+ return {
537
+ "status": "ok" if server.is_loaded else "loading",
538
+ "models_loaded": server.is_loaded,
539
+ "device": str(server.device) if server.device else None,
540
+ "loaded_avatars": list(server.loaded_avatars.keys()),
541
+ "optimizations": {
542
+ "parallel_workers": server.num_workers,
543
+ "nvenc_enabled": server.use_nvenc,
544
+ "nvenc_preset": server.nvenc_preset
545
+ }
546
+ }
547
+
548
+
549
+ @app.get("/avatars")
550
+ async def list_avatars():
551
+ avatars = []
552
+ for p in server.avatar_dir.iterdir():
553
+ if p.is_dir() and (p / "metadata.pkl").exists():
554
+ with open(p / "metadata.pkl", 'rb') as f:
555
+ metadata = pickle.load(f)
556
+ metadata['loaded'] = p.name in server.loaded_avatars
557
+ avatars.append(metadata)
558
+ return {"avatars": avatars}
559
+
560
+
561
+ @app.post("/avatars/{avatar_name}/load")
562
+ async def load_avatar(avatar_name: str):
563
+ try:
564
+ server.load_avatar(avatar_name)
565
+ return {"status": "loaded", "avatar_name": avatar_name}
566
+ except FileNotFoundError as e:
567
+ raise HTTPException(status_code=404, detail=str(e))
568
+
569
+
570
+ @app.post("/avatars/{avatar_name}/unload")
571
+ async def unload_avatar(avatar_name: str):
572
+ server.unload_avatar(avatar_name)
573
+ return {"status": "unloaded", "avatar_name": avatar_name}
574
+
575
+
576
+ class GenerateRequest(BaseModel):
577
+ avatar_name: str
578
+ audio_path: str
579
+ output_path: str
580
+ fps: Optional[int] = 25
581
+ use_parallel_blending: bool = True
582
+
583
+
584
+ @app.post("/generate/avatar")
585
+ async def generate_with_avatar(request: GenerateRequest):
586
+ if not server.is_loaded:
587
+ raise HTTPException(status_code=503, detail="Models not loaded")
588
+
589
+ if not os.path.exists(request.audio_path):
590
+ raise HTTPException(status_code=404, detail=f"Audio not found: {request.audio_path}")
591
+
592
+ try:
593
+ timings = server.generate_with_avatar(
594
+ avatar_name=request.avatar_name,
595
+ audio_path=request.audio_path,
596
+ output_path=request.output_path,
597
+ fps=request.fps,
598
+ use_parallel_blending=request.use_parallel_blending
599
+ )
600
+ return {
601
+ "status": "success",
602
+ "output_path": request.output_path,
603
+ "timings": timings
604
+ }
605
+ except FileNotFoundError as e:
606
+ raise HTTPException(status_code=404, detail=str(e))
607
+ except Exception as e:
608
+ raise HTTPException(status_code=500, detail=str(e))
609
+
610
+
611
+ class BatchGenerateRequest(BaseModel):
612
+ avatar_name: str
613
+ audio_paths: List[str]
614
+ output_dir: str
615
+ fps: Optional[int] = 25
616
+
617
+
618
+ @app.post("/generate/batch")
619
+ async def generate_batch(request: BatchGenerateRequest):
620
+ """Generate multiple videos from multiple audios."""
621
+ if not server.is_loaded:
622
+ raise HTTPException(status_code=503, detail="Models not loaded")
623
+
624
+ for audio_path in request.audio_paths:
625
+ if not os.path.exists(audio_path):
626
+ raise HTTPException(status_code=404, detail=f"Audio not found: {audio_path}")
627
+
628
+ try:
629
+ timings = server.generate_batch(
630
+ avatar_name=request.avatar_name,
631
+ audio_paths=request.audio_paths,
632
+ output_dir=request.output_dir,
633
+ fps=request.fps
634
+ )
635
+ return {
636
+ "status": "success",
637
+ "output_dir": request.output_dir,
638
+ "timings": timings
639
+ }
640
+ except Exception as e:
641
+ raise HTTPException(status_code=500, detail=str(e))
642
+
643
+
644
+ if __name__ == "__main__":
645
+ import argparse
646
+ parser = argparse.ArgumentParser()
647
+ parser.add_argument("--host", type=str, default="0.0.0.0")
648
+ parser.add_argument("--port", type=int, default=8000)
649
+ args = parser.parse_args()
650
+
651
+ uvicorn.run(app, host=args.host, port=args.port)
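For reference, a minimal client sketch for the two generation endpoints defined above. The base URL, avatar name, and file paths are assumptions for illustration; they must exist on the machine running the server, since the API works with server-side paths rather than uploads.

```python
# Hypothetical client for the v3 API above; URL, avatar name and paths are placeholders.
import requests

BASE_URL = "http://localhost:8000"  # assumed host/port (matches the argparse defaults above)

# Single video from a pre-processed avatar
resp = requests.post(f"{BASE_URL}/generate/avatar", json={
    "avatar_name": "my_avatar",               # assumed avatar id under ./avatars
    "audio_path": "data/audio/question.wav",  # server-side path
    "output_path": "results/answer.mp4",
    "fps": 25,
    "use_parallel_blending": True,
})
resp.raise_for_status()
print(resp.json()["timings"])

# Several audios against the same avatar in one call
resp = requests.post(f"{BASE_URL}/generate/batch", json={
    "avatar_name": "my_avatar",
    "audio_paths": ["data/audio/a1.wav", "data/audio/a2.wav"],
    "output_dir": "results/batch",
    "fps": 25,
})
print(resp.json()["timings"]["avg_per_video"])
```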
musetalk_api_server_v3_fixed.py ADDED
@@ -0,0 +1,371 @@
1
+ """
2
+ MuseTalk HTTP API Server v3 (Fixed)
3
+ Optimized with:
4
+ 1. Sequential face blending (the parallel version added more overhead than it saved)
5
+ 2. NVENC hardware video encoding
6
+ 3. Batch audio processing
7
+ """
8
+ import os
9
+ import cv2
10
+ import copy
11
+ import torch
12
+ import glob
13
+ import shutil
14
+ import pickle
15
+ import numpy as np
16
+ import subprocess
17
+ import tempfile
18
+ import hashlib
19
+ import time
20
+ from pathlib import Path
21
+ from typing import Optional, List
22
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException
23
+ from fastapi.responses import FileResponse, JSONResponse
24
+ from fastapi.middleware.cors import CORSMiddleware
25
+ from pydantic import BaseModel
26
+ from tqdm import tqdm
27
+ from transformers import WhisperModel
28
+ import uvicorn
29
+
30
+ # MuseTalk imports
31
+ from musetalk.utils.blending import get_image
32
+ from musetalk.utils.face_parsing import FaceParsing
33
+ from musetalk.utils.audio_processor import AudioProcessor
34
+ from musetalk.utils.utils import get_file_type, datagen, load_all_model
35
+ from musetalk.utils.preprocessing import coord_placeholder
36
+
37
+
38
+ class MuseTalkServerV3:
39
+ def __init__(self):
40
+ self.device = None
41
+ self.vae = None
42
+ self.unet = None
43
+ self.pe = None
44
+ self.whisper = None
45
+ self.audio_processor = None
46
+ self.fp = None
47
+ self.timesteps = None
48
+ self.weight_dtype = None
49
+ self.is_loaded = False
50
+
51
+ self.loaded_avatars = {}
52
+ self.avatar_dir = Path("./avatars")
53
+
54
+ self.fps = 25
55
+ self.batch_size = 8
56
+ self.use_float16 = True
57
+ self.version = "v15"
58
+ self.extra_margin = 10
59
+ self.parsing_mode = "jaw"
60
+ self.left_cheek_width = 90
61
+ self.right_cheek_width = 90
62
+ self.audio_padding_left = 2
63
+ self.audio_padding_right = 2
64
+
65
+ # NVENC
66
+ self.use_nvenc = True
67
+ self.nvenc_preset = "p4"
68
+ self.crf = 23
69
+
70
+ def load_models(self, gpu_id: int = 0):
71
+ if self.is_loaded:
72
+ print("Models already loaded!")
73
+ return
74
+
75
+ print("=" * 50)
76
+ print("Loading MuseTalk models (v3 Optimized)...")
77
+ print("=" * 50)
78
+
79
+ start_time = time.time()
80
+ self.device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
81
+
82
+ self.vae, self.unet, self.pe = load_all_model(
83
+ unet_model_path="./models/musetalkV15/unet.pth",
84
+ vae_type="sd-vae",
85
+ unet_config="./models/musetalk/config.json",
86
+ device=self.device
87
+ )
88
+ self.timesteps = torch.tensor([0], device=self.device)
89
+
90
+ self.pe = self.pe.half().to(self.device)
91
+ self.vae.vae = self.vae.vae.half().to(self.device)
92
+ self.unet.model = self.unet.model.half().to(self.device)
93
+
94
+ self.audio_processor = AudioProcessor(feature_extractor_path="./models/whisper")
95
+ self.weight_dtype = self.unet.model.dtype
96
+ self.whisper = WhisperModel.from_pretrained("./models/whisper")
97
+ self.whisper = self.whisper.to(device=self.device, dtype=self.weight_dtype).eval()
98
+ self.whisper.requires_grad_(False)
99
+
100
+ self.fp = FaceParsing(
101
+ left_cheek_width=self.left_cheek_width,
102
+ right_cheek_width=self.right_cheek_width
103
+ )
104
+
105
+ self.is_loaded = True
106
+ print(f"Models loaded in {time.time() - start_time:.2f}s")
107
+
108
+ def load_avatar(self, avatar_name: str) -> dict:
109
+ if avatar_name in self.loaded_avatars:
110
+ return self.loaded_avatars[avatar_name]
111
+
112
+ avatar_path = self.avatar_dir / avatar_name
113
+ if not avatar_path.exists():
114
+ raise FileNotFoundError(f"Avatar not found: {avatar_name}")
115
+
116
+ avatar_data = {}
117
+ with open(avatar_path / "metadata.pkl", 'rb') as f:
118
+ avatar_data['metadata'] = pickle.load(f)
119
+ with open(avatar_path / "coords.pkl", 'rb') as f:
120
+ avatar_data['coord_list'] = pickle.load(f)
121
+ with open(avatar_path / "frames.pkl", 'rb') as f:
122
+ avatar_data['frame_list'] = pickle.load(f)
123
+ with open(avatar_path / "latents.pkl", 'rb') as f:
124
+ latents_np = pickle.load(f)
125
+ avatar_data['latent_list'] = [torch.from_numpy(l).to(self.device) for l in latents_np]
126
+
127
+ self.loaded_avatars[avatar_name] = avatar_data
128
+ return avatar_data
129
+
130
+ def _encode_video_nvenc(self, frames_dir: str, audio_path: str, output_path: str, fps: int) -> float:
131
+ t0 = time.time()
132
+ temp_vid = output_path.replace('.mp4', '_temp.mp4')
133
+
134
+ if self.use_nvenc:
135
+ cmd = (
136
+ f"ffmpeg -y -v warning -r {fps} -f image2 -i {frames_dir}/%08d.png "
137
+ f"-c:v h264_nvenc -preset {self.nvenc_preset} -cq {self.crf} -pix_fmt yuv420p {temp_vid}"
138
+ )
139
+ else:
140
+ cmd = (
141
+ f"ffmpeg -y -v warning -r {fps} -f image2 -i {frames_dir}/%08d.png "
142
+ f"-vcodec libx264 -crf 18 -pix_fmt yuv420p {temp_vid}"
143
+ )
144
+ os.system(cmd)
145
+
146
+ os.system(f"ffmpeg -y -v warning -i {audio_path} -i {temp_vid} -c:v copy -c:a aac {output_path}")
147
+ if os.path.exists(temp_vid): os.remove(temp_vid)
148
+
149
+ return time.time() - t0
150
+
151
+ @torch.no_grad()
152
+ def generate_with_avatar(self, avatar_name: str, audio_path: str, output_path: str, fps: int = 25) -> dict:
153
+ if not self.is_loaded:
154
+ raise RuntimeError("Models not loaded!")
155
+
156
+ timings = {}
157
+ total_start = time.time()
158
+
159
+ t0 = time.time()
160
+ avatar = self.load_avatar(avatar_name)
161
+ timings["avatar_load"] = time.time() - t0
162
+
163
+ coord_list = avatar['coord_list']
164
+ frame_list = avatar['frame_list']
165
+ input_latent_list = avatar['latent_list']
166
+
167
+ temp_dir = tempfile.mkdtemp()
168
+
169
+ try:
170
+ # Whisper
171
+ t0 = time.time()
172
+ whisper_input_features, librosa_length = self.audio_processor.get_audio_feature(audio_path)
173
+ whisper_chunks = self.audio_processor.get_whisper_chunk(
174
+ whisper_input_features, self.device, self.weight_dtype, self.whisper,
175
+ librosa_length, fps=fps,
176
+ audio_padding_length_left=self.audio_padding_left,
177
+ audio_padding_length_right=self.audio_padding_right,
178
+ )
179
+ timings["whisper_features"] = time.time() - t0
180
+
181
+ # Cycle lists
182
+ frame_list_cycle = frame_list + frame_list[::-1]
183
+ coord_list_cycle = coord_list + coord_list[::-1]
184
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
185
+
186
+ # UNet
187
+ t0 = time.time()
188
+ gen = datagen(whisper_chunks=whisper_chunks, vae_encode_latents=input_latent_list_cycle,
189
+ batch_size=self.batch_size, delay_frame=0, device=self.device)
190
+
191
+ res_frame_list = []
192
+ for whisper_batch, latent_batch in gen:
193
+ audio_feature_batch = self.pe(whisper_batch)
194
+ latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
195
+ pred_latents = self.unet.model(latent_batch, self.timesteps,
196
+ encoder_hidden_states=audio_feature_batch).sample
197
+ recon = self.vae.decode_latents(pred_latents)
198
+ res_frame_list.extend(recon)
199
+ timings["unet_inference"] = time.time() - t0
200
+
201
+ # Face blending (sequential - faster than parallel due to FP overhead)
202
+ t0 = time.time()
203
+ result_img_path = os.path.join(temp_dir, "results")
204
+ os.makedirs(result_img_path, exist_ok=True)
205
+
206
+ for i, res_frame in enumerate(res_frame_list):
207
+ bbox = coord_list_cycle[i % len(coord_list_cycle)]
208
+ ori_frame = copy.deepcopy(frame_list_cycle[i % len(frame_list_cycle)])
209
+ x1, y1, x2, y2 = bbox
210
+ y2 = min(y2 + self.extra_margin, ori_frame.shape[0])
211
+
212
+ try:
213
+ res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
214
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2],
215
+ mode=self.parsing_mode, fp=self.fp)
216
+ cv2.imwrite(f"{result_img_path}/{str(i).zfill(8)}.png", combine_frame)
217
+ except Exception:
218
+ continue
219
+ timings["face_blending"] = time.time() - t0
220
+
221
+ # NVENC encoding
222
+ timings["video_encoding"] = self._encode_video_nvenc(result_img_path, audio_path, output_path, fps)
223
+
224
+ finally:
225
+ shutil.rmtree(temp_dir, ignore_errors=True)
226
+
227
+ timings["total"] = time.time() - total_start
228
+ timings["frames_generated"] = len(res_frame_list)
229
+ return timings
230
+
231
+ @torch.no_grad()
232
+ def generate_batch(self, avatar_name: str, audio_paths: List[str], output_dir: str, fps: int = 25) -> dict:
233
+ if not self.is_loaded:
234
+ raise RuntimeError("Models not loaded!")
235
+
236
+ batch_timings = {"videos": [], "total": 0}
237
+ total_start = time.time()
238
+
239
+ t0 = time.time()
240
+ avatar = self.load_avatar(avatar_name)
241
+ batch_timings["avatar_load"] = time.time() - t0
242
+
243
+ coord_list = avatar['coord_list']
244
+ frame_list = avatar['frame_list']
245
+ input_latent_list = avatar['latent_list']
246
+ frame_list_cycle = frame_list + frame_list[::-1]
247
+ coord_list_cycle = coord_list + coord_list[::-1]
248
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
249
+
250
+ os.makedirs(output_dir, exist_ok=True)
251
+
252
+ for idx, audio_path in enumerate(audio_paths):
253
+ video_start = time.time()
254
+ timings = {}
255
+ output_path = os.path.join(output_dir, f"{Path(audio_path).stem}.mp4")
256
+ temp_dir = tempfile.mkdtemp()
257
+
258
+ try:
259
+ t0 = time.time()
260
+ whisper_input_features, librosa_length = self.audio_processor.get_audio_feature(audio_path)
261
+ whisper_chunks = self.audio_processor.get_whisper_chunk(
262
+ whisper_input_features, self.device, self.weight_dtype, self.whisper,
263
+ librosa_length, fps=fps,
264
+ audio_padding_length_left=self.audio_padding_left,
265
+ audio_padding_length_right=self.audio_padding_right,
266
+ )
267
+ timings["whisper"] = time.time() - t0
268
+
269
+ t0 = time.time()
270
+ gen = datagen(whisper_chunks=whisper_chunks, vae_encode_latents=input_latent_list_cycle,
271
+ batch_size=self.batch_size, delay_frame=0, device=self.device)
272
+ res_frame_list = []
273
+ for whisper_batch, latent_batch in gen:
274
+ audio_feature_batch = self.pe(whisper_batch)
275
+ latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
276
+ pred_latents = self.unet.model(latent_batch, self.timesteps,
277
+ encoder_hidden_states=audio_feature_batch).sample
278
+ res_frame_list.extend(self.vae.decode_latents(pred_latents))
279
+ timings["unet"] = time.time() - t0
280
+
281
+ t0 = time.time()
282
+ result_img_path = os.path.join(temp_dir, "results")
283
+ os.makedirs(result_img_path, exist_ok=True)
284
+ for i, res_frame in enumerate(res_frame_list):
285
+ bbox = coord_list_cycle[i % len(coord_list_cycle)]
286
+ ori_frame = copy.deepcopy(frame_list_cycle[i % len(frame_list_cycle)])
287
+ x1, y1, x2, y2 = bbox
288
+ y2 = min(y2 + self.extra_margin, ori_frame.shape[0])
289
+ try:
290
+ res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
291
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2],
292
+ mode=self.parsing_mode, fp=self.fp)
293
+ cv2.imwrite(f"{result_img_path}/{str(i).zfill(8)}.png", combine_frame)
294
+ except Exception:
295
+ continue
296
+ timings["blending"] = time.time() - t0
297
+
298
+ timings["encoding"] = self._encode_video_nvenc(result_img_path, audio_path, output_path, fps)
299
+
300
+ finally:
301
+ shutil.rmtree(temp_dir, ignore_errors=True)
302
+
303
+ timings["total"] = time.time() - video_start
304
+ timings["frames"] = len(res_frame_list)
305
+ timings["output"] = output_path
306
+ batch_timings["videos"].append(timings)
307
+ print(f" [{idx+1}/{len(audio_paths)}] {Path(audio_path).stem}: {timings['total']:.2f}s")
308
+
309
+ batch_timings["total"] = time.time() - total_start
310
+ batch_timings["num_videos"] = len(audio_paths)
311
+ batch_timings["avg_per_video"] = batch_timings["total"] / len(audio_paths) if audio_paths else 0
312
+ return batch_timings
313
+
314
+
315
+ server = MuseTalkServerV3()
316
+ app = FastAPI(title="MuseTalk API v3", version="3.0.0")
317
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
318
+
319
+ @app.on_event("startup")
320
+ async def startup():
321
+ server.load_models()
322
+
323
+ @app.get("/health")
324
+ async def health():
325
+ return {"status": "ok" if server.is_loaded else "loading", "device": str(server.device),
326
+ "avatars": list(server.loaded_avatars.keys()), "nvenc": server.use_nvenc}
327
+
328
+ @app.get("/avatars")
329
+ async def list_avatars():
330
+ avatars = []
331
+ for p in server.avatar_dir.iterdir():
332
+ if p.is_dir() and (p / "metadata.pkl").exists():
333
+ with open(p / "metadata.pkl", 'rb') as f:
334
+ avatars.append(pickle.load(f))
335
+ return {"avatars": avatars}
336
+
337
+ class GenReq(BaseModel):
338
+ avatar_name: str
339
+ audio_path: str
340
+ output_path: str
341
+ fps: int = 25
342
+
343
+ @app.post("/generate/avatar")
344
+ async def generate(req: GenReq):
345
+ if not os.path.exists(req.audio_path):
346
+ raise HTTPException(404, f"Audio not found: {req.audio_path}")
347
+ try:
348
+ timings = server.generate_with_avatar(req.avatar_name, req.audio_path, req.output_path, req.fps)
349
+ return {"status": "success", "output_path": req.output_path, "timings": timings}
350
+ except Exception as e:
351
+ raise HTTPException(500, str(e))
352
+
353
+ class BatchReq(BaseModel):
354
+ avatar_name: str
355
+ audio_paths: List[str]
356
+ output_dir: str
357
+ fps: int = 25
358
+
359
+ @app.post("/generate/batch")
360
+ async def batch(req: BatchReq):
361
+ for p in req.audio_paths:
362
+ if not os.path.exists(p):
363
+ raise HTTPException(404, f"Audio not found: {p}")
364
+ try:
365
+ timings = server.generate_batch(req.avatar_name, req.audio_paths, req.output_dir, req.fps)
366
+ return {"status": "success", "output_dir": req.output_dir, "timings": timings}
367
+ except Exception as e:
368
+ raise HTTPException(500, str(e))
369
+
370
+ if __name__ == "__main__":
371
+ uvicorn.run(app, host="0.0.0.0", port=8000)
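Both server variants hardcode `use_nvenc = True`; if the local ffmpeg build was compiled without the h264_nvenc encoder, the encode command fails and no temp video is produced. A small startup probe along these lines (not part of this commit, just a sketch) would let the server fall back to libx264 automatically:

```python
# Sketch: check whether the ffmpeg on PATH exposes the h264_nvenc encoder before enabling NVENC.
import subprocess

def nvenc_available() -> bool:
    """Return True if ffmpeg lists the h264_nvenc encoder."""
    try:
        out = subprocess.run(
            ["ffmpeg", "-hide_banner", "-encoders"],
            capture_output=True, text=True, check=True,
        ).stdout
    except (OSError, subprocess.CalledProcessError):
        return False
    return "h264_nvenc" in out

# e.g. in MuseTalkServerV3.__init__():  self.use_nvenc = nvenc_available()
```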
run_inference.py ADDED
@@ -0,0 +1,276 @@
1
+ import os
2
+ import cv2
3
+ import math
4
+ import copy
5
+ import torch
6
+ import glob
7
+ import shutil
8
+ import pickle
9
+ import argparse
10
+ import numpy as np
11
+ import subprocess
12
+ from tqdm import tqdm
13
+ from omegaconf import OmegaConf
14
+ from transformers import WhisperModel
15
+ import sys
16
+
17
+ from musetalk.utils.blending import get_image
18
+ from musetalk.utils.face_parsing import FaceParsing
19
+ from musetalk.utils.audio_processor import AudioProcessor
20
+ from musetalk.utils.utils import get_file_type, get_video_fps, datagen, load_all_model
21
+ from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder
22
+
23
+ def fast_check_ffmpeg():
24
+ try:
25
+ subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
26
+ return True
27
+ except Exception:
28
+ return False
29
+
30
+ @torch.no_grad()
31
+ def main(args):
32
+ # Configure ffmpeg path
33
+ if not fast_check_ffmpeg():
34
+ print("Adding ffmpeg to PATH")
35
+ # Choose path separator based on operating system
36
+ path_separator = ';' if sys.platform == 'win32' else ':'
37
+ os.environ["PATH"] = f"{args.ffmpeg_path}{path_separator}{os.environ['PATH']}"
38
+ if not fast_check_ffmpeg():
39
+ print("Warning: Unable to find ffmpeg, please ensure ffmpeg is properly installed")
40
+
41
+ # Set computing device
42
+ device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
43
+ # Load model weights
44
+ vae, unet, pe = load_all_model(
45
+ unet_model_path=args.unet_model_path,
46
+ vae_type=args.vae_type,
47
+ unet_config=args.unet_config,
48
+ device=device
49
+ )
50
+ timesteps = torch.tensor([0], device=device)
51
+
52
+ # Convert models to half precision if float16 is enabled
53
+ if args.use_float16:
54
+ pe = pe.half()
55
+ vae.vae = vae.vae.half()
56
+ unet.model = unet.model.half()
57
+
58
+ # Move models to specified device
59
+ pe = pe.to(device)
60
+ vae.vae = vae.vae.to(device)
61
+ unet.model = unet.model.to(device)
62
+
63
+ # Initialize audio processor and Whisper model
64
+ audio_processor = AudioProcessor(feature_extractor_path=args.whisper_dir)
65
+ weight_dtype = unet.model.dtype
66
+ whisper = WhisperModel.from_pretrained(args.whisper_dir)
67
+ whisper = whisper.to(device=device, dtype=weight_dtype).eval()
68
+ whisper.requires_grad_(False)
69
+
70
+ # Initialize face parser with configurable parameters based on version
71
+ if args.version == "v15":
72
+ fp = FaceParsing(
73
+ left_cheek_width=args.left_cheek_width,
74
+ right_cheek_width=args.right_cheek_width
75
+ )
76
+ else: # v1
77
+ fp = FaceParsing()
78
+
79
+ # Load inference configuration
80
+ inference_config = OmegaConf.load(args.inference_config)
81
+ print("Loaded inference config:", inference_config)
82
+
83
+ # Process each task
84
+ for task_id in inference_config:
85
+ try:
86
+ # Get task configuration
87
+ video_path = inference_config[task_id]["video_path"]
88
+ audio_path = inference_config[task_id]["audio_path"]
89
+ if "result_name" in inference_config[task_id]:
90
+ args.output_vid_name = inference_config[task_id]["result_name"]
91
+
92
+ # Set bbox_shift based on version
93
+ if args.version == "v15":
94
+ bbox_shift = 0 # v15 uses fixed bbox_shift
95
+ else:
96
+ bbox_shift = inference_config[task_id].get("bbox_shift", args.bbox_shift) # v1 uses config or default
97
+
98
+ # Set output paths
99
+ input_basename = os.path.basename(video_path).split('.')[0]
100
+ audio_basename = os.path.basename(audio_path).split('.')[0]
101
+ output_basename = f"{input_basename}_{audio_basename}"
102
+
103
+ # Create temporary directories
104
+ temp_dir = os.path.join(args.result_dir, f"{args.version}")
105
+ os.makedirs(temp_dir, exist_ok=True)
106
+
107
+ # Set result save paths
108
+ result_img_save_path = os.path.join(temp_dir, output_basename)
109
+ crop_coord_save_path = os.path.join(args.result_dir, "../", input_basename+".pkl")
110
+ os.makedirs(result_img_save_path, exist_ok=True)
111
+
112
+ # Set output video paths
113
+ if args.output_vid_name is None:
114
+ output_vid_name = os.path.join(temp_dir, output_basename + ".mp4")
115
+ else:
116
+ output_vid_name = os.path.join(temp_dir, args.output_vid_name)
117
+ output_vid_name_concat = os.path.join(temp_dir, output_basename + "_concat.mp4")
118
+
119
+ # Extract frames from source video
120
+ if get_file_type(video_path) == "video":
121
+ save_dir_full = os.path.join(temp_dir, input_basename)
122
+ os.makedirs(save_dir_full, exist_ok=True)
123
+ cmd = f"ffmpeg -v fatal -i {video_path} -vf fps={args.fps} -start_number 0 {save_dir_full}/%08d.png" # PATCHED: extract at target fps
124
+ os.system(cmd)
125
+ input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
126
+ fps = args.fps # PATCHED: use target fps instead of video fps
127
+ elif get_file_type(video_path) == "image":
128
+ input_img_list = [video_path]
129
+ fps = args.fps
130
+ elif os.path.isdir(video_path):
131
+ input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
132
+ input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
133
+ fps = args.fps
134
+ else:
135
+ raise ValueError(f"{video_path} should be a video file, an image file or a directory of images")
136
+
137
+ # Extract audio features
138
+ whisper_input_features, librosa_length = audio_processor.get_audio_feature(audio_path)
139
+ whisper_chunks = audio_processor.get_whisper_chunk(
140
+ whisper_input_features,
141
+ device,
142
+ weight_dtype,
143
+ whisper,
144
+ librosa_length,
145
+ fps=fps,
146
+ audio_padding_length_left=args.audio_padding_length_left,
147
+ audio_padding_length_right=args.audio_padding_length_right,
148
+ )
149
+
150
+ # Preprocess input images
151
+ if os.path.exists(crop_coord_save_path) and args.use_saved_coord:
152
+ print("Using saved coordinates")
153
+ with open(crop_coord_save_path, 'rb') as f:
154
+ coord_list = pickle.load(f)
155
+ frame_list = read_imgs(input_img_list)
156
+ else:
157
+ print("Extracting landmarks... time-consuming operation")
158
+ coord_list, frame_list = get_landmark_and_bbox(input_img_list, bbox_shift)
159
+ with open(crop_coord_save_path, 'wb') as f:
160
+ pickle.dump(coord_list, f)
161
+
162
+ print(f"Number of frames: {len(frame_list)}")
163
+
164
+ # Process each frame
165
+ input_latent_list = []
166
+ for bbox, frame in zip(coord_list, frame_list):
167
+ if bbox == coord_placeholder:
168
+ continue
169
+ x1, y1, x2, y2 = bbox
170
+ if args.version == "v15":
171
+ y2 = y2 + args.extra_margin
172
+ y2 = min(y2, frame.shape[0])
173
+ crop_frame = frame[y1:y2, x1:x2]
174
+ crop_frame = cv2.resize(crop_frame, (256,256), interpolation=cv2.INTER_LANCZOS4)
175
+ latents = vae.get_latents_for_unet(crop_frame)
176
+ input_latent_list.append(latents)
177
+
178
+ # Smooth first and last frames
179
+ frame_list_cycle = frame_list + frame_list[::-1]
180
+ coord_list_cycle = coord_list + coord_list[::-1]
181
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
182
+
183
+ # Batch inference
184
+ print("Starting inference")
185
+ video_num = len(whisper_chunks)
186
+ batch_size = args.batch_size
187
+ gen = datagen(
188
+ whisper_chunks=whisper_chunks,
189
+ vae_encode_latents=input_latent_list_cycle,
190
+ batch_size=batch_size,
191
+ delay_frame=0,
192
+ device=device,
193
+ )
194
+
195
+ res_frame_list = []
196
+ total = int(np.ceil(float(video_num) / batch_size))
197
+
198
+ # Execute inference
199
+ for i, (whisper_batch, latent_batch) in enumerate(tqdm(gen, total=total)):
200
+ audio_feature_batch = pe(whisper_batch)
201
+ latent_batch = latent_batch.to(dtype=unet.model.dtype)
202
+
203
+ pred_latents = unet.model(latent_batch, timesteps, encoder_hidden_states=audio_feature_batch).sample
204
+ recon = vae.decode_latents(pred_latents)
205
+ for res_frame in recon:
206
+ res_frame_list.append(res_frame)
207
+
208
+ # Pad generated images to original video size
209
+ print("Padding generated images to original video size")
210
+ for i, res_frame in enumerate(tqdm(res_frame_list)):
211
+ bbox = coord_list_cycle[i%(len(coord_list_cycle))]
212
+ ori_frame = copy.deepcopy(frame_list_cycle[i%(len(frame_list_cycle))])
213
+ x1, y1, x2, y2 = bbox
214
+ if args.version == "v15":
215
+ y2 = y2 + args.extra_margin
216
+ y2 = min(y2, ori_frame.shape[0])
217
+ try:
218
+ res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
219
+ except Exception:
220
+ continue
221
+
222
+ # Merge results with version-specific parameters
223
+ if args.version == "v15":
224
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], mode=args.parsing_mode, fp=fp)
225
+ else:
226
+ combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], fp=fp)
227
+ cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png", combine_frame)
228
+
229
+ # Save prediction results
230
+ temp_vid_path = f"{temp_dir}/temp_{input_basename}_{audio_basename}.mp4"
231
+ cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=yuv420p -crf 18 {temp_vid_path}"
232
+ print("Video generation command:", cmd_img2video)
233
+ os.system(cmd_img2video)
234
+
235
+ cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {temp_vid_path} {output_vid_name}"
236
+ print("Audio combination command:", cmd_combine_audio)
237
+ os.system(cmd_combine_audio)
238
+
239
+ # Clean up temporary files
240
+ shutil.rmtree(result_img_save_path)
241
+ os.remove(temp_vid_path)
242
+
243
+ shutil.rmtree(save_dir_full)
244
+ if not args.saved_coord:
245
+ os.remove(crop_coord_save_path)
246
+
247
+ print(f"Results saved to {output_vid_name}")
248
+ except Exception as e:
249
+ print("Error occurred during processing:", e)
250
+
251
+ if __name__ == "__main__":
252
+ parser = argparse.ArgumentParser()
253
+ parser.add_argument("--ffmpeg_path", type=str, default="./ffmpeg-4.4-amd64-static/", help="Path to ffmpeg executable")
254
+ parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID to use")
255
+ parser.add_argument("--vae_type", type=str, default="sd-vae", help="Type of VAE model")
256
+ parser.add_argument("--unet_config", type=str, default="./models/musetalk/config.json", help="Path to UNet configuration file")
257
+ parser.add_argument("--unet_model_path", type=str, default="./models/musetalkV15/unet.pth", help="Path to UNet model weights")
258
+ parser.add_argument("--whisper_dir", type=str, default="./models/whisper", help="Directory containing Whisper model")
259
+ parser.add_argument("--inference_config", type=str, default="configs/inference/test_img.yaml", help="Path to inference configuration file")
260
+ parser.add_argument("--bbox_shift", type=int, default=0, help="Bounding box shift value")
261
+ parser.add_argument("--result_dir", default='./results', help="Directory for output results")
262
+ parser.add_argument("--extra_margin", type=int, default=10, help="Extra margin for face cropping")
263
+ parser.add_argument("--fps", type=int, default=25, help="Video frames per second")
264
+ parser.add_argument("--audio_padding_length_left", type=int, default=2, help="Left padding length for audio")
265
+ parser.add_argument("--audio_padding_length_right", type=int, default=2, help="Right padding length for audio")
266
+ parser.add_argument("--batch_size", type=int, default=8, help="Batch size for inference")
267
+ parser.add_argument("--output_vid_name", type=str, default=None, help="Name of output video file")
268
+ parser.add_argument("--use_saved_coord", action="store_true", help='Use saved coordinates to save time')
269
+ parser.add_argument("--saved_coord", action="store_true", help='Save coordinates for future use')
270
+ parser.add_argument("--use_float16", action="store_true", help="Use float16 for faster inference")
271
+ parser.add_argument("--parsing_mode", default='jaw', help="Face blending parsing mode")
272
+ parser.add_argument("--left_cheek_width", type=int, default=90, help="Width of left cheek region")
273
+ parser.add_argument("--right_cheek_width", type=int, default=90, help="Width of right cheek region")
274
+ parser.add_argument("--version", type=str, default="v15", choices=["v1", "v15"], help="Model version to use")
275
+ args = parser.parse_args()
276
+ main(args)
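run_inference.py expects the --inference_config YAML to contain one entry per task with video_path, audio_path and, optionally, result_name and bbox_shift (the latter is only honored with --version v1). A minimal sketch of such a config built with OmegaConf; the task names and media paths are illustrative only:

```python
# Illustrative config matching the fields read in main(); paths are placeholders.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "task_0": {
        "video_path": "data/video/speaker.mp4",
        "audio_path": "data/audio/question.wav",
        "result_name": "speaker_question.mp4",  # optional
    },
    "task_1": {
        "video_path": "assets/demo/face.png",
        "audio_path": "data/audio/intro.wav",
        "bbox_shift": 5,                        # only used when --version v1
    },
})
OmegaConf.save(cfg, "configs/inference/custom.yaml")
# python run_inference.py --inference_config configs/inference/custom.yaml
```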
scripts/inference.py CHANGED
@@ -213,7 +213,7 @@ def main(args):
213
  x1, y1, x2, y2 = bbox
214
  if args.version == "v15":
215
  y2 = y2 + args.extra_margin
216
- y2 = min(y2, frame.shape[0])
217
  try:
218
  res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
219
  except:
 
213
  x1, y1, x2, y2 = bbox
214
  if args.version == "v15":
215
  y2 = y2 + args.extra_margin
216
+ y2 = min(y2, ori_frame.shape[0])
217
  try:
218
  res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
219
  except:
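This one-line change is the frame-shape fix from the commit message: in the padding loop only `ori_frame` refers to the frame being composited, while `frame` is a leftover binding from the earlier latent-preparation loop, so clamping against it can leave `y2` below the bottom of `ori_frame` and the resized mouth patch no longer matches the region it is pasted into. A small sketch of the failure mode, with made-up shapes:

```python
# Illustrative only: made-up shapes showing why the clamp must use ori_frame.
import cv2
import numpy as np

ori_frame = np.zeros((720, 1280, 3), np.uint8)   # frame being composited
frame = np.zeros((1080, 1920, 3), np.uint8)      # stale binding from a previous loop
x1, y1, x2, y2 = 500, 300, 756, 715              # face bbox near the bottom edge
res_frame = np.zeros((256, 256, 3), np.uint8)    # generated mouth crop
extra_margin = 10

y2_bad = min(y2 + extra_margin, frame.shape[0])      # 725: past ori_frame's bottom
y2_ok = min(y2 + extra_margin, ori_frame.shape[0])   # 720: clamped correctly

resized = cv2.resize(res_frame, (x2 - x1, y2_bad - y1))
print(resized.shape)                       # (425, 256, 3)
print(ori_frame[y1:y2_bad, x1:x2].shape)   # (420, 256, 3) -> pasting 'resized' here fails
print(ori_frame[y1:y2_ok, x1:x2].shape)    # (420, 256, 3) matches a correctly sized resize
```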
server.py ADDED
@@ -0,0 +1,607 @@
1
+ """
2
+ MuseTalk Real-Time Server
3
+ FastAPI server for real-time lip-sync
4
+ """
5
+ import os
6
+ import sys
7
+ import io
8
+ import time
9
+ import json
10
+ import uuid
11
+ import queue
12
+ import pickle
13
+ import shutil
14
+ import asyncio
15
+ import threading
16
+ from pathlib import Path
17
+ from typing import Optional
18
+ import tempfile
19
+
20
+ import cv2
21
+ import glob
22
+ import copy
23
+ import torch
24
+ import numpy as np
25
+ from tqdm import tqdm
26
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, BackgroundTasks
27
+ from fastapi.responses import FileResponse, StreamingResponse, JSONResponse
28
+ from pydantic import BaseModel
29
+ import uvicorn
30
+
31
+ # Suppress warnings
32
+ import warnings
33
+ warnings.filterwarnings("ignore")
34
+
35
+ # MuseTalk imports
36
+ from musetalk.utils.utils import datagen, load_all_model
37
+ from musetalk.utils.blending import get_image_prepare_material, get_image_blending
38
+ from musetalk.utils.audio_processor import AudioProcessor
39
+ from musetalk.utils.preprocessing_simple import get_landmark_and_bbox, read_imgs
40
+ from transformers import WhisperModel
41
+
42
+ app = FastAPI(title="MuseTalk Real-Time Server", version="1.5")
43
+
44
+ # Global model instances
45
+ models = {}
46
+ avatars = {}
47
+
48
+ class AvatarConfig(BaseModel):
49
+ avatar_id: str
50
+ video_path: str
51
+ bbox_shift: int = 0
52
+
53
+ class InferenceRequest(BaseModel):
54
+ avatar_id: str
55
+ fps: int = 25
56
+
57
+ def video2imgs(vid_path, save_path):
58
+ """Extract frames from video"""
59
+ cap = cv2.VideoCapture(vid_path)
60
+ count = 0
61
+ while True:
62
+ ret, frame = cap.read()
63
+ if ret:
64
+ cv2.imwrite(f"{save_path}/{count:08d}.png", frame)
65
+ count += 1
66
+ else:
67
+ break
68
+ cap.release()
69
+ return count
70
+
71
+
72
+ @app.on_event("startup")
73
+ async def load_models():
74
+ """Load all models at startup"""
75
+ global models
76
+
77
+ print("Loading MuseTalk models...")
78
+ # Force CPU if FORCE_CPU env var is set or if CUDA kernels are incompatible
79
+ force_cpu = os.environ.get("FORCE_CPU", "0") == "1"
80
+ if force_cpu or not torch.cuda.is_available():
81
+ device = torch.device("cpu")
82
+ else:
83
+ try:
84
+ # Test if CUDA kernels work for this GPU
85
+ test_tensor = torch.zeros(1).cuda()
86
+ _ = test_tensor.half()
87
+ device = torch.device("cuda:0")
88
+ except RuntimeError as e:
89
+ print(f"CUDA kernel test failed: {e}")
90
+ print("Falling back to CPU...")
91
+ device = torch.device("cpu")
92
+ print(f"Using device: {device}")
93
+
94
+ # Model paths
95
+ unet_model_path = "./models/musetalkV15/unet.pth"
96
+ unet_config = "./models/musetalkV15/musetalk.json"
97
+ whisper_dir = "./models/whisper"
98
+ vae_type = "sd-vae"
99
+
100
+ # Load models
101
+ vae, unet, pe = load_all_model(
102
+ unet_model_path=unet_model_path,
103
+ vae_type=vae_type,
104
+ unet_config=unet_config,
105
+ device=device
106
+ )
107
+
108
+ # Move to device, use half precision only for GPU
109
+ if device.type == "cuda":
110
+ pe = pe.half().to(device)
111
+ vae.vae = vae.vae.half().to(device)
112
+ unet.model = unet.model.half().to(device)
113
+ else:
114
+ pe = pe.to(device)
115
+ vae.vae = vae.vae.to(device)
116
+ unet.model = unet.model.to(device)
117
+
118
+ # Load whisper
119
+ audio_processor = AudioProcessor(feature_extractor_path=whisper_dir)
120
+ whisper = WhisperModel.from_pretrained(whisper_dir)
121
+ weight_dtype = unet.model.dtype if device.type == "cuda" else torch.float32
122
+ whisper = whisper.to(device=device, dtype=weight_dtype).eval()
123
+ whisper.requires_grad_(False)
124
+
125
+ # Initialize face parser
126
+ from musetalk.utils.face_parsing import FaceParsing
127
+ fp = FaceParsing(left_cheek_width=90, right_cheek_width=90)
128
+
129
+ timesteps = torch.tensor([0], device=device)
130
+
131
+ models = {
132
+ "vae": vae,
133
+ "unet": unet,
134
+ "pe": pe,
135
+ "whisper": whisper,
136
+ "audio_processor": audio_processor,
137
+ "fp": fp,
138
+ "device": device,
139
+ "timesteps": timesteps,
140
+ "weight_dtype": weight_dtype
141
+ }
142
+
143
+ print("Models loaded successfully!")
144
+
145
+ @app.get("/")
146
+ async def root():
147
+ return {"status": "ok", "message": "MuseTalk Real-Time Server"}
148
+
149
+ @app.get("/health")
150
+ async def health():
151
+ return {
152
+ "status": "healthy",
153
+ "models_loaded": len(models) > 0,
154
+ "avatars_count": len(avatars),
155
+ "gpu_available": torch.cuda.is_available()
156
+ }
157
+
158
+ @app.post("/avatar/prepare")
159
+ async def prepare_avatar(
160
+ avatar_id: str = Form(...),
161
+ video: UploadFile = File(...),
162
+ bbox_shift: int = Form(0, description="Adjusts mouth openness: positive = more open, negative = less open (-9 to 9)"),
163
+ extra_margin: int = Form(10, description="Extra margin for jaw movement"),
164
+ parsing_mode: str = Form("jaw", description="Parsing mode: 'jaw' (v1.5) or 'raw' (v1.0)"),
165
+ left_cheek_width: int = Form(90, description="Left cheek region width"),
166
+ right_cheek_width: int = Form(90, description="Right cheek region width")
167
+ ):
168
+ """Prepare an avatar from video for real-time inference"""
169
+ global avatars
170
+
171
+ if not models:
172
+ raise HTTPException(status_code=503, detail="Models not loaded")
173
+
174
+ # Save uploaded video
175
+ avatar_path = f"./results/v15/avatars/{avatar_id}"
176
+ full_imgs_path = f"{avatar_path}/full_imgs"
177
+ mask_out_path = f"{avatar_path}/mask"
178
+
179
+ os.makedirs(avatar_path, exist_ok=True)
180
+ os.makedirs(full_imgs_path, exist_ok=True)
181
+ os.makedirs(mask_out_path, exist_ok=True)
182
+
183
+ # Save video
184
+ video_path = f"{avatar_path}/source_video{Path(video.filename).suffix}"
185
+ with open(video_path, "wb") as f:
186
+ content = await video.read()
187
+ f.write(content)
188
+
189
+ # Extract frames
190
+ print(f"Extracting frames from video...")
191
+ frame_count = video2imgs(video_path, full_imgs_path)
192
+ print(f"Extracted {frame_count} frames")
193
+
194
+ input_img_list = sorted(glob.glob(os.path.join(full_imgs_path, '*.[jpJP][pnPN]*[gG]')))
195
+
196
+ print("Extracting landmarks...")
197
+ # bbox_shift controls mouth openness: positive=more open, negative=less open
198
+ coord_list_raw, frame_list_raw = get_landmark_and_bbox(input_img_list, upperbondrange=bbox_shift)
199
+
200
+ # Generate latents - filter out frames without detected faces
201
+ input_latent_list = []
202
+ valid_coord_list = []
203
+ valid_frame_list = []
204
+ coord_placeholder = (0.0, 0.0, 0.0, 0.0)
205
+
206
+ vae = models["vae"]
207
+
208
+ # Create FaceParsing with custom cheek widths for this avatar
209
+ from musetalk.utils.face_parsing import FaceParsing
210
+ fp_avatar = FaceParsing(left_cheek_width=left_cheek_width, right_cheek_width=right_cheek_width)
211
+
212
+ for bbox, frame in zip(coord_list_raw, frame_list_raw):
213
+ if bbox == coord_placeholder:
214
+ continue
215
+ x1, y1, x2, y2 = bbox
216
+ # Validate bbox dimensions
217
+ if x2 <= x1 or y2 <= y1:
218
+ continue
219
+ # Add extra margin for jaw movement (v1.5 feature)
220
+ y2 = min(y2 + extra_margin, frame.shape[0])
221
+
222
+ # Store valid frame and coordinates
223
+ valid_coord_list.append([x1, y1, x2, y2])
224
+ valid_frame_list.append(frame)
225
+
226
+ crop_frame = frame[y1:y2, x1:x2]
227
+ if crop_frame.size == 0:
228
+ valid_coord_list.pop()
229
+ valid_frame_list.pop()
230
+ continue
231
+ resized_crop_frame = cv2.resize(crop_frame, (256, 256), interpolation=cv2.INTER_LANCZOS4)
232
+ latents = vae.get_latents_for_unet(resized_crop_frame)
233
+ input_latent_list.append(latents)
234
+
235
+ print(f"Valid frames with detected faces: {len(valid_frame_list)}/{len(frame_list_raw)}")
236
+
237
+ if len(valid_frame_list) == 0:
238
+ raise HTTPException(status_code=400, detail="No faces detected in video. Please use a video with a clear frontal face.")
239
+
240
+ # Create cycles from valid frames only
241
+ frame_list_cycle = valid_frame_list + valid_frame_list[::-1]
242
+ coord_list_cycle = valid_coord_list + valid_coord_list[::-1]
243
+ input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
244
+
245
+ # Generate masks
246
+ mask_list_cycle = []
247
+ mask_coords_list_cycle = []
248
+
249
+ print(f"Generating masks with mode={parsing_mode}...")
250
+ for i, frame in enumerate(tqdm(frame_list_cycle)):
251
+ x1, y1, x2, y2 = coord_list_cycle[i]
252
+ mask, crop_box = get_image_prepare_material(frame, [x1, y1, x2, y2], fp=fp_avatar, mode=parsing_mode)
253
+ cv2.imwrite(f"{mask_out_path}/{str(i).zfill(8)}.png", mask)
254
+ mask_coords_list_cycle.append(crop_box)
255
+ mask_list_cycle.append(mask)
256
+
257
+ # Save preprocessed data
258
+ with open(f"{avatar_path}/coords.pkl", 'wb') as f:
259
+ pickle.dump(coord_list_cycle, f)
260
+
261
+ with open(f"{avatar_path}/mask_coords.pkl", 'wb') as f:
262
+ pickle.dump(mask_coords_list_cycle, f)
263
+
264
+ # Save quality settings
265
+ quality_settings = {
266
+ "bbox_shift": bbox_shift,
267
+ "extra_margin": extra_margin,
268
+ "parsing_mode": parsing_mode,
269
+ "left_cheek_width": left_cheek_width,
270
+ "right_cheek_width": right_cheek_width
271
+ }
272
+ with open(f"{avatar_path}/quality_settings.json", 'w') as f:
273
+ json.dump(quality_settings, f)
274
+
275
+ torch.save(input_latent_list_cycle, f"{avatar_path}/latents.pt")
276
+
277
+ # Store in memory - keep latents on CPU to save GPU memory
278
+ input_latent_list_cpu = [lat.cpu() for lat in input_latent_list_cycle]
279
+
280
+ avatars[avatar_id] = {
281
+ "path": avatar_path,
282
+ "frame_list_cycle": frame_list_cycle,
283
+ "coord_list_cycle": coord_list_cycle,
284
+ "input_latent_list_cycle": input_latent_list_cpu,
285
+ "mask_list_cycle": mask_list_cycle,
286
+ "mask_coords_list_cycle": mask_coords_list_cycle,
287
+ "quality_settings": quality_settings
288
+ }
289
+
290
+ # Clear GPU cache after preparation
291
+ import gc
292
+ gc.collect()
293
+ torch.cuda.empty_cache()
294
+
295
+ return {
296
+ "status": "success",
297
+ "avatar_id": avatar_id,
298
+ "frame_count": len(frame_list_cycle),
299
+ "quality_settings": quality_settings
300
+ }
301
+
302
+ @app.post("/avatar/load/{avatar_id}")
303
+ async def load_avatar(avatar_id: str):
304
+ """Load a previously prepared avatar"""
305
+ global avatars
306
+
307
+ avatar_path = f"./results/v15/avatars/{avatar_id}"
308
+
309
+ if not os.path.exists(avatar_path):
310
+ raise HTTPException(status_code=404, detail=f"Avatar {avatar_id} not found")
311
+
312
+ full_imgs_path = f"{avatar_path}/full_imgs"
313
+ mask_out_path = f"{avatar_path}/mask"
314
+
315
+ # Load preprocessed data
316
+ input_latent_list_cycle = torch.load(f"{avatar_path}/latents.pt")
317
+
318
+ with open(f"{avatar_path}/coords.pkl", 'rb') as f:
319
+ coord_list_cycle = pickle.load(f)
320
+
321
+ with open(f"{avatar_path}/mask_coords.pkl", 'rb') as f:
322
+ mask_coords_list_cycle = pickle.load(f)
323
+
324
+ # Load quality settings (with defaults for backwards compatibility)
325
+ quality_settings_path = f"{avatar_path}/quality_settings.json"
326
+ if os.path.exists(quality_settings_path):
327
+ with open(quality_settings_path, 'r') as f:
328
+ quality_settings = json.load(f)
329
+ else:
330
+ quality_settings = {
331
+ "bbox_shift": 0,
332
+ "extra_margin": 10,
333
+ "parsing_mode": "jaw",
334
+ "left_cheek_width": 90,
335
+ "right_cheek_width": 90
336
+ }
337
+
338
+ # Load frames
339
+ input_img_list = sorted(glob.glob(os.path.join(full_imgs_path, '*.[jpJP][pnPN]*[gG]')))
340
+ frame_list_cycle = read_imgs(input_img_list)
341
+
342
+ # Load masks
343
+ input_mask_list = sorted(glob.glob(os.path.join(mask_out_path, '*.[jpJP][pnPN]*[gG]')))
344
+ mask_list_cycle = read_imgs(input_mask_list)
345
+
346
+ # Keep latents on CPU to save GPU memory
347
+ input_latent_list_cpu = [lat.cpu() if hasattr(lat, 'cpu') else lat for lat in input_latent_list_cycle]
348
+
349
+ avatars[avatar_id] = {
350
+ "path": avatar_path,
351
+ "frame_list_cycle": frame_list_cycle,
352
+ "coord_list_cycle": coord_list_cycle,
353
+ "input_latent_list_cycle": input_latent_list_cpu,
354
+ "mask_list_cycle": mask_list_cycle,
355
+ "mask_coords_list_cycle": mask_coords_list_cycle,
356
+ "quality_settings": quality_settings
357
+ }
358
+
359
+ # Clear GPU cache
360
+ import gc
361
+ gc.collect()
362
+ torch.cuda.empty_cache()
363
+
364
+ return {
365
+ "status": "success",
366
+ "avatar_id": avatar_id,
367
+ "frame_count": len(frame_list_cycle),
368
+ "quality_settings": quality_settings
369
+ }
370
+
371
+ @app.get("/avatars")
372
+ async def list_avatars():
373
+ """List all available avatars"""
374
+ avatar_dir = "./results/v15/avatars"
375
+ if not os.path.exists(avatar_dir):
376
+ return {"avatars": [], "loaded": list(avatars.keys())}
377
+
378
+ available = [d for d in os.listdir(avatar_dir) if os.path.isdir(os.path.join(avatar_dir, d))]
379
+ return {"avatars": available, "loaded": list(avatars.keys())}
380
+
381
+ @app.post("/inference")
382
+ async def inference(
383
+ avatar_id: str = Form(...),
384
+ audio: UploadFile = File(...),
385
+ fps: int = Form(25)
386
+ ):
387
+ """Run inference with uploaded audio and return video"""
388
+
389
+ if avatar_id not in avatars:
390
+ raise HTTPException(status_code=404, detail=f"Avatar {avatar_id} not loaded. Use /avatar/load first")
391
+
392
+ if not models:
393
+ raise HTTPException(status_code=503, detail="Models not loaded")
394
+
395
+     avatar = avatars[avatar_id]
+     device = models["device"]
+
+     # Save audio temporarily
+     with tempfile.NamedTemporaryFile(suffix=Path(audio.filename).suffix, delete=False) as tmp:
+         content = await audio.read()
+         tmp.write(content)
+         audio_path = tmp.name
+
+     try:
+         start_time = time.time()
+
+         # Extract audio features
+         audio_processor = models["audio_processor"]
+         whisper = models["whisper"]
+         weight_dtype = models["weight_dtype"]
+
+         whisper_input_features, librosa_length = audio_processor.get_audio_feature(
+             audio_path, weight_dtype=weight_dtype
+         )
+         whisper_chunks = audio_processor.get_whisper_chunk(
+             whisper_input_features,
+             device,
+             weight_dtype,
+             whisper,
+             librosa_length,
+             fps=fps,
+             audio_padding_length_left=2,
+             audio_padding_length_right=2,
+         )
+
+         print(f"Audio processing: {(time.time() - start_time)*1000:.0f}ms")
+
+         # Inference
+         vae = models["vae"]
+         unet = models["unet"]
+         pe = models["pe"]
+         timesteps = models["timesteps"]
+
+         video_num = len(whisper_chunks)
+         batch_size = 4  # Reduced batch size to save GPU memory
+
+         gen = datagen(whisper_chunks, avatar["input_latent_list_cycle"], batch_size)
+
+         result_frames = []
+         inference_start = time.time()
+
+         for i, (whisper_batch, latent_batch) in enumerate(gen):
+             audio_feature_batch = pe(whisper_batch.to(device))
+             latent_batch = latent_batch.to(device=device, dtype=unet.model.dtype)
+
+             pred_latents = unet.model(
+                 latent_batch,
+                 timesteps,
+                 encoder_hidden_states=audio_feature_batch
+             ).sample
+
+             pred_latents = pred_latents.to(device=device, dtype=vae.vae.dtype)
+             recon = vae.decode_latents(pred_latents)
+
+             for idx_in_batch, res_frame in enumerate(recon):
+                 frame_idx = i * batch_size + idx_in_batch
+                 if frame_idx >= video_num:
+                     break
+
+                 bbox = avatar["coord_list_cycle"][frame_idx % len(avatar["coord_list_cycle"])]
+                 ori_frame = copy.deepcopy(avatar["frame_list_cycle"][frame_idx % len(avatar["frame_list_cycle"])])
+                 x1, y1, x2, y2 = bbox
+
+                 res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
+                 mask = avatar["mask_list_cycle"][frame_idx % len(avatar["mask_list_cycle"])]
+                 mask_crop_box = avatar["mask_coords_list_cycle"][frame_idx % len(avatar["mask_coords_list_cycle"])]
+
+                 combine_frame = get_image_blending(ori_frame, res_frame, bbox, mask, mask_crop_box)
+                 result_frames.append(combine_frame)
+
+         print(f"Inference: {(time.time() - inference_start)*1000:.0f}ms for {video_num} frames")
+         print(f"FPS: {video_num / (time.time() - inference_start):.1f}")
+
+         # Create video
+         output_path = tempfile.mktemp(suffix=".mp4")
+         h, w = result_frames[0].shape[:2]
+
+         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+         out = cv2.VideoWriter(output_path, fourcc, fps, (w, h))
+
+         for frame in result_frames:
+             out.write(frame)
+         out.release()
+
+         # Combine with audio using ffmpeg
+         final_output = tempfile.mktemp(suffix=".mp4")
+         os.system(f"ffmpeg -y -v warning -i {audio_path} -i {output_path} -c:v libx264 -c:a aac {final_output}")
+
+         os.unlink(output_path)
+         os.unlink(audio_path)
+
+         total_time = time.time() - start_time
+         print(f"Total time: {total_time*1000:.0f}ms")
+
+         return FileResponse(
+             final_output,
+             media_type="video/mp4",
+             filename=f"output_{avatar_id}.mp4",
+             headers={"X-Processing-Time": f"{total_time:.2f}s"}
+         )
+
+     except Exception as e:
+         if os.path.exists(audio_path):
+             os.unlink(audio_path)
+         raise HTTPException(status_code=500, detail=str(e))
+
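Editor's note on the mux step above: `tempfile.mktemp` is deprecated, and the `os.system` f-string will break on paths containing spaces or shell metacharacters. Below is a minimal sketch of a safer equivalent, assuming the same `audio_path` and silent-video `output_path` produced by the handler above; the helper name is illustrative and not part of this commit.

```python
import os
import subprocess
import tempfile


def mux_audio_video(audio_path: str, video_path: str) -> str:
    """Mux the input audio with the generated silent video into a new temp .mp4."""
    fd, final_output = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)  # ffmpeg reopens the path itself; -y lets it overwrite the empty file
    subprocess.run(
        [
            "ffmpeg", "-y", "-v", "warning",
            "-i", audio_path,
            "-i", video_path,
            "-c:v", "libx264",
            "-c:a", "aac",
            final_output,
        ],
        check=True,  # surface ffmpeg failures instead of ignoring the exit code
    )
    return final_output
```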
+ @app.post("/inference/frames")
+ async def inference_frames(
+     avatar_id: str = Form(...),
+     audio: UploadFile = File(...),
+     fps: int = Form(25)
+ ):
+     """Run inference and return frames as JSON (for streaming)"""
+
+     if avatar_id not in avatars:
+         raise HTTPException(status_code=404, detail=f"Avatar {avatar_id} not loaded")
+
+     avatar = avatars[avatar_id]
+     device = models["device"]
+
+     # Save audio temporarily
+     with tempfile.NamedTemporaryFile(suffix=Path(audio.filename).suffix, delete=False) as tmp:
+         content = await audio.read()
+         tmp.write(content)
+         audio_path = tmp.name
+
+     try:
+         # Extract audio features
+         audio_processor = models["audio_processor"]
+         whisper = models["whisper"]
+         weight_dtype = models["weight_dtype"]
+
+         whisper_input_features, librosa_length = audio_processor.get_audio_feature(
+             audio_path, weight_dtype=weight_dtype
+         )
+         whisper_chunks = audio_processor.get_whisper_chunk(
+             whisper_input_features,
+             device,
+             weight_dtype,
+             whisper,
+             librosa_length,
+             fps=fps,
+         )
+
+         # Inference
+         vae = models["vae"]
+         unet = models["unet"]
+         pe = models["pe"]
+         timesteps = models["timesteps"]
+
+         video_num = len(whisper_chunks)
+         batch_size = 4  # Reduced batch size to save GPU memory
+
+         gen = datagen(whisper_chunks, avatar["input_latent_list_cycle"], batch_size)
+
+         frames_data = []
+         import base64  # used to serialize JPEG frames in the loop below
+
+         for i, (whisper_batch, latent_batch) in enumerate(gen):
+             audio_feature_batch = pe(whisper_batch.to(device))
+             latent_batch = latent_batch.to(device=device, dtype=unet.model.dtype)
+
+             pred_latents = unet.model(
+                 latent_batch,
+                 timesteps,
+                 encoder_hidden_states=audio_feature_batch
+             ).sample
+
+             pred_latents = pred_latents.to(device=device, dtype=vae.vae.dtype)
+             recon = vae.decode_latents(pred_latents)
+
+             for idx_in_batch, res_frame in enumerate(recon):
+                 frame_idx = i * batch_size + idx_in_batch
+                 if frame_idx >= video_num:
+                     break
+
+                 bbox = avatar["coord_list_cycle"][frame_idx % len(avatar["coord_list_cycle"])]
+                 ori_frame = copy.deepcopy(avatar["frame_list_cycle"][frame_idx % len(avatar["frame_list_cycle"])])
+                 x1, y1, x2, y2 = bbox
+
+                 res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
+                 mask = avatar["mask_list_cycle"][frame_idx % len(avatar["mask_list_cycle"])]
+                 mask_crop_box = avatar["mask_coords_list_cycle"][frame_idx % len(avatar["mask_coords_list_cycle"])]
+
+                 combine_frame = get_image_blending(ori_frame, res_frame, bbox, mask, mask_crop_box)
+
+                 # Encode frame as JPEG
+                 _, buffer = cv2.imencode('.jpg', combine_frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
+                 frame_b64 = base64.b64encode(buffer).decode('utf-8')
+                 frames_data.append(frame_b64)
+
+         os.unlink(audio_path)
+
+         return {
+             "frames": frames_data,
+             "fps": fps,
+             "total_frames": len(frames_data)
+         }
+
+     except Exception as e:
+         if os.path.exists(audio_path):
+             os.unlink(audio_path)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
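For reference, a minimal client sketch for the `/inference/frames` endpoint above (not part of the commit). It assumes the server is reachable at http://localhost:8000, and `mariana_hd` / `my_audio.wav` are placeholder values.

```python
import base64

import cv2
import numpy as np
import requests

# Post an audio file and an avatar id to the frames endpoint (multipart form).
with open("my_audio.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/inference/frames",
        data={"avatar_id": "mariana_hd", "fps": 25},
        files={"audio": ("my_audio.wav", f, "audio/wav")},
    )
resp.raise_for_status()
payload = resp.json()

# Each entry in payload["frames"] is a base64-encoded JPEG; decode to BGR arrays.
frames = [
    cv2.imdecode(np.frombuffer(base64.b64decode(b64), dtype=np.uint8), cv2.IMREAD_COLOR)
    for b64 in payload["frames"]
]
print(f"Received {payload['total_frames']} frames at {payload['fps']} fps")
```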
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)
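If the decoded frames from the client sketch above need to be turned back into a playable file, the same `cv2.VideoWriter` pattern used server-side applies; a short hedged sketch follows (the output filename is a placeholder, and `frames` / `payload` come from the previous sketch). Note that this client-side file has no audio track; the audio would need to be muxed in separately, for example with the ffmpeg sketch shown earlier.

```python
import cv2

# `frames` and `payload` come from the client sketch above.
h, w = frames[0].shape[:2]
writer = cv2.VideoWriter(
    "avatar_output.mp4",
    cv2.VideoWriter_fourcc(*"mp4v"),
    payload["fps"],
    (w, h),
)
for frame in frames:
    writer.write(frame)
writer.release()
```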