SkyReels_B

Paused

App Files Files Community

1inkusFace committed on Mar 6

Commit

8f1996d

verified ·

1 Parent(s): 03ff4c9

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -40

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# app.py
 import spaces
 import gradio as gr
 import argparse
@@ -9,17 +8,17 @@ import subprocess
 from PIL import Image
 import numpy as np
-subprocess.run(['sh', './sky.sh'])
-sys.path.append("./SkyReels-V1")
-from skyreelsinfer import TaskType
-from skyreelsinfer.offload import OffloadConfig
-from skyreelsinfer.skyreels_video_infer import SkyReelsVideoSingleGpuInfer
 from diffusers.utils import export_to_video
 import torch
 import logging
-from collections import OrderedDict  # Import OrderedDict here
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
@@ -31,32 +30,46 @@ torch.set_float32_matmul_precision("highest")
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 logger = logging.getLogger(__name__)
 # --- Dummy Classes (Keep for standalone execution) ---
 class OffloadConfig:
-    def __init__(self, high_cpu_memory=False, parameters_level=False, compiler_transformer=False, compiler_cache=""):
         self.high_cpu_memory = high_cpu_memory
         self.parameters_level = parameters_level
         self.compiler_transformer = compiler_transformer
         self.compiler_cache = compiler_cache
-class TaskType: #Keep here for infer
     T2V = 0
     I2V = 1
 class LlamaModel:
     """Dummy stand-in for a text-encoder model class; it holds no state."""
     @staticmethod
     def from_pretrained(*args, **kwargs):
         """Ignore all arguments and return a new dummy instance."""
         stub = LlamaModel()
         return stub
     def to(self, device):
         """Accept a device argument and return this same instance unchanged."""
         return self
 class HunyuanVideoTransformer3DModel:
     """Dummy stand-in exposing only ``from_pretrained`` and ``to``; it holds no state."""
     @staticmethod
     def from_pretrained(*args, **kwargs):
         """Ignore all arguments and return a new dummy instance."""
         stub = HunyuanVideoTransformer3DModel()
         return stub
     def to(self, device):
         """Accept a device argument and return this same instance unchanged."""
         return self
 class SkyreelsVideoPipeline:
     @staticmethod
     def from_pretrained(*args, **kwargs):
@@ -76,36 +89,45 @@ class SkyreelsVideoPipeline:
             image_tensor = torch.from_numpy(np.array(image)).float() / 255.0
             image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0)  # (H, W, C) -> (1, C, H, W)
-             # Create video by repeating the image and adding noise
             frames = image_tensor.repeat(1, 1, num_frames, 1, 1)  # (1, C, T, H, W)
-            frames = frames + torch.randn_like(frames) * 0.05 # Add a little noise.
         else:  # T2V
-            frames = torch.randn(1, 3, num_frames, height, width) # Use correct dims
-        return type('obj', (object,), {'frames' : frames})() # No longer a list!
     def __init__(self):
-      super().__init__()
-      self._modules = OrderedDict()
-      self.vae = self.VAE()
-      self._modules["vae"] = self.vae
     def named_children(self):
-      return self._modules.items()
     class VAE:
         def enable_tiling(self):
             pass
 def quantize_(*args, **kwargs):
     """Dummy no-op: accept any arguments and return ``None``."""
     return None
 def float8_weight_only():
     """Dummy no-op: take no arguments and return ``None``."""
     return None
 # --- End Dummy Classes ---
 class SkyReelsVideoSingleGpuInfer:
-    def _load_model(self, model_id: str, base_model_id: str = "hunyuanvideo-community/HunyuanVideo", quant_model: bool = True):
         logger.info(f"load model model_id:{model_id} quan_model:{quant_model}")
         text_encoder = LlamaModel.from_pretrained(
             base_model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
@@ -160,7 +182,7 @@ class SkyReelsVideoSingleGpuInfer:
         self.pipe = self._load_model(model_id=self.model_id, quant_model=self.quant_model)
         if self.is_offload:
-          pass
         else:
             self.pipe.to(self.gpu_device)
@@ -176,10 +198,10 @@ class SkyReelsVideoSingleGpuInfer:
         self.is_initialized = True
     def warm_up(self):
-      if not self.is_initialized:
-          raise RuntimeError("Model must be initialized before warm-up.")
-      init_kwargs = {
             "prompt": "A woman is dancing in a room",
             "height": 544,
             "width": 960,
@@ -190,26 +212,38 @@ class SkyReelsVideoSingleGpuInfer:
             "generator": torch.Generator(self.gpu_device).manual_seed(42),
             "embedded_guidance_scale": 1.0,
         }
-      if self.task_type == TaskType.I2V:
-        init_kwargs["image"] = Image.new("RGB",(544,960), color="black")
-      self.pipe(**init_kwargs)
-      logger.info("Warm-up complete.")
     def infer(self, **kwargs):
         """Handles inference requests."""
         if not self.is_initialized:
-          self.initialize()
         if "seed" in kwargs:
             kwargs["generator"] = torch.Generator(self.gpu_device).manual_seed(kwargs["seed"])
             del kwargs["seed"]
         assert (self.task_type == TaskType.I2V and "image" in kwargs) or self.task_type == TaskType.T2V
-        result = self.pipe(**kwargs).frames # Return the tensor directly
         return result
 _predictor = None
 @spaces.GPU(duration=90)
-def generate_video(prompt, seed, image=None):
     global _predictor
     if seed == -1:
@@ -232,9 +266,6 @@ def generate_video(prompt, seed, image=None):
     else:
         task_type = TaskType.I2V
         model_id = "Skywork/SkyReels-V1-Hunyuan-I2V"
-        seed = 43
-        #generator = torch.Generator(device="cuda").manual_seed(seed)
         kwargs = {
             "prompt": prompt,
             "image": Image.open(image),
@@ -243,11 +274,10 @@ def generate_video(prompt, seed, image=None):
             "num_frames": 97,
             "num_inference_steps": 30,
             "seed": seed,
-            #"generator": generator,
             "guidance_scale": 6.0,
             "embedded_guidance_scale": 1.0,
             "negative_prompt": "Aerial view, low quality, bad hands",
-            "cfg_for": False,
         }
     if _predictor is None:
@@ -264,12 +294,13 @@ def generate_video(prompt, seed, image=None):
         )
         _predictor.initialize()
         logger.info("Predictor initialized")
-    out_samples = []
     with torch.no_grad():
-        output = _predictor.infer(**kwargs)
-        #out_samples.extend(output.frames[0])
-    output = (output.cpu().numpy() * 255).astype(np.uint8)
-    output = output.transpose(0, 2, 3, 4, 1)
     save_dir = f"./result"
     os.makedirs(save_dir, exist_ok=True)
@@ -278,6 +309,7 @@ def generate_video(prompt, seed, image=None):
     export_to_video(output, video_out_file, fps=24)
     return video_out_file, kwargs
 def create_gradio_interface():
     with gr.Blocks() as demo:
         with gr.Row():

 import spaces
 import gradio as gr
 import argparse
 from PIL import Image
 import numpy as np
+# subprocess.run(['sh', './sky.sh'])  # Removed as it's likely environment-specific
+# sys.path.append("./SkyReels-V1") # Removed as it's likely environment-specific
+# from skyreelsinfer import TaskType  # Dummy classes cover this
+# from skyreelsinfer.offload import OffloadConfig # Dummy classes cover this
+# from skyreelsinfer.skyreels_video_infer import SkyReelsVideoSingleGpuInfer # Dummy classes cover this
 from diffusers.utils import export_to_video
 import torch
 import logging
+from collections import OrderedDict
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 logger = logging.getLogger(__name__)
 # --- Dummy Classes (Keep for standalone execution) ---
 class OffloadConfig:
+    def __init__(
+        self,
+        high_cpu_memory: bool = False,
+        parameters_level: bool = False,
+        compiler_transformer: bool = False,
+        compiler_cache: str = "",
+    ):
         self.high_cpu_memory = high_cpu_memory
         self.parameters_level = parameters_level
         self.compiler_transformer = compiler_transformer
         self.compiler_cache = compiler_cache
+class TaskType:  # Keep here for infer
     T2V = 0
     I2V = 1
 class LlamaModel:
     """Dummy stand-in for a text-encoder model class; it holds no state."""
     @staticmethod
     def from_pretrained(*args, **kwargs):
         """Ignore all arguments and return a new dummy instance."""
         stub = LlamaModel()
         return stub
     def to(self, device):
         """Accept a device argument and return this same instance unchanged."""
         return self
 class HunyuanVideoTransformer3DModel:
     """Dummy stand-in exposing only ``from_pretrained`` and ``to``; it holds no state."""
     @staticmethod
     def from_pretrained(*args, **kwargs):
         """Ignore all arguments and return a new dummy instance."""
         stub = HunyuanVideoTransformer3DModel()
         return stub
     def to(self, device):
         """Accept a device argument and return this same instance unchanged."""
         return self
 class SkyreelsVideoPipeline:
     @staticmethod
     def from_pretrained(*args, **kwargs):
             image_tensor = torch.from_numpy(np.array(image)).float() / 255.0
             image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0)  # (H, W, C) -> (1, C, H, W)
+            # Create video by repeating the image
             frames = image_tensor.repeat(1, 1, num_frames, 1, 1)  # (1, C, T, H, W)
+            frames = frames + torch.randn_like(frames) * 0.05  # Add a little noise
+            frames = frames.permute(0, 2, 1, 3, 4) #Change to 1,T,C,H,W
         else:  # T2V
+            frames = torch.randn(1, num_frames, 3, height, width)  # Use correct dims: (1, T, C, H, W)
+        return type("obj", (object,), {"frames": frames})()  # No longer a list!
     def __init__(self):
+        super().__init__()
+        self._modules = OrderedDict()
+        self.vae = self.VAE()
+        self._modules["vae"] = self.vae
     def named_children(self):
+        return self._modules.items()
     class VAE:
         def enable_tiling(self):
             pass
 def quantize_(*args, **kwargs):
     """Dummy no-op: accept any arguments and return ``None``."""
     return None
 def float8_weight_only():
     """Dummy no-op: take no arguments and return ``None``."""
     return None
 # --- End Dummy Classes ---
 class SkyReelsVideoSingleGpuInfer:
+    def _load_model(
+        self, model_id: str, base_model_id: str = "hunyuanvideo-community/HunyuanVideo", quant_model: bool = True
+    ):
         logger.info(f"load model model_id:{model_id} quan_model:{quant_model}")
         text_encoder = LlamaModel.from_pretrained(
             base_model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
         self.pipe = self._load_model(model_id=self.model_id, quant_model=self.quant_model)
         if self.is_offload:
+            pass  # Offloading logic (if any) would go here
         else:
             self.pipe.to(self.gpu_device)
         self.is_initialized = True
     def warm_up(self):
+        if not self.is_initialized:
+            raise RuntimeError("Model must be initialized before warm-up.")
+        init_kwargs = {
             "prompt": "A woman is dancing in a room",
             "height": 544,
             "width": 960,
             "generator": torch.Generator(self.gpu_device).manual_seed(42),
             "embedded_guidance_scale": 1.0,
         }
+        if self.task_type == TaskType.I2V:
+            init_kwargs["image"] = Image.new("RGB", (544, 960), color="black")
+        self.pipe(**init_kwargs)
+        logger.info("Warm-up complete.")
     def infer(self, **kwargs):
         """Handles inference requests."""
         if not self.is_initialized:
+            self.initialize()
         if "seed" in kwargs:
             kwargs["generator"] = torch.Generator(self.gpu_device).manual_seed(kwargs["seed"])
             del kwargs["seed"]
         assert (self.task_type == TaskType.I2V and "image" in kwargs) or self.task_type == TaskType.T2V
+        result = self.pipe(**kwargs).frames  # Return the tensor directly
         return result
 _predictor = None
 @spaces.GPU(duration=90)
+def generate_video(prompt: str, seed: int, image: str = None) -> tuple[str, dict]:
+    """Generates a video based on the given prompt and seed.
+    Args:
+        prompt: The text prompt to guide video generation.
+        seed: The random seed for reproducibility.
+        image: Optional path to an image for Image-to-Video.
+    Returns:
+        A tuple containing the path to the generated video and the parameters used.
+    """
     global _predictor
     if seed == -1:
     else:
         task_type = TaskType.I2V
         model_id = "Skywork/SkyReels-V1-Hunyuan-I2V"
         kwargs = {
             "prompt": prompt,
             "image": Image.open(image),
             "num_frames": 97,
             "num_inference_steps": 30,
             "seed": seed,
             "guidance_scale": 6.0,
             "embedded_guidance_scale": 1.0,
             "negative_prompt": "Aerial view, low quality, bad hands",
+            "cfg_for": False, #Keep if present in the original
         }
     if _predictor is None:
         )
         _predictor.initialize()
         logger.info("Predictor initialized")
     with torch.no_grad():
+        output = _predictor.infer(**kwargs) #Removed [0]
+    output = (output.numpy() * 255).astype(np.uint8)
+    output = output.transpose(0, 2, 3, 4, 1) # Keep this
+    output = output[0]  # Remove batch dimension, now (T, H, W, C)
     save_dir = f"./result"
     os.makedirs(save_dir, exist_ok=True)
     export_to_video(output, video_out_file, fps=24)
     return video_out_file, kwargs
 def create_gradio_interface():
     with gr.Blocks() as demo:
         with gr.Row():