Programmatic Invocation
Is there a way to invoke these files programmatically, à la Diffusers? The examples I've seen still require loading the original model and just end up replacing the transformer part of the pipeline with the GGUF.
All other examples I've seen use frameworks and apps like ComfyUI. I'm looking for a way of invoking these models programmatically.
@city96 Can you help here?
I was reading your ComfyUI plugin, which, from what I can find online, seems to be about the only way to run GGUF quantisations of diffusion models. I didn't understand most of what I was looking at, I'm afraid, but any info you can share would be appreciated.
I had success with this code, adapted from these sources:
- similar discussion about QuantStack/Qwen-Image-Edit-2509-GGUF
- DFloat11's Wan2.2 example code
- Diffusers WanImageToVideoPipeline documentation
- Diffusers GGUF usage documentation
Shout out to @awssamdwar for the Qwen example, showing how it can be done with diffusers.
pip install -U gguf
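You'll also want recent diffusers, transformers, and accelerate builds alongside it; the pipeline pulls in the Wan text encoder via transformers and the CPU offload below relies on accelerate, so something along these lines (my assumption, adjust for your environment):
pip install -U diffusers transformers accelerate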
import torch
import numpy as np
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, WanTransformer3DModel, GGUFQuantizationConfig
from diffusers.utils import export_to_video, load_image
import os
import time
from datetime import datetime
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Default arguments taken from the example at https://huggingface.co/DFloat11/Wan2.2-I2V-A14B-DF11
image_path = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
negative_prompt = "Vibrant colors, overexposure, static, blurred details, subtitles, style, artwork, painting, still image, overall grayness, worst quality, low quality, JPEG compression residue, ugly, mutilated, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, malformed limbs, fused fingers, still image, cluttered background, three legs, crowded background, walking backwards"
width = 1280
height = 720
num_frames = 81
fps = 16
guidance_scale = 3.5
num_inference_steps = 40
seed = 42
def print_inv(text):
    print(f"\033[100m\033[30m{text}\033[0m")
# Download the high-noise and low-noise GGUF files from https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/tree/main and place them in a local folder
my_local_base_path = "/mnt/e/AI/HF"
my_local_quant_path = "gguf/QuantStack/Wan2.2-I2V-A14B-GGUF"
#transformer_file = "Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf"
transformer_file = "HighNoise/Wan2.2-I2V-A14B-HighNoise-Q6_K.gguf"
transformer_path = os.path.join(my_local_base_path, my_local_quant_path, transformer_file)
#transformer_2_file = "Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf"
transformer_2_file = "LowNoise/Wan2.2-I2V-A14B-LowNoise-Q6_K.gguf"
transformer_2_path = os.path.join(my_local_base_path, my_local_quant_path, transformer_2_file)
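# (Optional) If you'd rather fetch the GGUF files programmatically instead of downloading them by hand,
# something along these lines should also work via huggingface_hub; untested here, and it returns the
# cached file path directly:
#   from huggingface_hub import hf_hub_download
#   transformer_path = hf_hub_download("QuantStack/Wan2.2-I2V-A14B-GGUF", transformer_file)
#   transformer_2_path = hf_hub_download("QuantStack/Wan2.2-I2V-A14B-GGUF", transformer_2_file)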
#model_id = "Wan-AI/Wan2.2-I2V-A14B"
# The config.json files, model_index.json, and folder structure of the Wan-AI/Wan2.2-I2V-A14B repository are not what diffusers expects
# You can fix that manually with locally edited config files, or point to this repository instead, which is already laid out for diffusers:
model_id = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
print_inv("Configuring high noise transformer")
transformer = WanTransformer3DModel.from_single_file(
    transformer_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    config=model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    offload_device="cpu",
    device=torch.device("cuda"),
)
print_inv("Configuring low noise transformer")
transformer_2 = WanTransformer3DModel.from_single_file(
    transformer_2_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    config=model_id,
    subfolder="transformer_2",
    torch_dtype=torch.bfloat16,
    offload_device="cpu",
    device=torch.device("cuda"),
)
print_inv("Configuring VAE")
# This is configured to use the original VAE from the Wan-AI repository, which is only about 250MB larger than the one from QuantStack
vae = AutoencoderKLWan.from_pretrained(
    model_id,
    subfolder="vae",
    torch_dtype=torch.bfloat16,
)
print_inv("Configuring pipeline")
pipe = WanImageToVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    transformer_2=transformer_2,
    vae=vae,
    torch_dtype=torch.bfloat16,
    boundary_ratio=0.9,
)
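# Model offloading keeps each component on the CPU and only moves the one currently in use onto the GPU, trading some speed for a much lower peak VRAM footprint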
pipe.enable_model_cpu_offload()
print_inv("Loading and resizing image")
image = load_image(image_path)
max_area = width * height
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
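# Snap the output size to the nearest lower multiple of mod_value while preserving the aspect ratio and roughly max_area pixels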
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))
generator = torch.Generator(device="cuda").manual_seed(seed)
print_inv("Generating video output")
start_time = time.time()
output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
    generator=generator,
).frames[0]
print_inv(f"Time taken: {time.time() - start_time:.2f} seconds")
print_inv("Exporting video")
output_dir = os.path.join(my_local_base_path, "output")
os.makedirs(output_dir, exist_ok=True)  # make sure the output folder exists
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
output_path = f"{output_dir}/QuantStack-Wan22-I2V-A14B-GGUF-{timestamp}.mp4"
export_to_video(output, output_path, fps=fps)
print(f"File saved as {output_path}")
max_memory = torch.cuda.max_memory_allocated()
print(f"Max memory: {max_memory / (1000 ** 3):.2f} GB")
For reference, I'm running this on an RTX 5000 Ada Generation GPU with 32GB of VRAM in a WSL2 environment.