import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from PIL import Image
import gc
import os
import spaces

# Model configuration
MODEL_PATH = "nvidia/Llama-Nemotron-Nano-VL-8B-V1"

# Load model globally
print("Loading model...")
model = AutoModel.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
image_processor = AutoImageProcessor.from_pretrained(
    MODEL_PATH, trust_remote_code=True
)
print("Model loaded successfully!")


def move_to_device(obj, device):
    """Recursively move tensors (and nested containers of tensors) to device."""
    if torch.is_tensor(obj):
        return obj.to(device)
    elif isinstance(obj, dict):
        return {k: move_to_device(v, device) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [move_to_device(v, device) for v in obj]
    elif isinstance(obj, tuple):
        return tuple(move_to_device(v, device) for v in obj)
    elif hasattr(obj, 'to'):
        return obj.to(device)
    else:
        return obj


@spaces.GPU(duration=60)
def chat_text_only(message):
    """Text-only chat: call model.chat with no image input."""
    try:
        device = "cuda"
        # Move entire model to GPU
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # Generate (model.chat handles tokenization internally)
        with torch.no_grad():
            response, _ = model.chat(
                tokenizer, None, message, generation_config,
                history=None, return_history=True
            )

        # Move model back to CPU
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()

        return response
    except Exception as e:
        # Ensure model is back on CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"


@spaces.GPU(duration=60)
def chat_with_image(image, message):
    """Single-image chat: prepend the <image> token and pass processed image features."""
    if image is None:
        return "Please upload an image."
    try:
        device = "cuda"
        # Move entire model to GPU
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # Process image
        image_features = image_processor(image)
        # Move all image features to GPU
        image_features = move_to_device(image_features, device)

        # Add image token to message if not present
        if "<image>" not in message:
            message = f"<image>\n{message}"

        # Generate
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        # Move model back to CPU
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()

        return response
    except Exception as e:
        # Ensure model is back on CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"


@spaces.GPU(duration=60)
def chat_with_two_images(image1, image2, message):
    """Two-image chat: insert one <image> placeholder per image and pass both sets of features."""
    if image1 is None or image2 is None:
        return "Please upload both images."
    try:
        device = "cuda"
        # Move entire model to GPU
        model.to(device)

        generation_config = dict(
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

        # Process both images
        image_features = image_processor([image1, image2])
        # Move all image features to GPU
        image_features = move_to_device(image_features, device)

        # Format message for two images if no image placeholders are present
        if "<image>" not in message and "<image-1>" not in message:
            message = f"Image-1: <image>\nImage-2: <image>\n{message}"

        # Generate
        with torch.no_grad():
            response = model.chat(
                tokenizer=tokenizer,
                question=message,
                generation_config=generation_config,
                **image_features
            )

        # Move model back to CPU
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()

        return response
    except Exception as e:
        # Ensure model is back on CPU even if an error occurs
        model.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()
        return f"Error: {str(e)}"


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Llama Nemotron Nano VL 8B", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🦙 Llama Nemotron Nano VL 8B Vision-Language Model")
        gr.Markdown("Chat with a powerful vision-language model that can understand both text and images!")

        with gr.Tabs():
            # Text-only chat tab
            with gr.TabItem("💬 Text Chat"):
                gr.Markdown("### Chat with the model using text only")
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            label="Your message",
                            placeholder="Ask me anything...",
                            lines=3
                        )
                        text_submit = gr.Button("Send", variant="primary")
                    with gr.Column():
                        text_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                text_submit.click(
                    chat_text_only,
                    inputs=[text_input],
                    outputs=[text_output]
                )

                # Example questions
                gr.Examples(
                    examples=[
                        ["What is artificial intelligence?"],
                        ["Explain quantum computing in simple terms."],
                        ["What happened in 1969?"],
                        ["Write a short story about a robot."]
                    ],
                    inputs=[text_input]
                )

            # Single image chat tab
            with gr.TabItem("🖼️ Image + Text Chat"):
                gr.Markdown("### Upload an image and ask questions about it")
                with gr.Row():
                    with gr.Column():
                        image_input = gr.Image(
                            label="Upload Image",
                            type="pil"
                        )
                        image_text_input = gr.Textbox(
                            label="Your question about the image",
                            placeholder="What do you see in this image?",
                            lines=3
                        )
                        image_submit = gr.Button("Analyze", variant="primary")
                    with gr.Column():
                        image_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                image_submit.click(
                    chat_with_image,
                    inputs=[image_input, image_text_input],
                    outputs=[image_output]
                )

                # Example prompts
                gr.Examples(
                    examples=[
                        ["Describe what you see in this image."],
                        ["What objects are in this image?"],
                        ["Extract any text from this image."],
                        ["What is the main subject of this image?"]
                    ],
                    inputs=[image_text_input]
                )

            # Two images comparison tab
            with gr.TabItem("🖼️🖼️ Compare Two Images"):
                gr.Markdown("### Upload two images and ask the model to compare them")
                with gr.Row():
                    with gr.Column():
                        image1_input = gr.Image(
                            label="First Image",
                            type="pil"
                        )
                        image2_input = gr.Image(
                            label="Second Image",
                            type="pil"
                        )
                        two_images_text_input = gr.Textbox(
                            label="Your question about both images",
                            placeholder="Compare these two images...",
                            lines=3
                        )
                        two_images_submit = gr.Button("Compare", variant="primary")
                    with gr.Column():
                        two_images_output = gr.Textbox(
                            label="Model Response",
                            lines=10,
                            max_lines=20
                        )

                two_images_submit.click(
                    chat_with_two_images,
                    inputs=[image1_input, image2_input, two_images_text_input],
                    outputs=[two_images_output]
                )

                # Example prompts
                gr.Examples(
                    examples=[
                        ["What are the main differences between these two images?"],
                        ["Describe both images briefly."],
                        ["Which image is more colorful?"],
                        ["Compare the subjects in these images."]
                    ],
                    inputs=[two_images_text_input]
                )

        # Footer
        gr.Markdown("---")
        gr.Markdown("⚡ Powered by NVIDIA Llama Nemotron Nano VL 8B")

    return demo


# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.queue()  # Enable queuing for Zero GPU
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False
    )