Tonic committed: add reference code from vllm
app.py (CHANGED)
@@ -14,7 +14,7 @@ import spaces
 import math
 from typing import List, Optional, Tuple
 
-title = "#
+title = "# 🙋🏻♂️Welcome to Tonic's Pixtral Model Demo"
 description = """
 This demo showcases two capabilities of the Pixtral model:
 1. Image-to-Text Generation
@@ -27,6 +27,7 @@ This demo showcases two capabilities of the Pixtral model:
 model_path = snapshot_download(repo_id="mistralai/Pixtral-12B-2409")
 with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
+
 with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
 
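For context on the loading step above, the rest of this commit reads a specific set of keys from params.json: the top-level `dim`, `n_layers`, `n_heads`, `hidden_dim`, and `vocab_size`, plus a nested `vision_encoder` block with `hidden_size`, `num_hidden_layers`, `num_attention_heads`, `image_size`, and `patch_size`. A minimal sanity-check sketch, assuming those keys exist in the downloaded checkpoint (the full params.json schema is not shown in this diff):

import json

# Hypothetical helper, not part of the commit: confirm params.json exposes the
# keys that app.py reads further down. Key names are taken from this diff.
REQUIRED_TOP_LEVEL = ["dim", "n_layers", "n_heads", "hidden_dim", "vocab_size", "vision_encoder"]
REQUIRED_VISION = ["hidden_size", "num_hidden_layers", "num_attention_heads", "image_size", "patch_size"]

def check_params(path: str) -> dict:
    with open(path, "r") as f:
        params = json.load(f)
    missing = [k for k in REQUIRED_TOP_LEVEL if k not in params]
    missing += [f"vision_encoder.{k}" for k in REQUIRED_VISION
                if k not in params.get("vision_encoder", {})]
    if missing:
        raise KeyError(f"params.json is missing expected keys: {missing}")
    return params

Called as check_params(f'{model_path}/params.json'), this would fail fast before any model construction if the checkpoint layout changes.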
@@ -177,14 +178,14 @@ class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
         self.vision_encoder = VisionTransformer(params['vision_encoder'])
-        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['
+        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['dim'])
         self.language_model = nn.TransformerDecoder(
-            nn.TransformerDecoderLayer(d_model=params['
-                                       nhead=params['
-                                       dim_feedforward=params['
-            num_layers=params['
+            nn.TransformerDecoderLayer(d_model=params['dim'],
+                                       nhead=params['n_heads'],
+                                       dim_feedforward=params['hidden_dim']),
+            num_layers=params['n_layers']
         )
-        self.lm_head = nn.Linear(params['
+        self.lm_head = nn.Linear(params['dim'], params['vocab_size'], bias=False)
 
     def forward(self, image, input_ids=None):
         vision_output = self.vision_encoder(image)
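A rough usage sketch of the constructor as revised above, assuming the `params` dict loaded earlier in app.py; it only exercises construction, and whether the downloaded weights are ever loaded into these modules is outside the hunks shown here:

# Hypothetical smoke test, not part of the commit: PixtralModel and params come
# from app.py; the counts below reflect a randomly initialized module tree.
model = PixtralModel(params)
n_params = sum(p.numel() for p in model.parameters())
print(f"decoder width: {params['dim']}, layers: {params['n_layers']}, heads: {params['n_heads']}")
print(f"parameters (no checkpoint weights loaded here): {n_params / 1e9:.2f}B")

The nn.TransformerDecoder stack is built directly from the checkpoint's hyperparameters (d_model=dim, nhead=n_heads, dim_feedforward=hidden_dim, num_layers=n_layers), with a bias-free lm_head projecting from dim to vocab_size.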
@@ -274,16 +275,18 @@ def calculate_similarity(image1, image2):
 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown(title)
     gr.Markdown("## Model Details")
+    gr.Markdown(f"- Model Dimension: {params['dim']}")
+    gr.Markdown(f"- Number of Layers: {params['n_layers']}")
+    gr.Markdown(f"- Number of Attention Heads: {params['n_heads']}")
     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
     gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
-    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
+    gr.Markdown(f"- Number of Vision Encoder Attention Heads: {params['vision_encoder']['num_attention_heads']}")
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
     gr.Markdown("## How it works")
     gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
     gr.Markdown("2. The encoder uses SiLU activation in its feed-forward layers.")
     gr.Markdown("3. The encoded image is used for text generation or similarity comparison.")
-
     gr.Markdown(description)
 
     with gr.Tabs():
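The "How it works" text added above says the vision encoder uses SiLU in its feed-forward layers. The encoder implementation itself is not part of this diff, so purely as an illustration of that statement, a minimal SiLU feed-forward block might look like this (the class and argument names are assumptions, not code from app.py):

import torch.nn as nn

# Illustrative only: a feed-forward block with SiLU activation, matching the
# description in the demo text; app.py's actual VisionTransformer is not shown here.
class SiLUFeedForward(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.up = nn.Linear(hidden_size, intermediate_size)
        self.act = nn.SiLU()
        self.down = nn.Linear(intermediate_size, hidden_size)

    def forward(self, x):
        return self.down(self.act(self.up(x)))

The 2D RoPE positional encoding mentioned in the same list is also internal to the encoder and is not sketched here.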