Tonic committed: add reference code from vllm
app.py (CHANGED)
@@ -14,7 +14,7 @@ import spaces
 import math
 from typing import List, Optional, Tuple
 
-title = "#
+title = "# 🙋🏻♂️Welcome to Tonic's Pixtral Model Demo"
 description = """
 This demo showcases two capabilities of the Pixtral model:
 1. Image-to-Text Generation
@@ -27,6 +27,7 @@ This demo showcases two capabilities of the Pixtral model:
 model_path = snapshot_download(repo_id="mistralai/Pixtral-12B-2409")
 with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
+
 with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
 
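For context on the loading step above, the rest of this commit reads a specific set of keys from params.json: the top-level `dim`, `n_layers`, `n_heads`, `hidden_dim`, and `vocab_size`, plus a nested `vision_encoder` block with `hidden_size`, `num_hidden_layers`, `num_attention_heads`, `image_size`, and `patch_size`. A minimal sanity-check sketch, assuming those keys exist in the downloaded checkpoint (the full params.json schema is not shown in this diff):

import json

# Hypothetical helper, not part of the commit: confirm params.json exposes the
# keys that app.py reads further down. Key names are taken from this diff.
REQUIRED_TOP_LEVEL = ["dim", "n_layers", "n_heads", "hidden_dim", "vocab_size", "vision_encoder"]
REQUIRED_VISION = ["hidden_size", "num_hidden_layers", "num_attention_heads", "image_size", "patch_size"]

def check_params(path: str) -> dict:
    with open(path, "r") as f:
        params = json.load(f)
    missing = [k for k in REQUIRED_TOP_LEVEL if k not in params]
    missing += [f"vision_encoder.{k}" for k in REQUIRED_VISION
                if k not in params.get("vision_encoder", {})]
    if missing:
        raise KeyError(f"params.json is missing expected keys: {missing}")
    return params

Called as check_params(f'{model_path}/params.json'), this would fail fast before any model construction if the checkpoint layout changes.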
@@ -177,14 +178,14 @@ class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
         self.vision_encoder = VisionTransformer(params['vision_encoder'])
-        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['
+        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['dim'])
         self.language_model = nn.TransformerDecoder(
-            nn.TransformerDecoderLayer(d_model=params['
-                                       nhead=params['
-                                       dim_feedforward=params['
-            num_layers=params['
+            nn.TransformerDecoderLayer(d_model=params['dim'],
+                                       nhead=params['n_heads'],
+                                       dim_feedforward=params['hidden_dim']),
+            num_layers=params['n_layers']
         )
-        self.lm_head = nn.Linear(params['
+        self.lm_head = nn.Linear(params['dim'], params['vocab_size'], bias=False)
 
     def forward(self, image, input_ids=None):
         vision_output = self.vision_encoder(image)
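A rough usage sketch of the constructor as revised above, assuming the `params` dict loaded earlier in app.py; it only exercises construction, and whether the downloaded weights are ever loaded into these modules is outside the hunks shown here:

# Hypothetical smoke test, not part of the commit: PixtralModel and params come
# from app.py; the counts below reflect a randomly initialized module tree.
model = PixtralModel(params)
n_params = sum(p.numel() for p in model.parameters())
print(f"decoder width: {params['dim']}, layers: {params['n_layers']}, heads: {params['n_heads']}")
print(f"parameters (no checkpoint weights loaded here): {n_params / 1e9:.2f}B")

The nn.TransformerDecoder stack is built directly from the checkpoint's hyperparameters (d_model=dim, nhead=n_heads, dim_feedforward=hidden_dim, num_layers=n_layers), with a bias-free lm_head projecting from dim to vocab_size.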
@@ -274,16 +275,18 @@ def calculate_similarity(image1, image2):
 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown(title)
     gr.Markdown("## Model Details")
+    gr.Markdown(f"- Model Dimension: {params['dim']}")
+    gr.Markdown(f"- Number of Layers: {params['n_layers']}")
+    gr.Markdown(f"- Number of Attention Heads: {params['n_heads']}")
     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
     gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
-    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
+    gr.Markdown(f"- Number of Vision Encoder Attention Heads: {params['vision_encoder']['num_attention_heads']}")
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
     gr.Markdown("## How it works")
     gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
     gr.Markdown("2. The encoder uses SiLU activation in its feed-forward layers.")
     gr.Markdown("3. The encoded image is used for text generation or similarity comparison.")
-
     gr.Markdown(description)
 
     with gr.Tabs():
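The "How it works" text added above says the vision encoder uses SiLU in its feed-forward layers. The encoder implementation itself is not part of this diff, so purely as an illustration of that statement, a minimal SiLU feed-forward block might look like this (the class and argument names are assumptions, not code from app.py):

import torch.nn as nn

# Illustrative only: a feed-forward block with SiLU activation, matching the
# description in the demo text; app.py's actual VisionTransformer is not shown here.
class SiLUFeedForward(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.up = nn.Linear(hidden_size, intermediate_size)
        self.act = nn.SiLU()
        self.down = nn.Linear(intermediate_size, hidden_size)

    def forward(self, x):
        return self.down(self.act(self.up(x)))

The 2D RoPE positional encoding mentioned in the same list is also internal to the encoder and is not sketched here.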