Update README.md
README.md CHANGED
@@ -41,6 +41,7 @@ Our model can outperform the existing baselines by a huge margin.
 First you can clone our GitHub repository.
 ```bash
 git clone https://github.com/TIGER-AI-Lab/VLM2Vec.git
+pip install -r requirements.txt
 ```

 Then you can enter the directory and run the following command.
@@ -53,7 +54,7 @@ from PIL import Image
 import numpy as np

 model_args = ModelArguments(
-model_name='microsoft/Phi-3.5-vision-instruct',
+    model_name='microsoft/Phi-3.5-vision-instruct',
     pooling='last',
     normalize=True,
     lora=True,
@@ -74,17 +75,19 @@ inputs = processor('<|image_1|> Represent the given image with the following que
 inputs = {key: value.to('cuda') for key, value in inputs.items()}
 qry_output = model(qry=inputs)["qry_reps"]

-## Compute the similarity;
 string = 'A cat and a dog'
 inputs = processor(string)
 inputs = {key: value.to('cuda') for key, value in inputs.items()}
 tgt_output = model(tgt=inputs)["tgt_reps"]
 print(string, '=', model.compute_similarity(qry_output, tgt_output))
+## A cat and a dog = tensor([[0.2969]], device='cuda:0', dtype=torch.bfloat16)

+string = 'A cat and a tiger'
 inputs = processor(string)
 inputs = {key: value.to('cuda') for key, value in inputs.items()}
 tgt_output = model(tgt=inputs)["tgt_reps"]
 print(string, '=', model.compute_similarity(qry_output, tgt_output))
+## A cat and a tiger = tensor([[0.2080]], device='cuda:0', dtype=torch.bfloat16)

 # Text -> Image
 inputs = processor('Find me an everyday image that matches the given caption: A cat and a dog.',)
@@ -92,10 +95,11 @@ inputs = {key: value.to('cuda') for key, value in inputs.items()}
 qry_output = model(qry=inputs)["qry_reps"]

 string = '<|image_1|> Represent the given image.'
-inputs = processor(string, [Image.open('figures/example.jpg')]
+inputs = processor(string, [Image.open('figures/example.jpg')])
 inputs = {key: value.to('cuda') for key, value in inputs.items()}
 tgt_output = model(tgt=inputs)["tgt_reps"]
 print(string, '=', model.compute_similarity(qry_output, tgt_output))
+## <|image_1|> Represent the given image. = tensor([[0.3105]], device='cuda:0', dtype=torch.bfloat16)
 ```

 ## Citation