Update README.md
Browse files
README.md
CHANGED
|
@@ -38,56 +38,94 @@ pipeline = transformers.pipeline(
|
|
| 38 |
device_map="auto",
|
| 39 |
)
|
| 40 |
|
| 41 |
-
# prompt formatting
|
| 42 |
|
|
|
|
|
|
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
test_caption = "a blue rabbit and a red plane"
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
model = PromptCap("vqascore/promptcap-coco-vqa") # also support OFA checkpoints. e.g. "OFA-Sys/ofa-large"
|
| 51 |
-
|
| 52 |
-
if torch.cuda.is_available():
|
| 53 |
-
model.cuda()
|
| 54 |
-
|
| 55 |
-
prompt = "please describe this image according to the given question: what piece of clothing is this boy putting on?"
|
| 56 |
-
image = "glove_boy.jpeg"
|
| 57 |
-
|
| 58 |
-
print(model.caption(prompt, image))
|
| 59 |
-
```
|
| 60 |
-
|
| 61 |
-
To try generic captioning, just use "what does the image describe?"
|
| 62 |
|
| 63 |
-
|
| 64 |
-
prompt = "what does the image describe?"
|
| 65 |
-
image = "glove_boy.jpeg"
|
| 66 |
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
```
|
| 69 |
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
PromptCap also support taking OCR inputs:
|
| 73 |
|
| 74 |
```python
|
| 75 |
-
|
| 76 |
-
image = "dvds.jpg"
|
| 77 |
-
ocr = "yip AE Mht juor 02/14/2012"
|
| 78 |
-
|
| 79 |
-
print(model.caption(prompt, image, ocr))
|
| 80 |
-
```
|
| 81 |
|
|
|
|
| 82 |
|
|
|
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
## Bibtex
|
| 86 |
```
|
| 87 |
-
@article{
|
| 88 |
-
title={
|
| 89 |
-
author={Hu, Yushi and
|
| 90 |
-
journal={arXiv preprint arXiv:
|
| 91 |
-
year={
|
| 92 |
}
|
| 93 |
```
|
|
|
|
| 38 |
device_map="auto",
|
| 39 |
)
|
| 40 |
|
|
|
|
| 41 |
|
| 42 |
+
# formatting prompt following LLaMA 2 style
|
| 43 |
+
def create_qg_prompt(caption):
|
| 44 |
|
| 45 |
+
INTRO_BLURB = """Given an image description, generate one or two multiple-choice questions that verifies if the image description is correct.
|
| 46 |
+
Classify each concept into a type (object, human, animal, food, activity, attribute, counting, color, material, spatial, location, shape, other), and then generate a question for each type.
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
formated_prompt = f"<s>[INST] <<SYS>>\n{INTRO_BLURB}\n<</SYS>>\n\n"
|
| 50 |
+
formated_prompt += f"Description: {caption} [/INST] Entities:"
|
| 51 |
+
return formated_prompt
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
test_caption = "a blue rabbit and a red plane"
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
# create prompt
|
| 57 |
+
prompt = create_qg_prompt(test_caption)
|
| 58 |
+
|
| 59 |
+
# text completion
|
| 60 |
+
sequences = pipeline(
|
| 61 |
+
prompt, do_sample=False, num_beams=5, num_return_sequences=1, max_length=512)
|
| 62 |
+
output = sequences[0]['generated_text'][len(prompt):]
|
| 63 |
+
output = output.split('\n\n')[0]
|
| 64 |
+
|
| 65 |
+
# output
|
| 66 |
+
print(output)
|
| 67 |
+
|
| 68 |
+
#### Expected output ###
|
| 69 |
+
# rabbit, plane
|
| 70 |
+
# Activites:
|
| 71 |
+
# Colors: blue, red
|
| 72 |
+
# Counting:
|
| 73 |
+
# Other attributes:
|
| 74 |
+
# About rabbit (animal):
|
| 75 |
+
# Q: is this a rabbit?
|
| 76 |
+
# Choices: yes, no
|
| 77 |
+
# A: yes
|
| 78 |
+
# About rabbit (animal):
|
| 79 |
+
# Q: what animal is in the picture?
|
| 80 |
+
# Choices: rabbit, dog, cat, fish
|
| 81 |
+
# A: rabbit
|
| 82 |
+
# About plane (object):
|
| 83 |
+
# Q: is this a plane?
|
| 84 |
+
# Choices: yes, no
|
| 85 |
+
# A: yes
|
| 86 |
+
# About plane (object):
|
| 87 |
+
# Q: what type of vehicle is this?
|
| 88 |
+
# Choices: plane, car, motorcycle, bus
|
| 89 |
+
# A: plane
|
| 90 |
+
# About blue (color):
|
| 91 |
+
# Q: is the rabbit blue?
|
| 92 |
+
# Choices: yes, no
|
| 93 |
+
# A: yes
|
| 94 |
+
# About blue (color):
|
| 95 |
+
# Q: what color is the rabbit?
|
| 96 |
+
# Choices: blue, red, yellow, green
|
| 97 |
+
# A: blue
|
| 98 |
+
# About red (color):
|
| 99 |
+
# Q: is the plane red?
|
| 100 |
+
# Choices: yes, no
|
| 101 |
+
# A: yes
|
| 102 |
+
# About red (color):
|
| 103 |
+
# Q: what color is the plane?
|
| 104 |
+
# Choices: red, blue, yellow, green
|
| 105 |
+
# A: red
|
| 106 |
```
|
| 107 |
|
| 108 |
+
# Use this LM with the tifascore package
|
| 109 |
|
| 110 |
+
tifascore provides extra functions to parse this output, among other utilities. Usage is shown below:
|
|
|
|
| 111 |
|
| 112 |
```python
|
| 113 |
+
from tifascore import get_llama2_pipeline, get_llama2_question_and_answers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
pipeline = get_llama2_pipeline("tifa-benchmark/llama2_tifa_question_generation")
|
| 116 |
|
| 117 |
+
print(get_llama2_question_and_answers(pipeline, "a blue rabbit and a red plane"))
|
| 118 |
|
| 119 |
+
#### Expected output ###
|
| 120 |
+
[{'caption': 'a blue rabbit and a red plane', 'element': 'rabbit', 'question': 'what animal is in the picture?', 'choices': ['rabbit', 'dog', 'cat', 'fish'], 'answer': 'rabbit', 'element_type': 'animal/human'}, {'caption': 'a blue rabbit and a red plane', 'element': 'plane', 'question': 'is this a plane?', 'choices': ['yes', 'no'], 'answer': 'yes', 'element_type': 'object'}, {'caption': 'a blue rabbit and a red plane', 'element': 'plane', 'question': 'what type of vehicle is this?', 'choices': ['plane', 'car', 'motorcycle', 'bus'], 'answer': 'plane', 'element_type': 'object'}, {'caption': 'a blue rabbit and a red plane', 'element': 'blue', 'question': 'is the rabbit blue?', 'choices': ['yes', 'no'], 'answer': 'yes', 'element_type': 'color'}, {'caption': 'a blue rabbit and a red plane', 'element': 'blue', 'question': 'what color is the rabbit?', 'choices': ['blue', 'red', 'yellow', 'green'], 'answer': 'blue', 'element_type': 'color'}, {'caption': 'a blue rabbit and a red plane', 'element': 'red', 'question': 'is the plane red?', 'choices': ['yes', 'no'], 'answer': 'yes', 'element_type': 'color'}, {'caption': 'a blue rabbit and a red plane', 'element': 'red', 'question': 'what color is the plane?', 'choices': ['red', 'blue', 'yellow', 'green'], 'answer': 'red', 'element_type': 'color'}]
|
| 121 |
+
```
|
| 122 |
|
| 123 |
## Bibtex
|
| 124 |
```
|
| 125 |
+
@article{hu2023tifa,
|
| 126 |
+
title={Tifa: Accurate and interpretable text-to-image faithfulness evaluation with question answering},
|
| 127 |
+
author={Hu, Yushi and Liu, Benlin and Kasai, Jungo and Wang, Yizhong and Ostendorf, Mari and Krishna, Ranjay and Smith, Noah A},
|
| 128 |
+
journal={arXiv preprint arXiv:2303.11897},
|
| 129 |
+
year={2023}
|
| 130 |
}
|
| 131 |
```
|