Update README.md
Browse files
README.md
CHANGED
|
@@ -38,56 +38,94 @@ pipeline = transformers.pipeline(
|
|
| 38 |
device_map="auto",
|
| 39 |
)
|
| 40 |
|
| 41 |
-
# prompt formatting
|
| 42 |
|
|
|
|
|
|
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
test_caption = "a blue rabbit and a red plane"
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
model = PromptCap("vqascore/promptcap-coco-vqa") # also support OFA checkpoints. e.g. "OFA-Sys/ofa-large"
|
| 51 |
-
|
| 52 |
-
if torch.cuda.is_available():
|
| 53 |
-
model.cuda()
|
| 54 |
-
|
| 55 |
-
prompt = "please describe this image according to the given question: what piece of clothing is this boy putting on?"
|
| 56 |
-
image = "glove_boy.jpeg"
|
| 57 |
-
|
| 58 |
-
print(model.caption(prompt, image))
|
| 59 |
-
```
|
| 60 |
-
|
| 61 |
-
To try generic captioning, just use "what does the image describe?"
|
| 62 |
|
| 63 |
-
|
| 64 |
-
prompt = "what does the image describe?"
|
| 65 |
-
image = "glove_boy.jpeg"
|
| 66 |
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
```
|
| 69 |
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
PromptCap also support taking OCR inputs:
|
| 73 |
|
| 74 |
```python
|
| 75 |
-
|
| 76 |
-
image = "dvds.jpg"
|
| 77 |
-
ocr = "yip AE Mht juor 02/14/2012"
|
| 78 |
-
|
| 79 |
-
print(model.caption(prompt, image, ocr))
|
| 80 |
-
```
|
| 81 |
|
|
|
|
| 82 |
|
|
|
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
## Bibtex
|
| 86 |
```
|
| 87 |
-
@article{
|
| 88 |
-
title={
|
| 89 |
-
author={Hu, Yushi and
|
| 90 |
-
journal={arXiv preprint arXiv:
|
| 91 |
-
year={
|
| 92 |
}
|
| 93 |
```
|
|
|
|
| 38 |
device_map="auto",
|
| 39 |
)
|
| 40 |
|
|
|
|
| 41 |
|
| 42 |
+
# formatting prompt following LLaMA 2 style
|
| 43 |
+
def create_qg_prompt(caption):
|
| 44 |
|
| 45 |
+
INTRO_BLURB = """Given an image description, generate one or two multiple-choice questions that verifies if the image description is correct.
|
| 46 |
+
Classify each concept into a type (object, human, animal, food, activity, attribute, counting, color, material, spatial, location, shape, other), and then generate a question for each type.
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
formated_prompt = f"<s>[INST] <<SYS>>\n{INTRO_BLURB}\n<</SYS>>\n\n"
|
| 50 |
+
formated_prompt += f"Description: {caption} [/INST] Entities:"
|
| 51 |
+
return formated_prompt
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
test_caption = "a blue rabbit and a red plane"
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
# create prompt
|
| 57 |
+
prompt = create_qg_prompt(test_caption)
|
| 58 |
+
|
| 59 |
+
# text completion
|
| 60 |
+
sequences = pipeline(
|
| 61 |
+
prompt, do_sample=False, num_beams=5, num_return_sequences=1, max_length=512)
|
| 62 |
+
output = sequences[0]['generated_text'][len(prompt):]
|
| 63 |
+
output = output.split('\n\n')[0]
|
| 64 |
+
|
| 65 |
+
# output
|
| 66 |
+
print(output)
|
| 67 |
+
|
| 68 |
+
#### Expected output ###
|
| 69 |
+
# rabbit, plane
|
| 70 |
+
# Activites:
|
| 71 |
+
# Colors: blue, red
|
| 72 |
+
# Counting:
|
| 73 |
+
# Other attributes:
|
| 74 |
+
# About rabbit (animal):
|
| 75 |
+
# Q: is this a rabbit?
|
| 76 |
+
# Choices: yes, no
|
| 77 |
+
# A: yes
|
| 78 |
+
# About rabbit (animal):
|
| 79 |
+
# Q: what animal is in the picture?
|
| 80 |
+
# Choices: rabbit, dog, cat, fish
|
| 81 |
+
# A: rabbit
|
| 82 |
+
# About plane (object):
|
| 83 |
+
# Q: is this a plane?
|
| 84 |
+
# Choices: yes, no
|
| 85 |
+
# A: yes
|
| 86 |
+
# About plane (object):
|
| 87 |
+
# Q: what type of vehicle is this?
|
| 88 |
+
# Choices: plane, car, motorcycle, bus
|
| 89 |
+
# A: plane
|
| 90 |
+
# About blue (color):
|
| 91 |
+
# Q: is the rabbit blue?
|
| 92 |
+
# Choices: yes, no
|
| 93 |
+
# A: yes
|
| 94 |
+
# About blue (color):
|
| 95 |
+
# Q: what color is the rabbit?
|
| 96 |
+
# Choices: blue, red, yellow, green
|
| 97 |
+
# A: blue
|
| 98 |
+
# About red (color):
|
| 99 |
+
# Q: is the plane red?
|
| 100 |
+
# Choices: yes, no
|
| 101 |
+
# A: yes
|
| 102 |
+
# About red (color):
|
| 103 |
+
# Q: what color is the plane?
|
| 104 |
+
# Choices: red, blue, yellow, green
|
| 105 |
+
# A: red
|
| 106 |
```
|
| 107 |
|
| 108 |
+
# Use this LM with the tifascore package
|
| 109 |
|
| 110 |
+
tifascore provides extra functions to parse this output, among other utilities. Usage is shown below:
|
|
|
|
| 111 |
|
| 112 |
```python
|
| 113 |
+
from tifascore import get_llama2_pipeline, get_llama2_question_and_answers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
pipeline = get_llama2_pipeline("tifa-benchmark/llama2_tifa_question_generation")
|
| 116 |
|
| 117 |
+
print(get_llama2_question_and_answers(pipeline, "a blue rabbit and a red plane"))
|
| 118 |
|
| 119 |
+
#### Expected output ###
|
| 120 |
+
[{'caption': 'a blue rabbit and a red plane', 'element': 'rabbit', 'question': 'what animal is in the picture?', 'choices': ['rabbit', 'dog', 'cat', 'fish'], 'answer': 'rabbit', 'element_type': 'animal/human'}, {'caption': 'a blue rabbit and a red plane', 'element': 'plane', 'question': 'is this a plane?', 'choices': ['yes', 'no'], 'answer': 'yes', 'element_type': 'object'}, {'caption': 'a blue rabbit and a red plane', 'element': 'plane', 'question': 'what type of vehicle is this?', 'choices': ['plane', 'car', 'motorcycle', 'bus'], 'answer': 'plane', 'element_type': 'object'}, {'caption': 'a blue rabbit and a red plane', 'element': 'blue', 'question': 'is the rabbit blue?', 'choices': ['yes', 'no'], 'answer': 'yes', 'element_type': 'color'}, {'caption': 'a blue rabbit and a red plane', 'element': 'blue', 'question': 'what color is the rabbit?', 'choices': ['blue', 'red', 'yellow', 'green'], 'answer': 'blue', 'element_type': 'color'}, {'caption': 'a blue rabbit and a red plane', 'element': 'red', 'question': 'is the plane red?', 'choices': ['yes', 'no'], 'answer': 'yes', 'element_type': 'color'}, {'caption': 'a blue rabbit and a red plane', 'element': 'red', 'question': 'what color is the plane?', 'choices': ['red', 'blue', 'yellow', 'green'], 'answer': 'red', 'element_type': 'color'}]
|
| 121 |
+
```
|
| 122 |
|
| 123 |
## Bibtex
|
| 124 |
```
|
| 125 |
+
@article{hu2023tifa,
|
| 126 |
+
title={Tifa: Accurate and interpretable text-to-image faithfulness evaluation with question answering},
|
| 127 |
+
author={Hu, Yushi and Liu, Benlin and Kasai, Jungo and Wang, Yizhong and Ostendorf, Mari and Krishna, Ranjay and Smith, Noah A},
|
| 128 |
+
journal={arXiv preprint arXiv:2303.11897},
|
| 129 |
+
year={2023}
|
| 130 |
}
|
| 131 |
```
|