mwmathis committed on
Commit 9035bec · verified · 1 Parent(s): a42559b

Update README.md

Files changed (1)
  1. README.md +90 -81
README.md CHANGED
@@ -15,86 +15,88 @@ tags:
 - Video
 - MQA
 - multimodal
+- VLM
+- LLaVAction
+- MLLMs
 model-index:
-- name: LLaVAction-7B
-  results:
-  - task:
-      type: multimodal
-    dataset:
-      name: EgoSchema
-      type: egoschema
-    metrics:
-    - type: accuracy
-      value: 59.0
-      name: accuracy
-      verified: true
-  - task:
-      type: multimodal
-    dataset:
-      name: MVBench
-      type: mvbench
-    metrics:
-    - type: accuracy
-      value: 61.1
-      name: accuracy
-      verified: true
-  - task:
-      type: multimodal
-    dataset:
-      name: NextQA
-      type: nextqa
-    metrics:
-    - type: accuracy
-      value: 82.8
-      name: accuracy
-      verified: true
-  - task:
-      type: multimodal
-    dataset:
-      name: PercepTest
-      type: percepTest
-    metrics:
-    - type: accuracy
-      value: 70.2
-      name: accuracy
-      verified: true
-  - task:
-      type: multimodal
-    dataset:
-      name: LongVideoBench
-      type: longvideobench
-    metrics:
-    - type: accuracy
-      value: 58.6
-      name: accuracy
-      verified: true
-  - task:
-      type: multimodal
-    dataset:
-      name: VideoMME
-      type: videomme
-    metrics:
-    - type: accuracy
-      value: 63.9
-      name: accuracy
-      verified: true
-  - task:
-      type: multimodal
-    dataset:
-      name: VideoMME (w-subs)
-      type: videomme
-    metrics:
-    - type: accuracy
-      value: 71.4
-      name: accuracy
-      verified: true
+- name: LLaVAction-7B
+  results:
+  - task:
+      type: multimodal
+    dataset:
+      name: EgoSchema
+      type: egoschema
+    metrics:
+    - type: accuracy
+      value: 59
+      name: accuracy
+      verified: true
+  - task:
+      type: multimodal
+    dataset:
+      name: MVBench
+      type: mvbench
+    metrics:
+    - type: accuracy
+      value: 61.1
+      name: accuracy
+      verified: true
+  - task:
+      type: multimodal
+    dataset:
+      name: NextQA
+      type: nextqa
+    metrics:
+    - type: accuracy
+      value: 82.8
+      name: accuracy
+      verified: true
+  - task:
+      type: multimodal
+    dataset:
+      name: PercepTest
+      type: percepTest
+    metrics:
+    - type: accuracy
+      value: 70.2
+      name: accuracy
+      verified: true
+  - task:
+      type: multimodal
+    dataset:
+      name: LongVideoBench
+      type: longvideobench
+    metrics:
+    - type: accuracy
+      value: 58.6
+      name: accuracy
+      verified: true
+  - task:
+      type: multimodal
+    dataset:
+      name: VideoMME
+      type: videomme
+    metrics:
+    - type: accuracy
+      value: 63.9
+      name: accuracy
+      verified: true
+  - task:
+      type: multimodal
+    dataset:
+      name: VideoMME (w-subs)
+      type: videomme
+    metrics:
+    - type: accuracy
+      value: 71.4
+      name: accuracy
+      verified: true
 ---
 
 # LLaVAction-7B
 
 ## Model Summary
-The LLaVAction models are 7B parameter models trained on LLaVA-Video-178K and EPIC-KITCHENS-100-MQA, based on Qwen2 language model with a context window of 32K tokens.
-
+The LLaVAction-7B model is trained on EPIC-KITCHENS-100-MQA, based on the Qwen2 language model with a context window of 32K tokens.
 This model supports at most 64 frames.
 
 - **Project Page**: [https://mmathislab.github.io/llavaction/](https://mmathislab.github.io/llavaction/)
@@ -103,20 +105,18 @@ This model supports at most 64 frames.
 - **Point of Contact**: [Mackenzie Mathis](https://people.epfl.ch/mackenzie.mathis)
 - **Languages**: English
 -
-## Use
+## Usage
 
 ### Intended use
 The model was trained on EPIC-KITCHENS-100-MQA and LLaVA-Video-178K (link). It has improved capability for understanding human egocentric actions in videos.
 
 
-**Feel free to share your generations in the Community tab!**
-
-
 ### Generation
 We provide a simple generation example for using our model. For more details, please refer to our GitHub repository.
 
 ```python
 !pip install llavaction
+
 from llavaction.model.builder import load_pretrained_model
 from llavaction.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
 from llavaction.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
@@ -130,6 +130,14 @@ import warnings
 from decord import VideoReader, cpu
 import numpy as np
 warnings.filterwarnings("ignore")
+
+# Your video (it assumes an egocentric viewpoint)
+video_path = "XXXX"
+
+# These are the prompts we trained with, but you can test others:
+perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
+task_prompt = "Describe in details what you see from the video frames."
+
 def load_video(video_path, max_frames_num,fps=1,force_sample=False):
     if max_frames_num == 0:
         return np.zeros((1, 336, 336, 3))
@@ -146,27 +154,27 @@ def load_video(video_path, max_frames_num,fps=1,force_sample=False):
     spare_frames = vr.get_batch(frame_idx).asnumpy()
     # import pdb;pdb.set_trace()
     return spare_frames,frame_time,video_time
+
 pretrained = "MLAdaptiveIntelligence/LLaVAction-7B"
 model_name = "llava_qwen"
 device = "cuda"
 device_map = "auto"
 tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)  # Add any other arguments you want to pass in llava_model_args
 model.eval()
-video_path = "XXXX"
 max_frames_num = 64
 video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
 video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)
 video = [video]
 conv_template = "qwen_1_5"  # Make sure you use the correct chat template for different models
 time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
-perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
-task_prompt = "Describe in details what you see from the video frames."
 question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n{perspective_prompt} {task_prompt}"
+
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
 conv.append_message(conv.roles[1], None)
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+
 cont = model.generate(
     input_ids,
     images=video,
@@ -182,6 +190,7 @@ print(text_outputs)
 
 ## Training
 
+See details in Ye et al. 2025.
 
 ### Model
 - **Architecture**: SO400M + Qwen2
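
Note: the diff context cuts the Python example off after `images=video,`, while the last hunk header shows that the script ends with `print(text_outputs)`. Below is a minimal sketch of how that tail is typically completed in LLaVA-style inference code, continuing the variables (`model`, `tokenizer`, `input_ids`, `video`) defined in the snippet above; the exact keyword arguments are not shown in this commit, so treat `modalities` and the decoding settings as assumptions and check the LLaVAction GitHub repository for the reference script.

```python
# Minimal sketch of the truncated tail of the README example.
# The generate() kwargs below are assumptions (LLaVA-Video-style), not taken from this commit.
cont = model.generate(
    input_ids,
    images=video,
    modalities=["video"],   # assumed flag marking the input as video rather than images
    do_sample=False,        # greedy decoding for a reproducible answer
    temperature=0,
    max_new_tokens=4096,
)
text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip()
print(text_outputs)
```

With `max_frames_num = 64` and `force_sample=True`, the preprocessed clip matches the 64-frame limit stated in the model summary.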