Sentence Similarity
Transformers
Safetensors
PyTorch
English
qwen2_5_vl
feature-extraction
video
retrieval
embedding
multimodal
qwen2.5-vl
custom_code
Instructions to use Alibaba-NLP/GVE-7B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Alibaba-NLP/GVE-7B with Transformers:
```python
# Load model directly
from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained("Alibaba-NLP/GVE-7B", trust_remote_code=True)
model = AutoModel.from_pretrained("Alibaba-NLP/GVE-7B", trust_remote_code=True)
```
- Notebooks
- Google Colab
- Kaggle
| import math | |
| from dataclasses import dataclass | |
| from typing import List, Optional, Tuple, Union | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.nn import CrossEntropyLoss | |
| from transformers.activations import ACT2FN | |
| from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache | |
| from transformers.generation import GenerationMixin | |
| from transformers.modeling_attn_mask_utils import AttentionMaskConverter | |
| from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput | |
| from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS | |
| from transformers.modeling_utils import PreTrainedModel | |
| from transformers.utils import ( | |
| add_start_docstrings, | |
| add_start_docstrings_to_model_forward, | |
| is_flash_attn_2_available, | |
| is_flash_attn_greater_or_equal_2_10, | |
| logging, | |
| replace_return_docstrings, | |
| ) | |
| from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig | |
| from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, QWEN2_5_VL_INPUTS_DOCSTRING, Qwen2_5_VLCausalLMOutputWithPast | |
# flash-attn is an optional dependency. Bind its entry points when it is
# installed; otherwise bind every one of them to None so the names always
# exist at module level and callers can test `is None` instead of hitting a
# NameError.
if is_flash_attn_2_available():
    from flash_attn import flash_attn_varlen_func
    from flash_attn.layers.rotary import apply_rotary_emb
    from transformers.modeling_flash_attention_utils import _flash_attention_forward
else:
    flash_attn_varlen_func = None
    apply_rotary_emb = None
    _flash_attention_forward = None

logger = logging.get_logger(__name__)

# Config class name referenced by docstring decorators.
_CONFIG_FOR_DOC = "Qwen2_5_VLConfig"
class Qwen25VLForEmbedding(Qwen2_5_VLForConditionalGeneration):
    """Qwen2.5-VL variant whose ``forward`` returns the decoder outputs directly.

    The forward pass embeds the text tokens, splices image/video features from
    ``self.visual`` into the placeholder-token positions, builds the 3-axis
    RoPE position ids and runs ``self.model``. Unlike the parent class, no
    language-model head is applied and no loss is computed; the result of
    ``self.model(...)`` is returned unchanged so callers can pool its hidden
    states into an embedding.
    """

    _tied_weights_keys = ["lm_head.weight"]
    config_class = Qwen2_5_VLConfig
    _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]

    def _scatter_visual_embeds(
        self,
        input_ids: torch.LongTensor,
        inputs_embeds: torch.FloatTensor,
        visual_embeds: torch.Tensor,
        placeholder_token_id: int,
        modality: str,
    ) -> torch.FloatTensor:
        """Write ``visual_embeds`` into the placeholder positions of ``inputs_embeds``.

        Args:
            input_ids: Token ids used to locate the placeholder tokens.
            inputs_embeds: Token embeddings to be (partially) overwritten.
            visual_embeds: Features from the vision tower; the size of dim 0
                must equal the number of placeholder tokens in ``input_ids``.
            placeholder_token_id: Id of the image or video placeholder token.
            modality: ``"Image"`` or ``"Video"``; only used in the error message.

        Returns:
            A new tensor shaped like ``inputs_embeds`` with the placeholder
            rows replaced by ``visual_embeds``.

        Raises:
            ValueError: If the placeholder count and feature count differ.
        """
        placeholder_mask = input_ids == placeholder_token_id
        n_tokens = placeholder_mask.sum().item()
        n_features = visual_embeds.shape[0]
        if n_tokens != n_features:
            raise ValueError(
                f"{modality} features and {modality.lower()} tokens do not match: "
                f"tokens: {n_tokens}, features {n_features}"
            )

        # Broadcast the per-token mask over the hidden dimension so that
        # masked_scatter replaces whole embedding rows.
        scatter_mask = placeholder_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        visual_embeds = visual_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
        return inputs_embeds.masked_scatter(scatter_mask, visual_embeds)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        second_per_grid_ts: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        r"""
        Run the multimodal decoder and return its outputs without applying the LM head.

        Args:
            input_ids: Token ids. Required whenever `inputs_embeds` is not given; also used to
                locate image/video placeholder tokens and to build the RoPE index.
            attention_mask: Optional mask. RoPE position ids are only derived here when it is
                `None` or 2-D.
            position_ids: Pre-computed position ids. When given they are forwarded as-is.
            past_key_values: Optional KV cache, forwarded to the decoder.
            inputs_embeds: Pre-computed input embeddings. When given, `pixel_values` and
                `pixel_values_videos` are NOT merged in; the embeddings are used unchanged.
            labels: Accepted for signature compatibility with the parent class; unused here
                (no loss is computed).
            use_cache, output_attentions, output_hidden_states, return_dict: Forwarded to the
                decoder; the latter three default to the corresponding `self.config` values.
            pixel_values: Image patches; cast to `self.visual.dtype` and encoded with
                `self.visual(..., grid_thw=image_grid_thw)`.
            pixel_values_videos: Video patches; encoded like `pixel_values` but with
                `video_grid_thw`.
            image_grid_thw, video_grid_thw, second_per_grid_ts: Passed to the vision tower
                and/or `get_rope_index`.
            rope_deltas: Accepted for signature compatibility; this method ignores the argument
                and uses the module attribute `self.rope_deltas` instead.
            cache_position: Position of the current tokens in the cache, if any.

        Returns:
            The value returned by `self.model(...)` (decoder outputs; no logits, no loss).

        Raises:
            ValueError: If the number of image (or video) placeholder tokens in `input_ids`
                differs from the number of feature rows produced by the vision tower.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids)

            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.dtype)
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                inputs_embeds = self._scatter_visual_embeds(
                    input_ids, inputs_embeds, image_embeds, self.config.image_token_id, "Image"
                )

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                inputs_embeds = self._scatter_visual_embeds(
                    input_ids, inputs_embeds, video_embeds, self.config.video_token_id, "Video"
                )

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
            is_prefill = cache_position is not None and cache_position[0] == 0
            # `self.rope_deltas` is stored on the module and therefore survives between
            # calls. Without this check, every cache-less forward after the first one
            # (the normal case when embedding independent batches) would skip
            # `get_rope_index` and silently fall back to plain sequential positions.
            cache_is_empty = past_key_values is None or (
                hasattr(past_key_values, "get_seq_length") and past_key_values.get_seq_length() == 0
            )
            if is_prefill or self.rope_deltas is None or cache_is_empty:
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids,
                    image_grid_thw,
                    video_grid_thw,
                    second_per_grid_ts,
                    attention_mask,
                )
                self.rope_deltas = rope_deltas
            # then use the prev pre-calculated rope-deltas to get the correct position ids
            else:
                batch_size, seq_length, _ = inputs_embeds.shape
                delta = (
                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
                    if cache_position is not None
                    else 0
                )
                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                if cache_position is not None:  # otherwise `delta` is an int `0`
                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
                position_ids = position_ids.add(delta)
                # Same positions on each of the 3 RoPE axes.
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)

        # `input_ids` is deliberately None: the (possibly vision-augmented)
        # embeddings are passed instead.
        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        return outputs