import os from abc import ABCMeta, abstractmethod from typing import Optional, Union, Dict, List from termcolor import colored import random import numpy as np import torch from transformers import ( AutoProcessor, AutoTokenizer, LlavaConfig, LlamaForCausalLM, ) from torchvision.transforms.v2 import ( ToPILImage, ) import decord from decord import VideoReader decord.bridge.set_bridge("torch") # TODO: need to use these directly from tarsier.modeling_tarsier import TarsierForConditionalGeneration from tarsier.processor import Processor # from utils.model import transform_pixel_values EOL_PROMPTS = { 'text': '\nSummary above sentence in one word:', 'image': '\nSummary above image in one word:', 'video': '