---

LLaVA-Qwen1.5-1.8b model trained with LoRA on a subset of Vista Vi LLaVA Complex Reasoning.

Loss: ~1.5

Training script:
```bash
deepspeed moellava/train/train_mem.py \
    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 0.00000125 \
    --lora_path /kaggle/temp/lora-llavaqwen \
    --deepspeed ./scripts/zero3.json \
    --model_name_or_path Qwen/Qwen1.5-1.8B \
    --version qwen \
    --data_path /kaggle/temp/vi_llava_train.json \
    --image_folder /kaggle/input/coco-2017-dataset/coco2017/train2017 \
    --image_tower google/siglip-base-patch16-256-multilingual \
    --image_projector_type mlp2x_gelu \
    --pretrain_mm_mlp_adapter /kaggle/temp/pt-llavaqwen1.5-1.8b/mm_projector.bin \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_aspect_ratio pad \
    --group_by_modality_length True \
    --fp16 True \
    --output_dir ./checkpoints/ft-lora-llavaqwen1.5-1.8b-complex_reasoning \
    --num_train_epochs 1 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0 \
    --lr_scheduler_type "cosine" \
    --logging_steps 5 \
    --tf32 False \
    --model_max_length 1024 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb \
    --run_name ft-llava-qwen1.5-1.8b-lora-vista_reasoning-cont \
    --push_to_hub True
```
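
With these flags, the effective global batch size is per_device_train_batch_size × gradient_accumulation_steps × the number of GPUs deepspeed launches on. A minimal sketch of that arithmetic, assuming a 2-GPU machine (the world size is not stated in the command):

```python
# Effective global batch size implied by the training command above.
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
num_gpus = 2  # assumption: the actual number of GPUs is not stated in the command

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch_size)  # 32 under the assumed 2-GPU setup
```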

Python code to merge the LoRA adapter:
```python
from typing import List, Optional

import torch
import transformers
from peft import PeftModel

from moellava.model import LlavaQwen1_5ForCausalLM


# Argument containers mirroring moellava's training dataclasses.
class ModelArguments:
    model_name_or_path: Optional[str] = "facebook/opt-125m"
    version: Optional[str] = "v0"
    freeze_backbone: bool = False
    tune_mm_mlp_adapter: bool = False
    mm_vision_select_layer: Optional[int] = -1  # default to the last layer
    pretrain_mm_mlp_adapter: Optional[str] = None
    mm_use_im_start_end: bool = False
    mm_use_im_patch_token: bool = True
    mm_vision_select_feature: Optional[str] = "patch"
    # ===================================================================
    image_tower: Optional[str] = 'google/siglip-base-patch16-256-multilingual'
    video_tower: Optional[str] = None
    image_projector_type: Optional[str] = 'linear'
    video_projector_type: Optional[str] = 'linear'
    video_global_proj: bool = False
    video_temproal_proj: bool = False
    video_spatial_proj: bool = False
    # ===================================================================

    # =============================================================
    only_lora_ffn: bool = True
    moe_enable: bool = False
    train_modules: Optional[List[str]] = None
    moe_mode: str = "sparse"
    moe_layers_idx: Optional[List[int]] = None
    ep_size: int = 1
    num_experts: Optional[List[int]] = 4
    top_k_experts: int = 2
    capacity_factor: float = 1.
    eval_capacity_factor: float = 2.
    min_capacity: int = 0
    use_residual: bool = False
    router_aux_loss_coef: float = 0.01


class DataArguments:
    lazy_preprocess: bool = False
    is_multimodal: bool = False
    image_aspect_ratio: str = 'pad'
    # ===================================================================
    data_path: Optional[List[str]] = None
    image_folder: Optional[str] = None
    video_folder: Optional[str] = None
    num_frames: int = 8


model_args = ModelArguments()
data_args = DataArguments()

model_name_or_path = 'Qwen/Qwen1.5-1.8B'
lora_path = 'llavaqwen1.5-lora'
# Point the argument container at the actual base model (the class default is the
# placeholder "facebook/opt-125m"), so the tokenizer below matches Qwen1.5.
model_args.model_name_or_path = model_name_or_path

# Load the base language model and attach the trained LoRA adapter.
model = LlavaQwen1_5ForCausalLM.from_pretrained(
    model_name_or_path,
)
model.to(torch.float16)
model = PeftModel.from_pretrained(model, lora_path)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    model_max_length=1024,
    padding_side="right",
    use_fast=False,
)
tokenizer.add_special_tokens({'unk_token': '<|extra_0|>'})

# Rebuild the vision modules (SigLIP image tower + projector) on the wrapped model.
model.get_model().initialize_vision_modules(
    model_args=model_args,
)

image_tower = model.get_image_tower()
image_tower.to(dtype=torch.float16)

data_args.image_processor = image_tower.image_processor
data_args.is_multimodal = True

model.config.image_aspect_ratio = data_args.image_aspect_ratio
model.config.tokenizer_padding_side = tokenizer.padding_side

model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)

# Fold the LoRA weights into the base model and save a standalone checkpoint.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("llava-qwen1.5-1.8b-complex_reasoning-merged")
```
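
To make the merged checkpoint self-contained, the tokenizer can be saved next to the weights, and both can be pushed to the Hub with the standard `save_pretrained` / `push_to_hub` calls from transformers. A minimal sketch continuing from the snippet above; the repo id `your-username/llava-qwen1.5-1.8b-complex_reasoning` is a placeholder:

```python
# Save the tokenizer next to the merged weights so the folder loads standalone.
tokenizer.save_pretrained("llava-qwen1.5-1.8b-complex_reasoning-merged")

# Optionally publish the merged model and tokenizer (repo id is a placeholder).
merged_model.push_to_hub("your-username/llava-qwen1.5-1.8b-complex_reasoning")
tokenizer.push_to_hub("your-username/llava-qwen1.5-1.8b-complex_reasoning")
```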