Update README.md
README.md CHANGED
@@ -13,23 +13,30 @@ inference: false
 # CogVLM2-Llama3-Caption
 
 <div align="center">
 <img src=https://raw.githubusercontent.com/THUDM/CogVLM2/cf9cb3c60a871e0c8e5bde7feaf642e3021153e6/resources/logo.svg>
 </div>
 
+
 # Introduction
 
 Typically, most video data does not come with corresponding descriptive text, so it is necessary to convert the video
 data into textual descriptions to provide the essential training data for text-to-video models.
+CogVLM2-Caption is a video captioning model used to generate training data for the CogVideoX model.
+
+<div align="center">
+<img width="600px" height="auto" src="./CogVLM2-Caption-example.png">
+</div>
 
 ## Usage
 
 ```python
 import io
+
+import argparse
 import numpy as np
 import torch
 from decord import cpu, VideoReader, bridge
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import argparse
 
 MODEL_PATH = "THUDM/cogvlm2-llama3-caption"
 
@@ -77,7 +84,6 @@ def load_video(video_data, strategy='chat'):
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_PATH,
     trust_remote_code=True,
-    # padding_side="left"
 )
 
 model = AutoModelForCausalLM.from_pretrained(
@@ -132,7 +138,6 @@ def test():
 
 if __name__ == '__main__':
     test()
-
 ```
 
 ## License
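The hunks above only show the changed parts of the Usage snippet, so the surrounding code is easy to lose track of. Below is a minimal sketch of how the imports, the `load_video`-style frame sampling, and the tokenizer/model loading shown in the hunks fit together. The uniform 24-frame sampling, the bfloat16 dtype, the `cuda` device, and the `demo.mp4` path are illustrative assumptions, not the model card's exact settings; the card's own `load_video` selects frames according to its `chat`/`base` strategy, and caption generation goes through the checkpoint's `trust_remote_code` conversation API, which is omitted here.

```python
import io

import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/cogvlm2-llama3-caption"


def sample_frames(video_bytes, num_frames=24):
    """Decode mp4 bytes with decord and return a (C, T, H, W) frame tensor.

    Uniform sampling is an assumption for brevity; the model card's
    load_video picks frames according to its 'chat'/'base' strategy.
    """
    bridge.set_bridge('torch')                       # have decord return torch tensors
    vr = VideoReader(io.BytesIO(video_bytes), ctx=cpu(0))
    indices = np.linspace(0, len(vr) - 1, num_frames, dtype=int)
    frames = vr.get_batch(indices)                   # (T, H, W, C)
    return frames.permute(3, 0, 1, 2)                # channel-first video layout


tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,                      # assumption: bf16 weights on a CUDA GPU
    trust_remote_code=True,
).eval().to('cuda')

with open('demo.mp4', 'rb') as f:                    # hypothetical local clip
    video = sample_frames(f.read())

# Prompt construction and model.generate(...) follow the checkpoint's
# remote-code conversation API, as in the full Usage section of the README.
```

Reading the clip as raw bytes mirrors the `video_data` argument of `load_video(video_data, strategy='chat')` referenced in the second hunk's header.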