bpiyush
/

TARA

@@ -66,6 +66,37 @@ See the script at [demo_usage.py](demo_usage.py) for a quick start. You can run
 python demo_usage.py
 ```
 ## Citation
 If you use this model, please cite:

 python demo_usage.py
 ```
+Alternatively, use the snippet below:
+```python
+import torch
+from modeling_tara import TARA, read_frames_decord
+model = TARA.from_pretrained(
+    ".",  # Load from current directory
+    device_map='auto',
+    torch_dtype=torch.bfloat16,
+)
+n_params = sum(p.numel() for p in model.model.parameters())
+print(f"Number of parameters: {round(n_params/1e9, 3)}B")
+# Embed a video
+video_path = "./assets/folding_paper.mp4"
+video_tensor = read_frames_decord(video_path, num_frames=16)
+video_tensor = video_tensor.unsqueeze(0)
+video_tensor = video_tensor.to(model.model.device)
+with torch.no_grad():
+    video_emb = model.encode_vision(video_tensor).cpu().squeeze(0).float()
+print(f"Video shape: {video_tensor.shape}")  # torch.Size([1, 16, 3, 240, 426])
+print(f"Video embedding shape: {video_emb.shape}")  # torch.Size([4096])
+# Embed a batch of texts
+text = ['someone is folding a paper', 'cutting a paper', 'someone is folding a paper']
+with torch.no_grad():
+    text_emb = model.encode_text(text).cpu().float()
+print(f"Text embedding shape: {text_emb.shape}")  # torch.Size([3, 4096])
+```
 ## Citation
 If you use this model, please cite: