jadechoghari HF Staff commited on
Commit
0a03396
·
verified ·
1 Parent(s): b303146

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets: unknown
3
+ library_name: lerobot
4
+ license: apache-2.0
5
+ model_name: xvla
6
+ pipeline_tag: robotics
7
+ tags:
8
+ - robotics
9
+ - lerobot
10
+ - xvla
11
+ ---
12
+
13
+ # Model Card for xvla
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+ _Model type not recognized — please update this template._
19
+
20
+
21
+ This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
22
+ See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).
23
+
24
+ ---
25
+
26
+ ## How to Get Started with the Model
27
+
28
+ For a complete walkthrough, see the [training guide](https://huggingface.co/docs/lerobot/il_robots#train-a-policy).
29
+ Below is the short version on how to train and run inference/eval:
30
+
31
+ ### Train from scratch
32
+
33
+ ```bash
34
+ lerobot-train \
35
+ --dataset.repo_id=${HF_USER}/<dataset> \
36
+ --policy.type=xvla \
37
+ --output_dir=outputs/train/<desired_policy_repo_id> \
38
+ --job_name=lerobot_training \
39
+ --policy.device=cuda \
40
+ --policy.repo_id=${HF_USER}/<desired_policy_repo_id> \
41
+ --wandb.enable=true
42
+ ```
43
+
44
+ _Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`._
45
+
46
+ ### Evaluate the policy/run inference
47
+
48
+ ```bash
49
+ lerobot-record \
50
+ --robot.type=so100_follower \
51
+ --dataset.repo_id=<hf_user>/eval_<dataset> \
52
+ --policy.path=<hf_user>/<desired_policy_repo_id> \
53
+ --episodes=10
54
+ ```
55
+
56
+ Prefix the dataset repo with **eval\_** and supply `--policy.path` pointing to a local or hub checkpoint.
57
+
58
+ ---
59
+
60
+ ## Model Details
61
+
62
+ - **License:** apache-2.0
config.json ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "xvla",
3
+ "n_obs_steps": 1,
4
+ "input_features": {
5
+ "observation.images.image": {
6
+ "type": "VISUAL",
7
+ "shape": [
8
+ 3,
9
+ 256,
10
+ 256
11
+ ]
12
+ },
13
+ "observation.images.image2": {
14
+ "type": "VISUAL",
15
+ "shape": [
16
+ 3,
17
+ 256,
18
+ 256
19
+ ]
20
+ },
21
+ "observation.state": {
22
+ "type": "STATE",
23
+ "shape": [
24
+ 8
25
+ ]
26
+ },
27
+ "observation.images.empty_camera_0": {
28
+ "type": "VISUAL",
29
+ "shape": [
30
+ 3,
31
+ 224,
32
+ 224
33
+ ]
34
+ }
35
+ },
36
+ "output_features": {
37
+ "action": {
38
+ "type": "ACTION",
39
+ "shape": [
40
+ 20
41
+ ]
42
+ }
43
+ },
44
+ "device": "cuda",
45
+ "use_amp": false,
46
+ "push_to_hub": true,
47
+ "repo_id": null,
48
+ "private": null,
49
+ "tags": null,
50
+ "license": null,
51
+ "pretrained_path": null,
52
+ "chunk_size": 30,
53
+ "n_action_steps": 30,
54
+ "normalization_mapping": {
55
+ "STATE": "IDENTITY",
56
+ "ACTION": "IDENTITY",
57
+ "VISUAL": "IDENTITY"
58
+ },
59
+ "florence_config": {
60
+ "_attn_implementation_autoset": true,
61
+ "bos_token_id": 0,
62
+ "eos_token_id": 2,
63
+ "ignore_index": -100,
64
+ "is_encoder_decoder": true,
65
+ "model_type": "florence2",
66
+ "pad_token_id": 1,
67
+ "projection_dim": 1024,
68
+ "text_config": {
69
+ "_attn_implementation_autoset": true,
70
+ "_name_or_path": "",
71
+ "activation_dropout": 0.1,
72
+ "activation_function": "gelu",
73
+ "add_cross_attention": false,
74
+ "architectures": null,
75
+ "attention_dropout": 0.1,
76
+ "bad_words_ids": null,
77
+ "begin_suppress_tokens": null,
78
+ "bos_token_id": 0,
79
+ "chunk_size_feed_forward": 0,
80
+ "classifier_dropout": 0.0,
81
+ "cross_attention_hidden_size": null,
82
+ "d_model": 1024,
83
+ "decoder_attention_heads": 16,
84
+ "decoder_ffn_dim": 4096,
85
+ "decoder_layerdrop": 0.0,
86
+ "decoder_layers": 12,
87
+ "decoder_start_token_id": 2,
88
+ "diversity_penalty": 0.0,
89
+ "do_sample": false,
90
+ "dropout": 0.1,
91
+ "early_stopping": false,
92
+ "encoder_attention_heads": 16,
93
+ "encoder_ffn_dim": 4096,
94
+ "encoder_layerdrop": 0.0,
95
+ "encoder_layers": 12,
96
+ "encoder_no_repeat_ngram_size": 0,
97
+ "eos_token_id": 2,
98
+ "exponential_decay_length_penalty": null,
99
+ "finetuning_task": null,
100
+ "forced_bos_token_id": null,
101
+ "forced_eos_token_id": 2,
102
+ "id2label": {
103
+ "0": "LABEL_0",
104
+ "1": "LABEL_1",
105
+ "2": "LABEL_2"
106
+ },
107
+ "init_std": 0.02,
108
+ "is_decoder": false,
109
+ "is_encoder_decoder": true,
110
+ "label2id": {
111
+ "LABEL_0": 0,
112
+ "LABEL_1": 1,
113
+ "LABEL_2": 2
114
+ },
115
+ "length_penalty": 1.0,
116
+ "max_length": 20,
117
+ "max_position_embeddings": 4096,
118
+ "min_length": 0,
119
+ "model_type": "florence2_language",
120
+ "no_repeat_ngram_size": 0,
121
+ "num_beam_groups": 1,
122
+ "num_beams": 3,
123
+ "num_hidden_layers": 12,
124
+ "num_return_sequences": 1,
125
+ "output_attentions": false,
126
+ "output_hidden_states": false,
127
+ "output_scores": false,
128
+ "pad_token_id": 1,
129
+ "prefix": null,
130
+ "problem_type": null,
131
+ "pruned_heads": {},
132
+ "remove_invalid_values": false,
133
+ "repetition_penalty": 1.0,
134
+ "return_dict": true,
135
+ "return_dict_in_generate": false,
136
+ "scale_embedding": false,
137
+ "sep_token_id": null,
138
+ "suppress_tokens": null,
139
+ "task_specific_params": null,
140
+ "temperature": 1.0,
141
+ "tf_legacy_loss": false,
142
+ "tie_encoder_decoder": false,
143
+ "tie_word_embeddings": true,
144
+ "tokenizer_class": null,
145
+ "top_k": 50,
146
+ "top_p": 1.0,
147
+ "torch_dtype": null,
148
+ "torchscript": false,
149
+ "typical_p": 1.0,
150
+ "use_bfloat16": false,
151
+ "use_cache": true,
152
+ "vocab_size": 51289
153
+ },
154
+ "torch_dtype": "float32",
155
+ "vision_config": {
156
+ "_attn_implementation_autoset": false,
157
+ "_name_or_path": "",
158
+ "add_cross_attention": false,
159
+ "architectures": null,
160
+ "bad_words_ids": null,
161
+ "begin_suppress_tokens": null,
162
+ "bos_token_id": null,
163
+ "chunk_size_feed_forward": 0,
164
+ "cross_attention_hidden_size": null,
165
+ "decoder_start_token_id": null,
166
+ "depths": [
167
+ 1,
168
+ 1,
169
+ 9,
170
+ 1
171
+ ],
172
+ "dim_embed": [
173
+ 256,
174
+ 512,
175
+ 1024,
176
+ 2048
177
+ ],
178
+ "diversity_penalty": 0.0,
179
+ "do_sample": false,
180
+ "drop_path_rate": 0.1,
181
+ "early_stopping": false,
182
+ "enable_checkpoint": false,
183
+ "encoder_no_repeat_ngram_size": 0,
184
+ "eos_token_id": null,
185
+ "exponential_decay_length_penalty": null,
186
+ "finetuning_task": null,
187
+ "forced_bos_token_id": null,
188
+ "forced_eos_token_id": null,
189
+ "id2label": {
190
+ "0": "LABEL_0",
191
+ "1": "LABEL_1"
192
+ },
193
+ "image_feature_source": [
194
+ "spatial_avg_pool",
195
+ "temporal_avg_pool"
196
+ ],
197
+ "image_pos_embed": {
198
+ "max_pos_embeddings": 50,
199
+ "type": "learned_abs_2d"
200
+ },
201
+ "is_decoder": false,
202
+ "is_encoder_decoder": false,
203
+ "label2id": {
204
+ "LABEL_0": 0,
205
+ "LABEL_1": 1
206
+ },
207
+ "length_penalty": 1.0,
208
+ "max_length": 20,
209
+ "min_length": 0,
210
+ "model_type": "davit",
211
+ "no_repeat_ngram_size": 0,
212
+ "num_beam_groups": 1,
213
+ "num_beams": 1,
214
+ "num_groups": [
215
+ 8,
216
+ 16,
217
+ 32,
218
+ 64
219
+ ],
220
+ "num_heads": [
221
+ 8,
222
+ 16,
223
+ 32,
224
+ 64
225
+ ],
226
+ "num_return_sequences": 1,
227
+ "output_attentions": false,
228
+ "output_hidden_states": false,
229
+ "output_scores": false,
230
+ "pad_token_id": null,
231
+ "patch_padding": [
232
+ 3,
233
+ 1,
234
+ 1,
235
+ 1
236
+ ],
237
+ "patch_prenorm": [
238
+ false,
239
+ true,
240
+ true,
241
+ true
242
+ ],
243
+ "patch_size": [
244
+ 7,
245
+ 3,
246
+ 3,
247
+ 3
248
+ ],
249
+ "patch_stride": [
250
+ 4,
251
+ 2,
252
+ 2,
253
+ 2
254
+ ],
255
+ "prefix": null,
256
+ "problem_type": null,
257
+ "projection_dim": 1024,
258
+ "pruned_heads": {},
259
+ "remove_invalid_values": false,
260
+ "repetition_penalty": 1.0,
261
+ "return_dict": true,
262
+ "return_dict_in_generate": false,
263
+ "sep_token_id": null,
264
+ "suppress_tokens": null,
265
+ "task_specific_params": null,
266
+ "temperature": 1.0,
267
+ "tf_legacy_loss": false,
268
+ "tie_encoder_decoder": false,
269
+ "tie_word_embeddings": true,
270
+ "tokenizer_class": null,
271
+ "top_k": 50,
272
+ "top_p": 1.0,
273
+ "torch_dtype": null,
274
+ "torchscript": false,
275
+ "typical_p": 1.0,
276
+ "use_bfloat16": false,
277
+ "visual_temporal_embedding": {
278
+ "max_temporal_embeddings": 100,
279
+ "type": "COSINE"
280
+ },
281
+ "window_size": 12
282
+ },
283
+ "vocab_size": 51289
284
+ },
285
+ "tokenizer_name": "facebook/bart-large",
286
+ "tokenizer_max_length": 1024,
287
+ "tokenizer_padding_side": "right",
288
+ "pad_language_to": "max_length",
289
+ "hidden_size": 1024,
290
+ "depth": 24,
291
+ "num_heads": 16,
292
+ "mlp_ratio": 4.0,
293
+ "num_domains": 30,
294
+ "len_soft_prompts": 32,
295
+ "dim_time": 32,
296
+ "max_len_seq": 512,
297
+ "use_hetero_proj": false,
298
+ "action_mode": "ee6d",
299
+ "num_denoising_steps": 10,
300
+ "use_proprio": true,
301
+ "max_state_dim": 20,
302
+ "domain_feature_key": null,
303
+ "resize_imgs_with_padding": [
304
+ 224,
305
+ 224
306
+ ],
307
+ "num_image_views": 3,
308
+ "empty_cameras": 1,
309
+ "optimizer_lr": 0.0001,
310
+ "optimizer_betas": [
311
+ 0.9,
312
+ 0.95
313
+ ],
314
+ "optimizer_eps": 1e-08,
315
+ "optimizer_weight_decay": 0.0001,
316
+ "optimizer_grad_clip_norm": 10.0,
317
+ "scheduler_warmup_steps": 1000,
318
+ "scheduler_decay_steps": 30000,
319
+ "scheduler_decay_lr": 2.5e-06
320
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a0e29448111df56b9485cb1f964db799205794c49baeda48ef960589ce649ab
3
+ size 3519073692
policy_postprocessor.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "policy_postprocessor",
3
+ "steps": [
4
+ {
5
+ "registry_name": "unnormalizer_processor",
6
+ "config": {
7
+ "eps": 1e-08,
8
+ "features": {
9
+ "action": {
10
+ "type": "ACTION",
11
+ "shape": [
12
+ 20
13
+ ]
14
+ }
15
+ },
16
+ "norm_map": {
17
+ "VISUAL": "MEAN_STD",
18
+ "STATE": "IDENTITY",
19
+ "ACTION": "IDENTITY"
20
+ }
21
+ }
22
+ },
23
+ {
24
+ "registry_name": "device_processor",
25
+ "config": {
26
+ "device": "cpu",
27
+ "float_dtype": null
28
+ }
29
+ }
30
+ ]
31
+ }
policy_preprocessor.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "policy_preprocessor",
3
+ "steps": [
4
+ {
5
+ "registry_name": "rename_observations_processor",
6
+ "config": {
7
+ "rename_map": {}
8
+ }
9
+ },
10
+ {
11
+ "registry_name": "to_batch_processor",
12
+ "config": {}
13
+ },
14
+ {
15
+ "registry_name": "tokenizer_processor",
16
+ "config": {
17
+ "max_length": 50,
18
+ "task_key": "task",
19
+ "padding_side": "right",
20
+ "padding": "max_length",
21
+ "truncation": true,
22
+ "tokenizer_name": "facebook/bart-large"
23
+ }
24
+ },
25
+ {
26
+ "registry_name": "xvla_add_domain_id",
27
+ "config": {
28
+ "domain_id": 3
29
+ }
30
+ },
31
+ {
32
+ "registry_name": "device_processor",
33
+ "config": {
34
+ "device": "cuda",
35
+ "float_dtype": null
36
+ }
37
+ },
38
+ {
39
+ "registry_name": "normalizer_processor",
40
+ "config": {
41
+ "eps": 1e-08,
42
+ "features": {
43
+ "observation.images.image": {
44
+ "type": "VISUAL",
45
+ "shape": [
46
+ 3,
47
+ 224,
48
+ 224
49
+ ]
50
+ },
51
+ "observation.images.image2": {
52
+ "type": "VISUAL",
53
+ "shape": [
54
+ 3,
55
+ 224,
56
+ 224
57
+ ]
58
+ },
59
+ "observation.state": {
60
+ "type": "STATE",
61
+ "shape": [
62
+ 8
63
+ ]
64
+ },
65
+ "action": {
66
+ "type": "ACTION",
67
+ "shape": [
68
+ 20
69
+ ]
70
+ }
71
+ },
72
+ "norm_map": {
73
+ "VISUAL": "IDENTITY",
74
+ "STATE": "IDENTITY",
75
+ "ACTION": "IDENTITY"
76
+ }
77
+ }
78
+ }
79
+ ]
80
+ }