rkazants committed on
Commit
2934fba
·
verified ·
1 Parent(s): 20ca9a3

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json CHANGED
@@ -1,28 +1,36 @@
1
  {
2
- "_name_or_path": "openbmb/MiniCPM-o-2_6",
3
  "architectures": [
4
  "MiniCPMO"
5
  ],
6
  "attention_dropout": 0.0,
7
  "audio_chunk_length": 1.0,
8
  "audio_config": {
 
9
  "_name_or_path": "openai/whisper-medium",
 
 
 
10
  "architectures": [
11
  "MiniCPMWhisperEncoder"
12
  ],
 
13
  "begin_suppress_tokens": [
14
  220,
15
  50257
16
  ],
17
  "bos_token_id": 50257,
 
18
  "d_model": 1024,
19
  "decoder_attention_heads": 16,
20
- "decoder_ffn_dim": 4096,
21
- "decoder_layers": 24,
 
22
  "decoder_start_token_id": 50258,
 
23
  "encoder_attention_heads": 16,
24
  "encoder_ffn_dim": 4096,
25
- "encoder_layers": 2,
 
26
  "eos_token_id": 50257,
27
  "forced_decoder_ids": [
28
  [
@@ -38,10 +46,22 @@
38
  50363
39
  ]
40
  ],
 
 
 
 
 
 
 
41
  "max_length": 448,
 
 
 
42
  "model_type": "whisper",
43
  "num_hidden_layers": 24,
 
44
  "pad_token_id": 50257,
 
45
  "suppress_tokens": [
46
  1,
47
  2,
@@ -132,7 +152,11 @@
132
  50361,
133
  50362
134
  ],
135
- "torch_dtype": "float32"
 
 
 
 
136
  },
137
  "audio_pool_step": 2,
138
  "auto_map": {
@@ -146,39 +170,68 @@
146
  "drop_vision_last_layer": false,
147
  "eos_token_id": 151645,
148
  "hidden_act": "silu",
149
- "hidden_size": 168,
150
  "image_size": 448,
151
  "init_audio": true,
152
  "init_tts": true,
153
  "init_vision": true,
154
  "initializer_range": 0.02,
155
- "intermediate_size": 32,
156
  "listen_speak_type": "asr",
157
  "max_position_embeddings": 32768,
158
  "max_window_layers": 28,
159
  "model_type": "minicpmo",
160
  "num_attention_heads": 28,
161
- "num_heads": 2,
162
- "num_hidden_layers": 2,
163
  "num_key_value_heads": 4,
164
  "patch_size": 14,
165
  "query_num": 64,
166
  "rms_norm_eps": 1e-06,
 
167
  "rope_theta": 1000000.0,
168
  "slice_config": {
169
  "max_slice_nums": 9,
170
- "model_type": "minicpmv"
 
 
171
  },
172
  "slice_mode": true,
173
- "sliding_window": null,
174
  "stream_input": false,
175
  "tie_word_embeddings": false,
176
- "torch_dtype": "float32",
177
- "transformers_version": "4.44.2",
178
  "tts_config": {
179
- "hidden_size": 24,
180
- "llm_dim": 32,
181
- "model_type": "conditional_chattts"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  },
183
  "use_cache": true,
184
  "use_image_id": true,
@@ -186,11 +239,16 @@
186
  "version": 2.6,
187
  "vision_batch_size": 16,
188
  "vision_config": {
189
- "hidden_size": 32,
190
- "image_size": 224,
191
- "intermediate_size": 32,
 
 
 
 
192
  "model_type": "siglip_vision_model",
193
- "num_attention_heads": 2,
 
194
  "num_hidden_layers": 1,
195
  "patch_size": 14
196
  },
 
1
  {
 
2
  "architectures": [
3
  "MiniCPMO"
4
  ],
5
  "attention_dropout": 0.0,
6
  "audio_chunk_length": 1.0,
7
  "audio_config": {
8
+ "_attn_implementation_autoset": true,
9
  "_name_or_path": "openai/whisper-medium",
10
+ "activation_dropout": 0.0,
11
+ "activation_function": "gelu",
12
+ "apply_spec_augment": false,
13
  "architectures": [
14
  "MiniCPMWhisperEncoder"
15
  ],
16
+ "attention_dropout": 0.0,
17
  "begin_suppress_tokens": [
18
  220,
19
  50257
20
  ],
21
  "bos_token_id": 50257,
22
+ "classifier_proj_size": 256,
23
  "d_model": 1024,
24
  "decoder_attention_heads": 16,
25
+ "decoder_ffn_dim": 1024,
26
+ "decoder_layerdrop": 0.0,
27
+ "decoder_layers": 1,
28
  "decoder_start_token_id": 50258,
29
+ "dropout": 0.0,
30
  "encoder_attention_heads": 16,
31
  "encoder_ffn_dim": 4096,
32
+ "encoder_layerdrop": 0.0,
33
+ "encoder_layers": 1,
34
  "eos_token_id": 50257,
35
  "forced_decoder_ids": [
36
  [
 
46
  50363
47
  ]
48
  ],
49
+ "init_std": 0.02,
50
+ "mask_feature_length": 10,
51
+ "mask_feature_min_masks": 0,
52
+ "mask_feature_prob": 0.0,
53
+ "mask_time_length": 10,
54
+ "mask_time_min_masks": 2,
55
+ "mask_time_prob": 0.05,
56
  "max_length": 448,
57
+ "max_source_positions": 1500,
58
+ "max_target_positions": 448,
59
+ "median_filter_width": 7,
60
  "model_type": "whisper",
61
  "num_hidden_layers": 24,
62
+ "num_mel_bins": 80,
63
  "pad_token_id": 50257,
64
+ "scale_embedding": false,
65
  "suppress_tokens": [
66
  1,
67
  2,
 
152
  50361,
153
  50362
154
  ],
155
+ "torch_dtype": "float32",
156
+ "use_bfloat16": true,
157
+ "use_cache": true,
158
+ "use_weighted_layer_sum": false,
159
+ "vocab_size": 51865
160
  },
161
  "audio_pool_step": 2,
162
  "auto_map": {
 
170
  "drop_vision_last_layer": false,
171
  "eos_token_id": 151645,
172
  "hidden_act": "silu",
173
+ "hidden_size": 128,
174
  "image_size": 448,
175
  "init_audio": true,
176
  "init_tts": true,
177
  "init_vision": true,
178
  "initializer_range": 0.02,
179
+ "intermediate_size": 16,
180
  "listen_speak_type": "asr",
181
  "max_position_embeddings": 32768,
182
  "max_window_layers": 28,
183
  "model_type": "minicpmo",
184
  "num_attention_heads": 28,
185
+ "num_heads": 1,
186
+ "num_hidden_layers": 1,
187
  "num_key_value_heads": 4,
188
  "patch_size": 14,
189
  "query_num": 64,
190
  "rms_norm_eps": 1e-06,
191
+ "rope_scaling": null,
192
  "rope_theta": 1000000.0,
193
  "slice_config": {
194
  "max_slice_nums": 9,
195
+ "model_type": "minicpmv",
196
+ "patch_size": 14,
197
+ "scale_resolution": 448
198
  },
199
  "slice_mode": true,
200
+ "sliding_window": 131072,
201
  "stream_input": false,
202
  "tie_word_embeddings": false,
203
+ "torch_dtype": "bfloat16",
204
+ "transformers_version": "4.50.0",
205
  "tts_config": {
206
+ "_attn_implementation_autoset": true,
207
+ "attn_implementation": "sdpa",
208
+ "audio_bos_token_id": 21132,
209
+ "aug_loss_weight": true,
210
+ "hidden_size": 8,
211
+ "intermediate_size": 4,
212
+ "llm_dim": 4,
213
+ "max_position_embeddings": 4096,
214
+ "model_type": "conditional_chattts",
215
+ "num_attention_heads": 1,
216
+ "num_audio_tokens": 10,
217
+ "num_heads": 1,
218
+ "num_hidden_layers": 1,
219
+ "num_layers": 1,
220
+ "num_mel_bins": 10,
221
+ "num_spk_embs": 1,
222
+ "num_text_tokens": 20,
223
+ "num_vq": 4,
224
+ "spk_emb_token_id": 21143,
225
+ "streaming": true,
226
+ "streaming_audio_chunk_size": 50,
227
+ "streaming_text_chunk_size": 10,
228
+ "streaming_text_reserved_len": 300,
229
+ "text_eos_token_id": 21133,
230
+ "use_bfloat16": true,
231
+ "use_llm_hidden_state": false,
232
+ "use_mlp": true,
233
+ "use_speaker_embedding": true,
234
+ "use_text": true
235
  },
236
  "use_cache": true,
237
  "use_image_id": true,
 
239
  "version": 2.6,
240
  "vision_batch_size": 16,
241
  "vision_config": {
242
+ "_attn_implementation_autoset": true,
243
+ "attention_dropout": 0.0,
244
+ "hidden_act": "gelu_pytorch_tanh",
245
+ "hidden_size": 8,
246
+ "image_size": 100,
247
+ "intermediate_size": 8,
248
+ "layer_norm_eps": 1e-06,
249
  "model_type": "siglip_vision_model",
250
+ "num_attention_heads": 1,
251
+ "num_channels": 3,
252
  "num_hidden_layers": 1,
253
  "patch_size": 14
254
  },
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 151643,
4
  "eos_token_id": 151645,
5
- "transformers_version": "4.44.2"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 151643,
4
  "eos_token_id": 151645,
5
+ "transformers_version": "4.50.0"
6
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e78bc28359ce3fdf08cfc2dc701d4903c34ebb5943e12decb3d5ce6ad9901f80
3
- size 410265248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef2070dfe835c58851cfdb0911b4333a20d2be6445f18b8ae14a0c7994eb34aa
3
+ size 143909968
preprocessor_config.json CHANGED
@@ -4,6 +4,7 @@
4
  "AutoProcessor": "openbmb/MiniCPM-o-2_6--processing_minicpmo.MiniCPMOProcessor"
5
  },
6
  "chunk_length": 30,
 
7
  "feature_extractor_type": "WhisperFeatureExtractor",
8
  "feature_size": 80,
9
  "hop_length": 160,
 
4
  "AutoProcessor": "openbmb/MiniCPM-o-2_6--processing_minicpmo.MiniCPMOProcessor"
5
  },
6
  "chunk_length": 30,
7
+ "dither": 0.0,
8
  "feature_extractor_type": "WhisperFeatureExtractor",
9
  "feature_size": 80,
10
  "hop_length": 160,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -515,8 +515,10 @@
515
  "clean_up_tokenization_spaces": false,
516
  "eos_token": "<|im_end|>",
517
  "errors": "replace",
 
518
  "model_max_length": 131072,
519
  "pad_token": "<|endoftext|>",
 
520
  "split_special_tokens": false,
521
  "tokenizer_class": "MiniCPMOTokenizer",
522
  "unk_token": "<unk>"
 
515
  "clean_up_tokenization_spaces": false,
516
  "eos_token": "<|im_end|>",
517
  "errors": "replace",
518
+ "extra_special_tokens": {},
519
  "model_max_length": 131072,
520
  "pad_token": "<|endoftext|>",
521
+ "processor_class": "MiniCPMOProcessor",
522
  "split_special_tokens": false,
523
  "tokenizer_class": "MiniCPMOTokenizer",
524
  "unk_token": "<unk>"