burtenshaw HF Staff commited on
Commit
2d26ed8
·
verified ·
1 Parent(s): c183265

Upload nanochat_based20.ipynb

Browse files
Files changed (1) hide show
  1. nanochat_based20.ipynb +887 -0
nanochat_based20.ipynb ADDED
@@ -0,0 +1,887 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "A100"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU",
17
+ "widgets": {
18
+ "application/vnd.jupyter.widget-state+json": {
19
+ "a0a81cb728564e88aee25e748b11133a": {
20
+ "model_module": "@jupyter-widgets/controls",
21
+ "model_name": "HBoxModel",
22
+ "model_module_version": "1.5.0",
23
+ "state": {
24
+ "_dom_classes": [],
25
+ "_model_module": "@jupyter-widgets/controls",
26
+ "_model_module_version": "1.5.0",
27
+ "_model_name": "HBoxModel",
28
+ "_view_count": null,
29
+ "_view_module": "@jupyter-widgets/controls",
30
+ "_view_module_version": "1.5.0",
31
+ "_view_name": "HBoxView",
32
+ "box_style": "",
33
+ "children": [
34
+ "IPY_MODEL_bd79f708cfcd4405a57fd09d310bd35a",
35
+ "IPY_MODEL_a4ad4de496354084ae1e7b8647fde653",
36
+ "IPY_MODEL_c7516cc979174de08e77cb797de2b7be"
37
+ ],
38
+ "layout": "IPY_MODEL_14a29f32e4af4f8184b26cb13a8c4767"
39
+ }
40
+ },
41
+ "bd79f708cfcd4405a57fd09d310bd35a": {
42
+ "model_module": "@jupyter-widgets/controls",
43
+ "model_name": "HTMLModel",
44
+ "model_module_version": "1.5.0",
45
+ "state": {
46
+ "_dom_classes": [],
47
+ "_model_module": "@jupyter-widgets/controls",
48
+ "_model_module_version": "1.5.0",
49
+ "_model_name": "HTMLModel",
50
+ "_view_count": null,
51
+ "_view_module": "@jupyter-widgets/controls",
52
+ "_view_module_version": "1.5.0",
53
+ "_view_name": "HTMLView",
54
+ "description": "",
55
+ "description_tooltip": null,
56
+ "layout": "IPY_MODEL_6755d6aa3dc74d11bff72997ca80ed50",
57
+ "placeholder": "​",
58
+ "style": "IPY_MODEL_07b80f4c2fdd4f8b8048b524d0e204e7",
59
+ "value": "tokenizer_nanogpt.py: "
60
+ }
61
+ },
62
+ "a4ad4de496354084ae1e7b8647fde653": {
63
+ "model_module": "@jupyter-widgets/controls",
64
+ "model_name": "FloatProgressModel",
65
+ "model_module_version": "1.5.0",
66
+ "state": {
67
+ "_dom_classes": [],
68
+ "_model_module": "@jupyter-widgets/controls",
69
+ "_model_module_version": "1.5.0",
70
+ "_model_name": "FloatProgressModel",
71
+ "_view_count": null,
72
+ "_view_module": "@jupyter-widgets/controls",
73
+ "_view_module_version": "1.5.0",
74
+ "_view_name": "ProgressView",
75
+ "bar_style": "success",
76
+ "description": "",
77
+ "description_tooltip": null,
78
+ "layout": "IPY_MODEL_fdf4039acc38437687ecb0d91f4c067e",
79
+ "max": 1,
80
+ "min": 0,
81
+ "orientation": "horizontal",
82
+ "style": "IPY_MODEL_31e58c8aa3f942939e3f5807373d737a",
83
+ "value": 1
84
+ }
85
+ },
86
+ "c7516cc979174de08e77cb797de2b7be": {
87
+ "model_module": "@jupyter-widgets/controls",
88
+ "model_name": "HTMLModel",
89
+ "model_module_version": "1.5.0",
90
+ "state": {
91
+ "_dom_classes": [],
92
+ "_model_module": "@jupyter-widgets/controls",
93
+ "_model_module_version": "1.5.0",
94
+ "_model_name": "HTMLModel",
95
+ "_view_count": null,
96
+ "_view_module": "@jupyter-widgets/controls",
97
+ "_view_module_version": "1.5.0",
98
+ "_view_name": "HTMLView",
99
+ "description": "",
100
+ "description_tooltip": null,
101
+ "layout": "IPY_MODEL_f0254ddfd4514f3f9f75093ad9f1801e",
102
+ "placeholder": "​",
103
+ "style": "IPY_MODEL_cfbc1c6a33554bfe82986782e8b1c1c1",
104
+ "value": " 2.39k/? [00:00<00:00, 77.5kB/s]"
105
+ }
106
+ },
107
+ "14a29f32e4af4f8184b26cb13a8c4767": {
108
+ "model_module": "@jupyter-widgets/base",
109
+ "model_name": "LayoutModel",
110
+ "model_module_version": "1.2.0",
111
+ "state": {
112
+ "_model_module": "@jupyter-widgets/base",
113
+ "_model_module_version": "1.2.0",
114
+ "_model_name": "LayoutModel",
115
+ "_view_count": null,
116
+ "_view_module": "@jupyter-widgets/base",
117
+ "_view_module_version": "1.2.0",
118
+ "_view_name": "LayoutView",
119
+ "align_content": null,
120
+ "align_items": null,
121
+ "align_self": null,
122
+ "border": null,
123
+ "bottom": null,
124
+ "display": null,
125
+ "flex": null,
126
+ "flex_flow": null,
127
+ "grid_area": null,
128
+ "grid_auto_columns": null,
129
+ "grid_auto_flow": null,
130
+ "grid_auto_rows": null,
131
+ "grid_column": null,
132
+ "grid_gap": null,
133
+ "grid_row": null,
134
+ "grid_template_areas": null,
135
+ "grid_template_columns": null,
136
+ "grid_template_rows": null,
137
+ "height": null,
138
+ "justify_content": null,
139
+ "justify_items": null,
140
+ "left": null,
141
+ "margin": null,
142
+ "max_height": null,
143
+ "max_width": null,
144
+ "min_height": null,
145
+ "min_width": null,
146
+ "object_fit": null,
147
+ "object_position": null,
148
+ "order": null,
149
+ "overflow": null,
150
+ "overflow_x": null,
151
+ "overflow_y": null,
152
+ "padding": null,
153
+ "right": null,
154
+ "top": null,
155
+ "visibility": null,
156
+ "width": null
157
+ }
158
+ },
159
+ "6755d6aa3dc74d11bff72997ca80ed50": {
160
+ "model_module": "@jupyter-widgets/base",
161
+ "model_name": "LayoutModel",
162
+ "model_module_version": "1.2.0",
163
+ "state": {
164
+ "_model_module": "@jupyter-widgets/base",
165
+ "_model_module_version": "1.2.0",
166
+ "_model_name": "LayoutModel",
167
+ "_view_count": null,
168
+ "_view_module": "@jupyter-widgets/base",
169
+ "_view_module_version": "1.2.0",
170
+ "_view_name": "LayoutView",
171
+ "align_content": null,
172
+ "align_items": null,
173
+ "align_self": null,
174
+ "border": null,
175
+ "bottom": null,
176
+ "display": null,
177
+ "flex": null,
178
+ "flex_flow": null,
179
+ "grid_area": null,
180
+ "grid_auto_columns": null,
181
+ "grid_auto_flow": null,
182
+ "grid_auto_rows": null,
183
+ "grid_column": null,
184
+ "grid_gap": null,
185
+ "grid_row": null,
186
+ "grid_template_areas": null,
187
+ "grid_template_columns": null,
188
+ "grid_template_rows": null,
189
+ "height": null,
190
+ "justify_content": null,
191
+ "justify_items": null,
192
+ "left": null,
193
+ "margin": null,
194
+ "max_height": null,
195
+ "max_width": null,
196
+ "min_height": null,
197
+ "min_width": null,
198
+ "object_fit": null,
199
+ "object_position": null,
200
+ "order": null,
201
+ "overflow": null,
202
+ "overflow_x": null,
203
+ "overflow_y": null,
204
+ "padding": null,
205
+ "right": null,
206
+ "top": null,
207
+ "visibility": null,
208
+ "width": null
209
+ }
210
+ },
211
+ "07b80f4c2fdd4f8b8048b524d0e204e7": {
212
+ "model_module": "@jupyter-widgets/controls",
213
+ "model_name": "DescriptionStyleModel",
214
+ "model_module_version": "1.5.0",
215
+ "state": {
216
+ "_model_module": "@jupyter-widgets/controls",
217
+ "_model_module_version": "1.5.0",
218
+ "_model_name": "DescriptionStyleModel",
219
+ "_view_count": null,
220
+ "_view_module": "@jupyter-widgets/base",
221
+ "_view_module_version": "1.2.0",
222
+ "_view_name": "StyleView",
223
+ "description_width": ""
224
+ }
225
+ },
226
+ "fdf4039acc38437687ecb0d91f4c067e": {
227
+ "model_module": "@jupyter-widgets/base",
228
+ "model_name": "LayoutModel",
229
+ "model_module_version": "1.2.0",
230
+ "state": {
231
+ "_model_module": "@jupyter-widgets/base",
232
+ "_model_module_version": "1.2.0",
233
+ "_model_name": "LayoutModel",
234
+ "_view_count": null,
235
+ "_view_module": "@jupyter-widgets/base",
236
+ "_view_module_version": "1.2.0",
237
+ "_view_name": "LayoutView",
238
+ "align_content": null,
239
+ "align_items": null,
240
+ "align_self": null,
241
+ "border": null,
242
+ "bottom": null,
243
+ "display": null,
244
+ "flex": null,
245
+ "flex_flow": null,
246
+ "grid_area": null,
247
+ "grid_auto_columns": null,
248
+ "grid_auto_flow": null,
249
+ "grid_auto_rows": null,
250
+ "grid_column": null,
251
+ "grid_gap": null,
252
+ "grid_row": null,
253
+ "grid_template_areas": null,
254
+ "grid_template_columns": null,
255
+ "grid_template_rows": null,
256
+ "height": null,
257
+ "justify_content": null,
258
+ "justify_items": null,
259
+ "left": null,
260
+ "margin": null,
261
+ "max_height": null,
262
+ "max_width": null,
263
+ "min_height": null,
264
+ "min_width": null,
265
+ "object_fit": null,
266
+ "object_position": null,
267
+ "order": null,
268
+ "overflow": null,
269
+ "overflow_x": null,
270
+ "overflow_y": null,
271
+ "padding": null,
272
+ "right": null,
273
+ "top": null,
274
+ "visibility": null,
275
+ "width": "20px"
276
+ }
277
+ },
278
+ "31e58c8aa3f942939e3f5807373d737a": {
279
+ "model_module": "@jupyter-widgets/controls",
280
+ "model_name": "ProgressStyleModel",
281
+ "model_module_version": "1.5.0",
282
+ "state": {
283
+ "_model_module": "@jupyter-widgets/controls",
284
+ "_model_module_version": "1.5.0",
285
+ "_model_name": "ProgressStyleModel",
286
+ "_view_count": null,
287
+ "_view_module": "@jupyter-widgets/base",
288
+ "_view_module_version": "1.2.0",
289
+ "_view_name": "StyleView",
290
+ "bar_color": null,
291
+ "description_width": ""
292
+ }
293
+ },
294
+ "f0254ddfd4514f3f9f75093ad9f1801e": {
295
+ "model_module": "@jupyter-widgets/base",
296
+ "model_name": "LayoutModel",
297
+ "model_module_version": "1.2.0",
298
+ "state": {
299
+ "_model_module": "@jupyter-widgets/base",
300
+ "_model_module_version": "1.2.0",
301
+ "_model_name": "LayoutModel",
302
+ "_view_count": null,
303
+ "_view_module": "@jupyter-widgets/base",
304
+ "_view_module_version": "1.2.0",
305
+ "_view_name": "LayoutView",
306
+ "align_content": null,
307
+ "align_items": null,
308
+ "align_self": null,
309
+ "border": null,
310
+ "bottom": null,
311
+ "display": null,
312
+ "flex": null,
313
+ "flex_flow": null,
314
+ "grid_area": null,
315
+ "grid_auto_columns": null,
316
+ "grid_auto_flow": null,
317
+ "grid_auto_rows": null,
318
+ "grid_column": null,
319
+ "grid_gap": null,
320
+ "grid_row": null,
321
+ "grid_template_areas": null,
322
+ "grid_template_columns": null,
323
+ "grid_template_rows": null,
324
+ "height": null,
325
+ "justify_content": null,
326
+ "justify_items": null,
327
+ "left": null,
328
+ "margin": null,
329
+ "max_height": null,
330
+ "max_width": null,
331
+ "min_height": null,
332
+ "min_width": null,
333
+ "object_fit": null,
334
+ "object_position": null,
335
+ "order": null,
336
+ "overflow": null,
337
+ "overflow_x": null,
338
+ "overflow_y": null,
339
+ "padding": null,
340
+ "right": null,
341
+ "top": null,
342
+ "visibility": null,
343
+ "width": null
344
+ }
345
+ },
346
+ "cfbc1c6a33554bfe82986782e8b1c1c1": {
347
+ "model_module": "@jupyter-widgets/controls",
348
+ "model_name": "DescriptionStyleModel",
349
+ "model_module_version": "1.5.0",
350
+ "state": {
351
+ "_model_module": "@jupyter-widgets/controls",
352
+ "_model_module_version": "1.5.0",
353
+ "_model_name": "DescriptionStyleModel",
354
+ "_view_count": null,
355
+ "_view_module": "@jupyter-widgets/base",
356
+ "_view_module_version": "1.2.0",
357
+ "_view_name": "StyleView",
358
+ "description_width": ""
359
+ }
360
+ },
361
+ "f5a13e3fe6624fe7928e7656868de5a5": {
362
+ "model_module": "@jupyter-widgets/controls",
363
+ "model_name": "HBoxModel",
364
+ "model_module_version": "1.5.0",
365
+ "state": {
366
+ "_dom_classes": [],
367
+ "_model_module": "@jupyter-widgets/controls",
368
+ "_model_module_version": "1.5.0",
369
+ "_model_name": "HBoxModel",
370
+ "_view_count": null,
371
+ "_view_module": "@jupyter-widgets/controls",
372
+ "_view_module_version": "1.5.0",
373
+ "_view_name": "HBoxView",
374
+ "box_style": "",
375
+ "children": [
376
+ "IPY_MODEL_aa7aa11be8a14489893928fa5542098f",
377
+ "IPY_MODEL_7731ce779e2343ed8dfb75d91fff85be",
378
+ "IPY_MODEL_2932c2f5bf634d8385a45302f475aa97"
379
+ ],
380
+ "layout": "IPY_MODEL_fc5d32bdd43e41c6860c44cfddbd2d61"
381
+ }
382
+ },
383
+ "aa7aa11be8a14489893928fa5542098f": {
384
+ "model_module": "@jupyter-widgets/controls",
385
+ "model_name": "HTMLModel",
386
+ "model_module_version": "1.5.0",
387
+ "state": {
388
+ "_dom_classes": [],
389
+ "_model_module": "@jupyter-widgets/controls",
390
+ "_model_module_version": "1.5.0",
391
+ "_model_name": "HTMLModel",
392
+ "_view_count": null,
393
+ "_view_module": "@jupyter-widgets/controls",
394
+ "_view_module_version": "1.5.0",
395
+ "_view_name": "HTMLView",
396
+ "description": "",
397
+ "description_tooltip": null,
398
+ "layout": "IPY_MODEL_66d77906d42449e090c3f0e5912793d2",
399
+ "placeholder": "​",
400
+ "style": "IPY_MODEL_c5c2d00b9b3f447e957b934fccf7db42",
401
+ "value": "tokenizer.pkl: 100%"
402
+ }
403
+ },
404
+ "7731ce779e2343ed8dfb75d91fff85be": {
405
+ "model_module": "@jupyter-widgets/controls",
406
+ "model_name": "FloatProgressModel",
407
+ "model_module_version": "1.5.0",
408
+ "state": {
409
+ "_dom_classes": [],
410
+ "_model_module": "@jupyter-widgets/controls",
411
+ "_model_module_version": "1.5.0",
412
+ "_model_name": "FloatProgressModel",
413
+ "_view_count": null,
414
+ "_view_module": "@jupyter-widgets/controls",
415
+ "_view_module_version": "1.5.0",
416
+ "_view_name": "ProgressView",
417
+ "bar_style": "success",
418
+ "description": "",
419
+ "description_tooltip": null,
420
+ "layout": "IPY_MODEL_5afeebb4d064422786d575ee7d1b6226",
421
+ "max": 846518,
422
+ "min": 0,
423
+ "orientation": "horizontal",
424
+ "style": "IPY_MODEL_9df83a314c1a4bc3a9c39cd63ca6f182",
425
+ "value": 846518
426
+ }
427
+ },
428
+ "2932c2f5bf634d8385a45302f475aa97": {
429
+ "model_module": "@jupyter-widgets/controls",
430
+ "model_name": "HTMLModel",
431
+ "model_module_version": "1.5.0",
432
+ "state": {
433
+ "_dom_classes": [],
434
+ "_model_module": "@jupyter-widgets/controls",
435
+ "_model_module_version": "1.5.0",
436
+ "_model_name": "HTMLModel",
437
+ "_view_count": null,
438
+ "_view_module": "@jupyter-widgets/controls",
439
+ "_view_module_version": "1.5.0",
440
+ "_view_name": "HTMLView",
441
+ "description": "",
442
+ "description_tooltip": null,
443
+ "layout": "IPY_MODEL_27546e720b2d4da29a272c8bb90fb351",
444
+ "placeholder": "​",
445
+ "style": "IPY_MODEL_14a3e3a938c5417f9727716bbb7d1564",
446
+ "value": " 847k/847k [00:02<00:00, 285kB/s]"
447
+ }
448
+ },
449
+ "fc5d32bdd43e41c6860c44cfddbd2d61": {
450
+ "model_module": "@jupyter-widgets/base",
451
+ "model_name": "LayoutModel",
452
+ "model_module_version": "1.2.0",
453
+ "state": {
454
+ "_model_module": "@jupyter-widgets/base",
455
+ "_model_module_version": "1.2.0",
456
+ "_model_name": "LayoutModel",
457
+ "_view_count": null,
458
+ "_view_module": "@jupyter-widgets/base",
459
+ "_view_module_version": "1.2.0",
460
+ "_view_name": "LayoutView",
461
+ "align_content": null,
462
+ "align_items": null,
463
+ "align_self": null,
464
+ "border": null,
465
+ "bottom": null,
466
+ "display": null,
467
+ "flex": null,
468
+ "flex_flow": null,
469
+ "grid_area": null,
470
+ "grid_auto_columns": null,
471
+ "grid_auto_flow": null,
472
+ "grid_auto_rows": null,
473
+ "grid_column": null,
474
+ "grid_gap": null,
475
+ "grid_row": null,
476
+ "grid_template_areas": null,
477
+ "grid_template_columns": null,
478
+ "grid_template_rows": null,
479
+ "height": null,
480
+ "justify_content": null,
481
+ "justify_items": null,
482
+ "left": null,
483
+ "margin": null,
484
+ "max_height": null,
485
+ "max_width": null,
486
+ "min_height": null,
487
+ "min_width": null,
488
+ "object_fit": null,
489
+ "object_position": null,
490
+ "order": null,
491
+ "overflow": null,
492
+ "overflow_x": null,
493
+ "overflow_y": null,
494
+ "padding": null,
495
+ "right": null,
496
+ "top": null,
497
+ "visibility": null,
498
+ "width": null
499
+ }
500
+ },
501
+ "66d77906d42449e090c3f0e5912793d2": {
502
+ "model_module": "@jupyter-widgets/base",
503
+ "model_name": "LayoutModel",
504
+ "model_module_version": "1.2.0",
505
+ "state": {
506
+ "_model_module": "@jupyter-widgets/base",
507
+ "_model_module_version": "1.2.0",
508
+ "_model_name": "LayoutModel",
509
+ "_view_count": null,
510
+ "_view_module": "@jupyter-widgets/base",
511
+ "_view_module_version": "1.2.0",
512
+ "_view_name": "LayoutView",
513
+ "align_content": null,
514
+ "align_items": null,
515
+ "align_self": null,
516
+ "border": null,
517
+ "bottom": null,
518
+ "display": null,
519
+ "flex": null,
520
+ "flex_flow": null,
521
+ "grid_area": null,
522
+ "grid_auto_columns": null,
523
+ "grid_auto_flow": null,
524
+ "grid_auto_rows": null,
525
+ "grid_column": null,
526
+ "grid_gap": null,
527
+ "grid_row": null,
528
+ "grid_template_areas": null,
529
+ "grid_template_columns": null,
530
+ "grid_template_rows": null,
531
+ "height": null,
532
+ "justify_content": null,
533
+ "justify_items": null,
534
+ "left": null,
535
+ "margin": null,
536
+ "max_height": null,
537
+ "max_width": null,
538
+ "min_height": null,
539
+ "min_width": null,
540
+ "object_fit": null,
541
+ "object_position": null,
542
+ "order": null,
543
+ "overflow": null,
544
+ "overflow_x": null,
545
+ "overflow_y": null,
546
+ "padding": null,
547
+ "right": null,
548
+ "top": null,
549
+ "visibility": null,
550
+ "width": null
551
+ }
552
+ },
553
+ "c5c2d00b9b3f447e957b934fccf7db42": {
554
+ "model_module": "@jupyter-widgets/controls",
555
+ "model_name": "DescriptionStyleModel",
556
+ "model_module_version": "1.5.0",
557
+ "state": {
558
+ "_model_module": "@jupyter-widgets/controls",
559
+ "_model_module_version": "1.5.0",
560
+ "_model_name": "DescriptionStyleModel",
561
+ "_view_count": null,
562
+ "_view_module": "@jupyter-widgets/base",
563
+ "_view_module_version": "1.2.0",
564
+ "_view_name": "StyleView",
565
+ "description_width": ""
566
+ }
567
+ },
568
+ "5afeebb4d064422786d575ee7d1b6226": {
569
+ "model_module": "@jupyter-widgets/base",
570
+ "model_name": "LayoutModel",
571
+ "model_module_version": "1.2.0",
572
+ "state": {
573
+ "_model_module": "@jupyter-widgets/base",
574
+ "_model_module_version": "1.2.0",
575
+ "_model_name": "LayoutModel",
576
+ "_view_count": null,
577
+ "_view_module": "@jupyter-widgets/base",
578
+ "_view_module_version": "1.2.0",
579
+ "_view_name": "LayoutView",
580
+ "align_content": null,
581
+ "align_items": null,
582
+ "align_self": null,
583
+ "border": null,
584
+ "bottom": null,
585
+ "display": null,
586
+ "flex": null,
587
+ "flex_flow": null,
588
+ "grid_area": null,
589
+ "grid_auto_columns": null,
590
+ "grid_auto_flow": null,
591
+ "grid_auto_rows": null,
592
+ "grid_column": null,
593
+ "grid_gap": null,
594
+ "grid_row": null,
595
+ "grid_template_areas": null,
596
+ "grid_template_columns": null,
597
+ "grid_template_rows": null,
598
+ "height": null,
599
+ "justify_content": null,
600
+ "justify_items": null,
601
+ "left": null,
602
+ "margin": null,
603
+ "max_height": null,
604
+ "max_width": null,
605
+ "min_height": null,
606
+ "min_width": null,
607
+ "object_fit": null,
608
+ "object_position": null,
609
+ "order": null,
610
+ "overflow": null,
611
+ "overflow_x": null,
612
+ "overflow_y": null,
613
+ "padding": null,
614
+ "right": null,
615
+ "top": null,
616
+ "visibility": null,
617
+ "width": null
618
+ }
619
+ },
620
+ "9df83a314c1a4bc3a9c39cd63ca6f182": {
621
+ "model_module": "@jupyter-widgets/controls",
622
+ "model_name": "ProgressStyleModel",
623
+ "model_module_version": "1.5.0",
624
+ "state": {
625
+ "_model_module": "@jupyter-widgets/controls",
626
+ "_model_module_version": "1.5.0",
627
+ "_model_name": "ProgressStyleModel",
628
+ "_view_count": null,
629
+ "_view_module": "@jupyter-widgets/base",
630
+ "_view_module_version": "1.2.0",
631
+ "_view_name": "StyleView",
632
+ "bar_color": null,
633
+ "description_width": ""
634
+ }
635
+ },
636
+ "27546e720b2d4da29a272c8bb90fb351": {
637
+ "model_module": "@jupyter-widgets/base",
638
+ "model_name": "LayoutModel",
639
+ "model_module_version": "1.2.0",
640
+ "state": {
641
+ "_model_module": "@jupyter-widgets/base",
642
+ "_model_module_version": "1.2.0",
643
+ "_model_name": "LayoutModel",
644
+ "_view_count": null,
645
+ "_view_module": "@jupyter-widgets/base",
646
+ "_view_module_version": "1.2.0",
647
+ "_view_name": "LayoutView",
648
+ "align_content": null,
649
+ "align_items": null,
650
+ "align_self": null,
651
+ "border": null,
652
+ "bottom": null,
653
+ "display": null,
654
+ "flex": null,
655
+ "flex_flow": null,
656
+ "grid_area": null,
657
+ "grid_auto_columns": null,
658
+ "grid_auto_flow": null,
659
+ "grid_auto_rows": null,
660
+ "grid_column": null,
661
+ "grid_gap": null,
662
+ "grid_row": null,
663
+ "grid_template_areas": null,
664
+ "grid_template_columns": null,
665
+ "grid_template_rows": null,
666
+ "height": null,
667
+ "justify_content": null,
668
+ "justify_items": null,
669
+ "left": null,
670
+ "margin": null,
671
+ "max_height": null,
672
+ "max_width": null,
673
+ "min_height": null,
674
+ "min_width": null,
675
+ "object_fit": null,
676
+ "object_position": null,
677
+ "order": null,
678
+ "overflow": null,
679
+ "overflow_x": null,
680
+ "overflow_y": null,
681
+ "padding": null,
682
+ "right": null,
683
+ "top": null,
684
+ "visibility": null,
685
+ "width": null
686
+ }
687
+ },
688
+ "14a3e3a938c5417f9727716bbb7d1564": {
689
+ "model_module": "@jupyter-widgets/controls",
690
+ "model_name": "DescriptionStyleModel",
691
+ "model_module_version": "1.5.0",
692
+ "state": {
693
+ "_model_module": "@jupyter-widgets/controls",
694
+ "_model_module_version": "1.5.0",
695
+ "_model_name": "DescriptionStyleModel",
696
+ "_view_count": null,
697
+ "_view_module": "@jupyter-widgets/base",
698
+ "_view_module_version": "1.2.0",
699
+ "_view_name": "StyleView",
700
+ "description_width": ""
701
+ }
702
+ }
703
+ }
704
+ }
705
+ },
706
+ "cells": [
707
+ {
708
+ "cell_type": "code",
709
+ "execution_count": 4,
710
+ "metadata": {
711
+ "colab": {
712
+ "base_uri": "https://localhost:8080/"
713
+ },
714
+ "id": "YCQ2SpNSFKkj",
715
+ "outputId": "cb463fd3-d412-4e05-b67e-f001c17b58b0"
716
+ },
717
+ "outputs": [
718
+ {
719
+ "output_type": "stream",
720
+ "name": "stderr",
721
+ "text": [
722
+ "A new version of the following files was downloaded from https://huggingface.co/nanochat-students/base-d20:\n",
723
+ "- configuration_nanogpt.py\n",
724
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
725
+ "A new version of the following files was downloaded from https://huggingface.co/nanochat-students/base-d20:\n",
726
+ "- modeling_nanogpt.py\n",
727
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
728
+ ]
729
+ },
730
+ {
731
+ "output_type": "execute_result",
732
+ "data": {
733
+ "text/plain": [
734
+ "NanoGPTModel(\n",
735
+ " (transformer): ModuleDict(\n",
736
+ " (wte): Embedding(65536, 1280)\n",
737
+ " (h): ModuleList(\n",
738
+ " (0-19): 20 x Block(\n",
739
+ " (attn): CausalSelfAttention(\n",
740
+ " (c_q): Linear(in_features=1280, out_features=1280, bias=False)\n",
741
+ " (c_k): Linear(in_features=1280, out_features=1280, bias=False)\n",
742
+ " (c_v): Linear(in_features=1280, out_features=1280, bias=False)\n",
743
+ " (c_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
744
+ " )\n",
745
+ " (mlp): MLP(\n",
746
+ " (c_fc): Linear(in_features=1280, out_features=5120, bias=False)\n",
747
+ " (c_proj): Linear(in_features=5120, out_features=1280, bias=False)\n",
748
+ " )\n",
749
+ " )\n",
750
+ " )\n",
751
+ " )\n",
752
+ " (lm_head): Linear(in_features=1280, out_features=65536, bias=False)\n",
753
+ ")"
754
+ ]
755
+ },
756
+ "metadata": {},
757
+ "execution_count": 4
758
+ }
759
+ ],
760
+ "source": [
761
+ "from transformers import AutoConfig, AutoModel, AutoTokenizer\n",
762
+ "import torch\n",
763
+ "\n",
764
+ "model_dir = \"nanochat-students/base-d20\"\n",
765
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
766
+ "model = AutoModel.from_pretrained(model_dir, trust_remote_code=True)\n",
767
+ "model = model.to(device)\n",
768
+ "model.eval()"
769
+ ]
770
+ },
771
+ {
772
+ "cell_type": "code",
773
+ "source": [
774
+ "tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)"
775
+ ],
776
+ "metadata": {
777
+ "colab": {
778
+ "base_uri": "https://localhost:8080/",
779
+ "height": 133,
780
+ "referenced_widgets": [
781
+ "a0a81cb728564e88aee25e748b11133a",
782
+ "bd79f708cfcd4405a57fd09d310bd35a",
783
+ "a4ad4de496354084ae1e7b8647fde653",
784
+ "c7516cc979174de08e77cb797de2b7be",
785
+ "14a29f32e4af4f8184b26cb13a8c4767",
786
+ "6755d6aa3dc74d11bff72997ca80ed50",
787
+ "07b80f4c2fdd4f8b8048b524d0e204e7",
788
+ "fdf4039acc38437687ecb0d91f4c067e",
789
+ "31e58c8aa3f942939e3f5807373d737a",
790
+ "f0254ddfd4514f3f9f75093ad9f1801e",
791
+ "cfbc1c6a33554bfe82986782e8b1c1c1",
792
+ "f5a13e3fe6624fe7928e7656868de5a5",
793
+ "aa7aa11be8a14489893928fa5542098f",
794
+ "7731ce779e2343ed8dfb75d91fff85be",
795
+ "2932c2f5bf634d8385a45302f475aa97",
796
+ "fc5d32bdd43e41c6860c44cfddbd2d61",
797
+ "66d77906d42449e090c3f0e5912793d2",
798
+ "c5c2d00b9b3f447e957b934fccf7db42",
799
+ "5afeebb4d064422786d575ee7d1b6226",
800
+ "9df83a314c1a4bc3a9c39cd63ca6f182",
801
+ "27546e720b2d4da29a272c8bb90fb351",
802
+ "14a3e3a938c5417f9727716bbb7d1564"
803
+ ]
804
+ },
805
+ "id": "i6AWzw0-Hl6j",
806
+ "outputId": "5069f13e-3162-4a5e-96af-1ed1712eef69"
807
+ },
808
+ "execution_count": 2,
809
+ "outputs": [
810
+ {
811
+ "output_type": "display_data",
812
+ "data": {
813
+ "text/plain": [
814
+ "tokenizer_nanogpt.py: 0.00B [00:00, ?B/s]"
815
+ ],
816
+ "application/vnd.jupyter.widget-view+json": {
817
+ "version_major": 2,
818
+ "version_minor": 0,
819
+ "model_id": "a0a81cb728564e88aee25e748b11133a"
820
+ }
821
+ },
822
+ "metadata": {}
823
+ },
824
+ {
825
+ "output_type": "stream",
826
+ "name": "stderr",
827
+ "text": [
828
+ "A new version of the following files was downloaded from https://huggingface.co/nanochat-students/base-d20:\n",
829
+ "- tokenizer_nanogpt.py\n",
830
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
831
+ ]
832
+ },
833
+ {
834
+ "output_type": "display_data",
835
+ "data": {
836
+ "text/plain": [
837
+ "tokenizer.pkl: 0%| | 0.00/847k [00:00<?, ?B/s]"
838
+ ],
839
+ "application/vnd.jupyter.widget-view+json": {
840
+ "version_major": 2,
841
+ "version_minor": 0,
842
+ "model_id": "f5a13e3fe6624fe7928e7656868de5a5"
843
+ }
844
+ },
845
+ "metadata": {}
846
+ }
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "source": [
852
+ "# Hardcoded prompt and greedy generation\n",
853
+ "prompt = \"The capital of Belgium is \"\n",
854
+ "input_ids = tokenizer.encode(prompt, prepend=tokenizer.get_bos_token_id())\n",
855
+ "ids = torch.tensor([input_ids], dtype=torch.long, device=device)\n",
856
+ "\n",
857
+ "max_new_tokens = 50\n",
858
+ "with torch.inference_mode():\n",
859
+ " for _ in range(max_new_tokens):\n",
860
+ " outputs = model(input_ids=ids)\n",
861
+ " logits = outputs[\"logits\"] if isinstance(outputs, dict) else outputs.logits\n",
862
+ " next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)\n",
863
+ " ids = torch.cat([ids, next_token], dim=1)\n",
864
+ "\n",
865
+ "decoded = tokenizer.decode(ids[0].tolist())\n",
866
+ "print(decoded)"
867
+ ],
868
+ "metadata": {
869
+ "colab": {
870
+ "base_uri": "https://localhost:8080/"
871
+ },
872
+ "id": "XfDjQu4gGRUk",
873
+ "outputId": "08f77681-8712-4553-d354-1783546844b9"
874
+ },
875
+ "execution_count": 3,
876
+ "outputs": [
877
+ {
878
+ "output_type": "stream",
879
+ "name": "stdout",
880
+ "text": [
881
+ "<|bos|>The capital of Belgium is 11,000 square kilometers and is located in the north of the country. The capital of Belgium is Brussels. The capital of Belgium is located in the north of the country. The capital of Belgium is located in the north of the country. The\n"
882
+ ]
883
+ }
884
+ ]
885
+ }
886
+ ]
887
+ }