WillHeld committed (verified)
Commit aebda50 · Parent: 4650677

Upload ContactDoc fixed-vocab tokenizer

Files changed (2):
  1. tokenizer.json +0 -45
  2. tokenizer_config.json +0 -7
tokenizer.json CHANGED
@@ -21,24 +21,6 @@
       "normalized": false,
       "special": true
     },
-    {
-      "id": 2,
-      "content": "<begin_contacts>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 3,
-      "content": "<end_contacts>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
     {
       "id": 4,
       "content": "<end>",
@@ -48,33 +30,6 @@
       "normalized": false,
       "special": true
     },
-    {
-      "id": 5,
-      "content": "<newline>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 6,
-      "content": "<end_of_document>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 7,
-      "content": "<deterministic-positives-only>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
     {
       "id": 27,
       "content": "<UNK>",
 
tokenizer_config.json CHANGED
@@ -2,13 +2,6 @@
   "backend": "tokenizers",
   "bos_token": "<begin_sequence>",
   "eos_token": "<end>",
-  "extra_special_tokens": [
-    "<begin_contacts>",
-    "<end_contacts>",
-    "<newline>",
-    "<end_of_document>",
-    "<deterministic-positives-only>"
-  ],
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "tokenizer_class": "PreTrainedTokenizerFast",
 