krotima1 committed on
Commit bbf50b1 · 2 Parent(s): 8cf2e56 52ecdfe

Merge branch 'main' of https://huggingface.co/krotima1/AlignScoreCS

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1 +1,75 @@
- hello
+ ---
+ language:
+ - en
+ - cs
+ license: cc-by-4.0
+ metrics:
+ - bleurt
+ - bleu
+ - bertscore
+ pipeline_tag: text-classification
+ ---
+ # AlignScoreCS
+ AlignScoreCS is a multi-task, multilingual model for assessing factual consistency across various NLU tasks in Czech and English. We follow the original AlignScore paper (https://arxiv.org/abs/2305.16739).
+ We trained the model on top of the xlm-roberta-large checkpoint (https://huggingface.co/FacebookAI/xlm-roberta-large), with a shared encoder feeding three linear heads for regression,
+ binary classification, and ternary classification.
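+ 
+ A minimal sketch of this shared-encoder, three-head design (class and attribute names here are hypothetical; the released implementation lives in the attached AlignScore.py):
+ ```python
+ # Hypothetical sketch: one XLM-R encoder shared by three linear heads
+ # (regression, binary, ternary). Names and pooling are assumptions.
+ import torch
+ import torch.nn as nn
+ from transformers import AutoModel
+ 
+ class AlignScoreSketch(nn.Module):
+     def __init__(self, backbone: str = "FacebookAI/xlm-roberta-large"):
+         super().__init__()
+         self.encoder = AutoModel.from_pretrained(backbone)
+         hidden = self.encoder.config.hidden_size
+         self.reg_head = nn.Linear(hidden, 1)  # regression: continuous alignment score
+         self.bin_head = nn.Linear(hidden, 2)  # binary classification
+         self.tri_head = nn.Linear(hidden, 3)  # ternary classification
+ 
+     def forward(self, input_ids, attention_mask):
+         # Pool the <s> (CLS) token of the jointly encoded context-claim pair.
+         out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+         pooled = out.last_hidden_state[:, 0]
+         return (torch.sigmoid(self.reg_head(pooled)).squeeze(-1),
+                 self.bin_head(pooled),
+                 self.tri_head(pooled))
+ ```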
+ 
+ 
+ # Usage
+ ```python
+ # Assumes you copied the attached Files_and_versions/AlignScore.py file for use with transformers.
+ from AlignScoreCS import AlignScoreCS
+ 
+ alignScoreCS = AlignScoreCS.from_pretrained("krotima1/AlignScoreCS")
+ # Move the model to CUDA to accelerate scoring.
+ print(alignScoreCS.score(context="This is context", claim="This is claim"))
+ ```
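+ 
+ The model loads on CPU by default. Assuming the wrapper exposes a torch-style `.to()` method (an assumption; check the attached AlignScore.py), moving it to a GPU might look like:
+ ```python
+ # Hypothetical: assumes AlignScoreCS (or its underlying model) supports .to(device).
+ import torch
+ 
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ alignScoreCS.to(device)
+ print(alignScoreCS.score(context="This is context", claim="This is claim"))
+ ```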
+ 
+ # Results
+ 
+ 
+ # Training datasets
+ The following table lists the datasets used to train the model. We translated the English datasets into Czech with SeamlessM4T; a translation sketch follows the table below.
+ 
+ | NLP Task              | Dataset          | Training Task | Context (n words) | Claim (n words) | Sample Count         |
+ |-----------------------|------------------|---------------|-------------------|-----------------|----------------------|
+ | NLI                   | SNLI             | 3-way         | 10                | 13              | Cs: 500k / En: 550k  |
+ |                       | MultiNLI         | 3-way         | 16                | 20              | Cs: 393k / En: 393k  |
+ |                       | Adversarial NLI  | 3-way         | 48                | 54              | Cs: 163k / En: 163k  |
+ |                       | DocNLI           | 2-way         | 97                | 285             | Cs: 200k / En: 942k  |
+ | Fact Verification     | NLI-style FEVER  | 3-way         | 48                | 50              | Cs: 208k / En: 208k  |
+ |                       | Vitamin C        | 3-way         | 23                | 25              | Cs: 371k / En: 371k  |
+ | Paraphrase            | QQP              | 2-way         | 9                 | 11              | Cs: 162k / En: 364k  |
+ |                       | PAWS             | 2-way         | -                 | 18              | Cs: - / En: 707k     |
+ |                       | PAWS labeled     | 2-way         | 18                | -               | Cs: 49k / En: -      |
+ |                       | PAWS unlabeled   | 2-way         | 18                | -               | Cs: 487k / En: -     |
+ | STS                   | SICK             | reg           | -                 | 10              | Cs: - / En: 4k       |
+ |                       | STS Benchmark    | reg           | -                 | 10              | Cs: - / En: 6k       |
+ |                       | Free-N1          | reg           | 18                | -               | Cs: 20k / En: -      |
+ | QA                    | SQuAD v2         | 2-way         | 105               | 119             | Cs: 130k / En: 130k  |
+ |                       | RACE             | 2-way         | 266               | 273             | Cs: 200k / En: 351k  |
+ | Information Retrieval | MS MARCO         | 2-way         | 49                | 56              | Cs: 200k / En: 5M    |
+ | Summarization         | WikiHow          | 2-way         | 434               | 508             | Cs: 157k / En: 157k  |
+ |                       | SumAug           | 2-way         | -                 | -               | Cs: - / En: -        |
+ 
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "bos_token": "<s>",
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "unk_token": "<unk>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f59925fcb90c92b894cb93e51bb9b4a6105c5c249fe54ce1c704420ac39b81af
+ size 17082756
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "250001": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": "<mask>",
+ "model_max_length": 512,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "<unk>"
+ }