Datasets used for training
Hello,
I am conducting comprehensive research on models and their training datasets. I noticed that several models listed under your account don't mention the datasets used for fine-tuning, such as:
- https://huggingface.co/NDugar/v3large-2epoch
- https://huggingface.co/NDugar/v3-Large-mnli
- https://huggingface.co/NDugar/debertav3-mnli-snli-anli
And a few others...
Would it be possible for you to specify which datasets were used for fine-tuning these models? I can provide the full list if that would be helpful.
Thank you for your work and for making these models publicly available!
Hey, sorry, I can answer the dataset question for the last two:
https://huggingface.co/NDugar/v3-Large-mnli was fine-tuned on MNLI only.
https://huggingface.co/NDugar/debertav3-mnli-snli-anli was fine-tuned on MNLI, SNLI, and ANLI.
I have some of the code if you want to go over it, but it's been 4 years, so I don't remember many of the specifics.
I will try to upload some more files for this and the other repos here. Hope it is helpful.
This is not the exact codebase used for this training, but rather the final version; I hope you find it useful. Have a good day.
Code:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import os
# hf notebook on training classifiers https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoModelWithLMHead, AutoTokenizer, AutoConfig, DataCollatorWithPadding, DataCollator
from transformers import DataCollatorForSeq2Seq
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
model_name = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, model_max_length=128) # model_max_length=512
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, label2id=label2id, id2label=id2label).to(device) # num_labels=3
#### NLI 3-class
from datasets import concatenate_datasets
## MNLI
dataset_train_mnli = load_dataset('multi_nli', split="train") # split='train'
#dataset_train_mnli = dataset_train_mnli.remove_columns_(['promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
dataset_test_mnli_m = load_dataset('multi_nli', split="validation_matched") # split='train'
#dataset_test_mnli_m = dataset_test_mnli_m.remove_columns_(['promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
dataset_test_mnli_mm = load_dataset('multi_nli', split="validation_mismatched") # split='train'
#dataset_test_mnli_mm = dataset_test_mnli_mm.remove_columns_(['promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
print(dataset_train_mnli['premise'][:4])
print(dataset_train_mnli['hypothesis'][:4])
print(dataset_train_mnli['label'][:4])
#### tokenization
# Dynamic padding HF course: https://huggingface.co/course/chapter3/2?fw=pt
dynamic_padding = True
## tokenization with padding to the same max length; better for TPU, which prefers all batches to have the same length
if dynamic_padding == False:
    def tokenize_func(examples):
        return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, max_length=256)  # max_length=512, padding=True
## dynamic padding
elif dynamic_padding == True:
    def tokenize_func(examples):
        return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)  # max_length=512, padding=True
### 3 class tokenization
# training on:
encoded_dataset_train = dataset_train_mnli.map(tokenize_func, batched=True)
#print(len(encoded_dataset_train))
# testing on:
encoded_dataset_test = dataset_test_mnli_m.map(tokenize_func, batched=True)
#print(len(encoded_dataset_test))
if dynamic_padding == True:
    data_collator = DataCollatorWithPadding(tokenizer)
# specific dataset testing
#encoded_dataset_test_anli = dataset_test_anli.map(tokenize_func, batched=True)
#encoded_dataset_test_anli_r3 = dataset_test_anli_r3.map(tokenize_func, batched=True)
#encoded_dataset_dev_fever = dataset_dev_fever.map(tokenize_func, batched=True)
#encoded_dataset_test_mnli_m = dataset_test_mnli_m.map(tokenize_func, batched=True)
#encoded_dataset_test_mnli_mm = dataset_test_mnli_mm.map(tokenize_func, batched=True)
## inspect length of training data - can speed up significantly via max_length
n_tokens = [len(encoding) for encoding in encoded_dataset_train["input_ids"]]
print(max(n_tokens))
# e.g. mnli has 69 texts > 256 tokens, 1169 texts > 128 tokens => sequence length of 256 tokens is fine for tests on mnli
df_n_tokens = pd.DataFrame(data={"n_tokens": n_tokens})
df_n_tokens[df_n_tokens.n_tokens > 256] # for mnli-fever-anli: >300 removed around 307 texts, >256 removes 881 texts compared to 512, > 200 removes 5209
df_n_tokens.n_tokens.plot.hist(bins=20);
from datasets import list_metrics
print(list_metrics())
metric = load_metric('accuracy') # 'glue', "mnli"
from transformers import TrainingArguments, Trainer
training_directory = "nli-few-shot/mnli-v2xl/"
# https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
train_args = TrainingArguments(
    output_dir=f'./results/{training_directory}',
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=3e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    #warmup_steps=0,  # 1000,
    warmup_ratio=0.06,  # 0.1, 0.06
    weight_decay=0.1,  # 0.1,
    fp16=True,
    fp16_full_eval=True,
    seed=42,
    prediction_loss_only=True,
)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(references=labels, predictions=predictions)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=data_collator,
    train_dataset=encoded_dataset_train,  # .shard(index=1, num_shards=100), # https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    eval_dataset=encoded_dataset_test,  # encoded_dataset["validation_matched"],
    compute_metrics=compute_metrics
)
## cleaning memory in case of memory overload
import torch
import gc
#del(model)
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
gc.collect()
#torch.cuda.memory_summary(device=None, abbreviated=True)
trainer.train()
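For the debertav3-mnli-snli-anli model, the three training sets would have been merged before tokenization. Since I don't have that exact script any more, here is only a minimal sketch of how the datasets could be combined with concatenate_datasets (already imported above); the SNLI label filtering, the ANLI round split names, and the column selection are assumptions based on the current Hub versions of those datasets, not the original code:
from datasets import load_dataset, concatenate_datasets
# MNLI: keep only the three columns shared by all three datasets
mnli = load_dataset("multi_nli", split="train")
mnli = mnli.remove_columns([c for c in mnli.column_names if c not in ["premise", "hypothesis", "label"]])
# SNLI: drop examples without a gold label (label == -1)
snli = load_dataset("snli", split="train")
snli = snli.filter(lambda example: example["label"] != -1)
# ANLI: training data comes in three rounds (r1-r3), concatenated here
anli = load_dataset("anli")
anli_train = concatenate_datasets([anli["train_r1"], anli["train_r2"], anli["train_r3"]])
anli_train = anli_train.remove_columns([c for c in anli_train.column_names if c not in ["premise", "hypothesis", "label"]])
# combine everything into one training set with the usual 3-class NLI labels
dataset_train_nli = concatenate_datasets([mnli, snli, anli_train])
print(dataset_train_nli)
The combined dataset_train_nli would then be tokenized with the same tokenize_func / .map(...) call as dataset_train_mnli above.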