Datasets used for training

#2
by Fadi12 - opened

Hello,
I am conducting comprehensive research on models and their training datasets. I noticed that several models listed under your account don't mention the datasets used for fine-tuning, such as:

And a few others...
Would it be possible for you to specify which datasets were used for fine-tuning these models? I can provide the full list if that would be helpful.
Thank you for your work and for making these models publicly available!

Hey, sorry, I can answer the dataset question for the last 2:
https://huggingface.co/NDugar/v3-Large-mnli was fine-tuned on MNLI only
https://huggingface.co/NDugar/debertav3-mnli-snli-anli was fine-tuned on MNLI, SNLI, and ANLI (see the sketch below for how these could be combined)
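
A rough sketch of how MNLI, SNLI, and ANLI could be combined with the datasets library is below. This is only an illustration of the general approach, using the standard multi_nli, snli, and anli loaders from the Hub; it is not the exact preprocessing used for debertav3-mnli-snli-anli:

from datasets import load_dataset, concatenate_datasets

# load the three NLI training sets (illustrative only)
mnli = load_dataset("multi_nli", split="train")
snli = load_dataset("snli", split="train")
anli = concatenate_datasets([load_dataset("anli", split=f"train_r{i}") for i in (1, 2, 3)])

# keep only the shared columns and drop unlabeled SNLI pairs (label == -1)
keep = ["premise", "hypothesis", "label"]
mnli = mnli.remove_columns([c for c in mnli.column_names if c not in keep])
anli = anli.remove_columns([c for c in anli.column_names if c not in keep])
snli = snli.filter(lambda x: x["label"] != -1)

nli_train = concatenate_datasets([mnli, snli, anli]).shuffle(seed=42)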

I have some of the code if you want to go over it, but it's been 4 years, so I don't remember many of the specifics.
I will try to upload some more files regarding this and the other repos here. Hope it is helpful.

This is not the exact codebase used for this training, but rather the final version; I hope you find it useful. Have a good day.
Code:

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import os

# hf notebook on training classifiers https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}

model_name = "microsoft/deberta-v3-large" 
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, model_max_length=128)  # model_max_length=512
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, label2id=label2id, id2label=id2label).to(device)  # num_labels=3


#### NLI 3-class
from datasets import concatenate_datasets

## MNLI
dataset_train_mnli = load_dataset('multi_nli', split="train")  # split='train'
#dataset_train_mnli = dataset_train_mnli.remove_columns_(['promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
dataset_test_mnli_m = load_dataset('multi_nli', split="validation_matched")
#dataset_test_mnli_m = dataset_test_mnli_m.remove_columns_(['promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
dataset_test_mnli_mm = load_dataset('multi_nli', split="validation_mismatched")
#dataset_test_mnli_mm = dataset_test_mnli_mm.remove_columns_(['promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])

print(dataset_train_mnli['premise'][:4])
print(dataset_train_mnli['hypothesis'][:4])
print(dataset_train_mnli['label'][:4])

#### tokenization
# Dynamic padding HF course: https://huggingface.co/course/chapter3/2?fw=pt
dynamic_padding = True

## tokenization & padding to the same max length; better for TPUs, which prefer all batches to have the same length
if not dynamic_padding:
    def tokenize_func(examples):
        return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding="max_length", max_length=256)  # max_length=512
## dynamic padding
else:
    def tokenize_func(examples):
        return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)  # padding is handled by DataCollatorWithPadding below

### 3 class tokenization
# training on: 
encoded_dataset_train = dataset_train_mnli.map(tokenize_func, batched=True)
#print(len(encoded_dataset_train))
# testing on: 
encoded_dataset_test = dataset_test_mnli_m.map(tokenize_func, batched=True)
#print(len(encoded_dataset_test))


if dynamic_padding:
    data_collator = DataCollatorWithPadding(tokenizer)
else:
    data_collator = None  # examples are already padded to max_length in tokenize_func above

# specific dataset testing
#encoded_dataset_test_anli = dataset_test_anli.map(tokenize_func, batched=True)
#encoded_dataset_test_anli_r3 = dataset_test_anli_r3.map(tokenize_func, batched=True)
#encoded_dataset_dev_fever = dataset_dev_fever.map(tokenize_func, batched=True)
#encoded_dataset_test_mnli_m = dataset_test_mnli_m.map(tokenize_func, batched=True)
#encoded_dataset_test_mnli_mm = dataset_test_mnli_mm.map(tokenize_func, batched=True)
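# Note (added for clarity): the eval sets referenced in the commented-out lines above
# (dataset_test_anli, dataset_test_anli_r3, dataset_dev_fever) are not defined anywhere
# in this snippet. As an illustration only, the ANLI ones could be loaded from the Hub
# like this; FEVER would additionally need an NLI-formatted variant with premise/hypothesis columns:
#dataset_test_anli = load_dataset('anli', split='test_r1')
#dataset_test_anli_r3 = load_dataset('anli', split='test_r3')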

## inspect length of training data - can speed up significantly via max_length
n_tokens = [len(encoding) for encoding in encoded_dataset_train["input_ids"]]
print(max(n_tokens))

# e.g. mnli has 69 texts > 256 tokens, 1169 texts > 128 tokens => sequence length of 256 tokens is fine for tests on mnli
df_n_tokens = pd.DataFrame(data={"n_tokens": n_tokens})
df_n_tokens[df_n_tokens.n_tokens > 256]  # for mnli-fever-anli: >300 removed around 307 texts, >256 removes 881 texts compared to 512, > 200 removes 5209
df_n_tokens.n_tokens.plot.hist(bins=20);


from datasets import list_metrics
print(list_metrics())
metric = load_metric('accuracy')  # 'glue', "mnli"


from transformers import TrainingArguments, Trainer

training_directory = "nli-few-shot/mnli-v2xl/"

# https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
train_args = TrainingArguments(
    output_dir=f'./results/{training_directory}',
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=3e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    #warmup_steps=0,  # 1000,
    warmup_ratio=0.06,  #0.1, 0.06
    weight_decay=0.1,  #0.1,
    fp16=True,
    fp16_full_eval=True,
    seed=42,
    prediction_loss_only=True,  # note: with this enabled, evaluation may return only the loss, so compute_metrics below may not be applied
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(references=labels,predictions=predictions)

trainer = Trainer( 
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=data_collator,
    train_dataset=encoded_dataset_train,  #.shard(index=1, num_shards=100),  # https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    eval_dataset=encoded_dataset_test,  # encoded_dataset["validation_matched"],
    compute_metrics=compute_metrics
)

## cleaning memory in case of memory overload
import torch
import gc
#del(model)
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
gc.collect()
#torch.cuda.memory_summary(device=None, abbreviated=True)

trainer.train()
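
# Not part of the original snippet: after training, evaluation and saving could look
# roughly like this (standard Trainer API calls; the output path is just a placeholder).
eval_results = trainer.evaluate(eval_dataset=encoded_dataset_test)  # MNLI validation_matched
print(eval_results)
trainer.save_model(f'./results/{training_directory}/final-model')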
