from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset


def load_train_data():
    # The CSV is expected to provide a "text" column and an integer "labels"
    # column; both names are assumed by the preprocessing steps below.
    train_dataset = load_dataset('csv', data_files={"train": "datasets/Canstralian/ShellCommands.csv"})
    return train_dataset

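
# Not part of the original script: if evaluation is wanted later, a held-out
# split can be carved from the same CSV with train_test_split, a standard
# datasets.Dataset method; the 0.1 test size is an arbitrary choice.
#
#     splits = load_train_data()["train"].train_test_split(test_size=0.1)
#     train_split, eval_split = splits["train"], splits["test"]
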
def load_model_and_tokenizer(model_name):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Llama-family checkpoints such as WhiteRabbitNeo ship without a pad
    # token, which breaks padded batching; fall back to the EOS token and
    # keep the model config consistent with it.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
    return model, tokenizer

def preprocess_function(examples, tokenizer):
    # Tokenize the "text" column; pad/truncate so each tokenized chunk is
    # rectangular.
    return tokenizer(examples['text'], padding=True, truncation=True)

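
# A common alternative (not used here): tokenize without padding and let a
# collator pad each training batch dynamically. padding=True inside a batched
# map() only pads within each map() chunk, so shuffled batches can mix
# lengths; DataCollatorWithPadding sidesteps that.
#
#     from transformers import DataCollatorWithPadding
#     collator = DataCollatorWithPadding(tokenizer=tokenizer)
#     # ...then pass data_collator=collator when constructing the Trainer.
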
def fine_tune(model_name="WhiteRabbitNeo/WhiteRabbitNeo-13B-v1"):
    train_data = load_train_data()
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Tokenize every split, then expose only the tensor columns the Trainer
    # consumes; this assumes the label column is literally named "labels".
    train_data = train_data.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # No eval_dataset is passed to the Trainer below, so evaluation stays at
    # its default (disabled); setting evaluation_strategy="epoch" without an
    # eval set would make the Trainer raise at construction time.
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data['train'],
    )

    trainer.train()
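
    # Optional addition, not in the original script: persist the fine-tuned
    # weights and tokenizer so they can be reloaded with from_pretrained();
    # reusing output_dir as the destination is an arbitrary choice.
    trainer.save_model('./results')
    tokenizer.save_pretrained('./results')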

if __name__ == "__main__":
    fine_tune()