Spaces:
Runtime error
Runtime error
import json
import sys

from transformers import AutoTokenizer
# Records in the dataset file are separated by a line containing only "---".
RECORD_SEPARATOR = "---\n"
# Each turn is written as "<speaker>: <utterance>".
SPEAKER_DELIMITER = ": "


def parse_dialogue_pairs(text: str) -> list:
    """Extract (user_line, bot_line) pairs from the raw dataset text.

    The text is split on ``RECORD_SEPARATOR``. Within each non-blank record,
    the first line is taken as the user turn and the second as the bot turn;
    any further lines in the record are ignored. Both turns must contain
    ``SPEAKER_DELIMITER`` (i.e. look like ``"<speaker>: <utterance>"``).

    Records with fewer than two lines, or whose first two lines lack the
    delimiter, are reported on stderr and skipped instead of raising
    ``IndexError`` and aborting the whole run.

    Args:
        text: Full contents of the dataset file.

    Returns:
        A list of ``(input_text, output_text)`` tuples in file order. Each
        element is the complete turn line, speaker prefix included.
    """
    pairs = []
    for position, chunk in enumerate(text.split(RECORD_SEPARATOR), start=1):
        record = chunk.strip()
        if not record:
            continue

        lines = record.split("\n")
        turns = lines[:2]
        is_well_formed = len(turns) == 2 and all(
            SPEAKER_DELIMITER in turn for turn in turns
        )
        if not is_well_formed:
            print(
                f"Skipping malformed record #{position}: {record!r}",
                file=sys.stderr,
            )
            continue

        # A line containing the delimiter is already "<speaker>: <utterance>",
        # so it is used verbatim rather than split and re-joined.
        pairs.append((turns[0], turns[1]))
    return pairs


def main() -> None:
    """Tokenize the dialogue dataset with the GPT-2 tokenizer and save it.

    Reads ``flirty_dataset.txt``, tokenizes each user/bot pair, and writes a
    JSON list of ``{"input_ids": [...], "labels": [...]}`` objects to
    ``flirty_dataset_tokenized.json``.
    """
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Convert text dataset to tokenized dataset.
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open("flirty_dataset.txt", "r", encoding="utf-8") as f:
        pairs = parse_dialogue_pairs(f.read())

    # NOTE(review): input and output are tokenized independently, so
    # "input_ids" and "labels" can have different lengths -- confirm the
    # training code expects that.
    data = [
        {
            "input_ids": tokenizer(input_text, truncation=True)["input_ids"],
            "labels": tokenizer(output_text, truncation=True)["input_ids"],
        }
        for input_text, output_text in pairs
    ]

    # Save tokenized data
    with open("flirty_dataset_tokenized.json", "w", encoding="utf-8") as f:
        json.dump(data, f)
    print("Preprocessing complete! Tokenized dataset saved as flirty_dataset_tokenized.json")


if __name__ == "__main__":
    main()