import os
import json
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm

# Kazakh Cyrillic-to-Latin character mapping, per the 2021 alphabet reform onwards
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",  # also Һ, see below
    "Һ": "H", "һ": "h",
    "И": "I", "и": "i",  # used for [и], [й]
    "І": "I", "і": "ı",  # distinct from И in sound; uppercase collides with И as 'I', lowercase is dotless 'ı'
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u",  # basic 'u' sound, distinct from Ұ
    "Ұ": "Ū", "ұ": "ū",  # back rounded, used frequently
    "Ү": "Ü", "ү": "ü",  # front rounded
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "I", "й": "i",  # same treatment as И
    "Ц": "Ts", "ц": "ts",  # for Russian borrowings
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş",  # typically simplified to 'ş'
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu",  # borrowed words only
    "Я": "Ia", "я": "ia",
    "Ъ": "", "ъ": "",
    "Ь": "", "ь": "",
    "З": "Z", "з": "z",
    # Additional (not in the table but used in borrowings)
    "Ё": "Io", "ё": "io",
}


def convert_to_latin(text: str) -> str:
    """Apply the Cyrillic -> Latin mapping character by character.

    Characters without a mapping (digits, punctuation, Latin text) pass
    through unchanged.
    """
    return "".join(cyrillic_to_latin.get(char, char) for char in text)


# Output file; the three corpora below are written into it in sequence
output_path = "src/data/kazakh_latin_corpus.jsonl"

# First step: process the Wikipedia dump (all files in the "extracted" dir)
print("Processing the Wikipedia dump of Kazakh articles...")
with open(output_path, "w", encoding="utf-8") as out_file:
    # Iterate over all folders of the extracted dump
    for root, _, files in os.walk("src/data/extracted"):
        for fname in tqdm(files, desc="Files in Wikipedia dump"):
            with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        data = json.loads(line)
                        cyr_text = data["text"].strip()
                        lat_text = convert_to_latin(cyr_text).strip()
                        if cyr_text and lat_text:
                            obj = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                            out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
                    except Exception as e:
                        tqdm.write(f"Skipping due to: {e}")
                        continue
print("Done")

# Second step: process the "CC100-Kazakh" dataset
print("Loading 'CC100-Kazakh' dataset...")
with open(output_path, "a", encoding="utf-8") as out_file:
    with open("src/data/kk.txt", "r", encoding="utf-8") as f:
        for line in tqdm(islice(f, 2_200_000), total=2_200_000, desc="Lines in CC100-Kazakh"):
            try:
                cyr_text = line.strip()
                lat_text = convert_to_latin(cyr_text).strip()
                if cyr_text and lat_text:
                    obj = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
            except Exception as e:
                tqdm.write(f"Skipping due to: {e}")
                continue

# Third step: process the raw, Kazakh-centred part of the "KazParC" dataset
print("Loading 'KazParC' dataset...")
kazparc = load_dataset("issai/kazparc", "kazparc_raw", split="train")
with open(output_path, "a", encoding="utf-8") as out_file:
    for entry in tqdm(kazparc, desc="Entries in KazParC"):
        try:
            if "kk" in entry and isinstance(entry["kk"], str):
                cyr_text = entry["kk"].strip()
                lat_text = convert_to_latin(cyr_text).strip()
                if cyr_text and lat_text:
                    obj = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
        except Exception as e:
            tqdm.write(f"Skipping due to: {e}")
            continue
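
# Optional: a sanity check plus a faster alternative, sketched under the
# assumption that behavior should match convert_to_latin exactly. The
# character-by-character join above is easy to follow, but over millions of
# corpus lines a precompiled str.translate table is typically faster
# (str.maketrans accepts single-character keys mapped to multi-character
# strings such as "Ts"). The expected output below is derived from the
# mapping table at the top of this script.
_latin_table = str.maketrans(cyrillic_to_latin)


def convert_to_latin_fast(text: str) -> str:
    """Equivalent to convert_to_latin, using a precompiled translation table."""
    return text.translate(_latin_table)


# Both converters should agree on a known phrase from the mapping above
for fn in (convert_to_latin, convert_to_latin_fast):
    assert fn("Қазақстан Республикасы") == "Qazaqstan Respublikasy"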