import os
import json
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm

# Kazakh Cyrillic-to-Latin character mapping, per the 2021 alphabet reform onwards
cyrillic_to_latin = {
    "А": "A", "а": "a",
    "Ә": "Ä", "ә": "ä",
    "Б": "B", "б": "b",
    "Д": "D", "д": "d",
    "Е": "E", "е": "e",
    "Ф": "F", "ф": "f",
    "Г": "G", "г": "g",
    "Ғ": "Ğ", "ғ": "ğ",
    "Х": "H", "х": "h",  # also Һ, see below
    "Һ": "H", "һ": "h",
    "И": "I", "и": "i",  # used for [и], [й]
    "І": "I", "і": "ı",  # distinct from И in sound; uppercase collides with И as 'I', lowercase is dotless 'ı'
    "Ж": "J", "ж": "j",
    "К": "K", "к": "k",
    "Қ": "Q", "қ": "q",
    "Л": "L", "л": "l",
    "М": "M", "м": "m",
    "Н": "N", "н": "n",
    "Ң": "Ñ", "ң": "ñ",
    "О": "O", "о": "o",
    "Ө": "Ö", "ө": "ö",
    "П": "P", "п": "p",
    "Р": "R", "р": "r",
    "С": "S", "с": "s",
    "Ш": "Ş", "ш": "ş",
    "Т": "T", "т": "t",
    "У": "U", "у": "u",  # basic 'u' sound, distinct from Ұ
    "Ұ": "Ū", "ұ": "ū",  # back rounded, used frequently
    "Ү": "Ü", "ү": "ü",  # front rounded
    "В": "V", "в": "v",
    "Ы": "Y", "ы": "y",
    "Й": "I", "й": "i",  # same treatment as И
    "Ц": "Ts", "ц": "ts",  # for Russian borrowings
    "Ч": "Ch", "ч": "ch",
    "Щ": "Ş", "щ": "ş",  # typically simplified to 'ş'
    "Э": "E", "э": "e",
    "Ю": "Iu", "ю": "iu",  # borrowed words only
    "Я": "Ia", "я": "ia",
    "Ъ": "", "ъ": "",
    "Ь": "", "ь": "",
    "З": "Z", "з": "z",
    # Additional (not in the table but used in borrowings)
    "Ё": "Io", "ё": "io",
}


def convert_to_latin(text: str) -> str:
    """Apply the Cyrillic -> Latin mapping character by character.

    Characters without a mapping (digits, punctuation, Latin text) pass
    through unchanged.
    """
    return "".join(cyrillic_to_latin.get(char, char) for char in text)


# Output file; the three corpora below are written into it in sequence
output_path = "src/data/kazakh_latin_corpus.jsonl"

# First step: process the Wikipedia dump (all files in the "extracted" dir)
print("Processing the Wikipedia dump of Kazakh articles...")
with open(output_path, "w", encoding="utf-8") as out_file:
    # Iterate over all folders of the extracted dump
    for root, _, files in os.walk("src/data/extracted"):
        for fname in tqdm(files, desc="Files in Wikipedia dump"):
            with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        data = json.loads(line)
                        cyr_text = data["text"].strip()
                        lat_text = convert_to_latin(cyr_text).strip()
                        if cyr_text and lat_text:
                            obj = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                            out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
                    except Exception as e:
                        tqdm.write(f"Skipping due to: {e}")
                        continue
print("Done")

# Second step: process the "CC100-Kazakh" dataset
print("Loading 'CC100-Kazakh' dataset...")
with open(output_path, "a", encoding="utf-8") as out_file:
    with open("src/data/kk.txt", "r", encoding="utf-8") as f:
        for line in tqdm(islice(f, 2_200_000), total=2_200_000, desc="Lines in CC100-Kazakh"):
            try:
                cyr_text = line.strip()
                lat_text = convert_to_latin(cyr_text).strip()
                if cyr_text and lat_text:
                    obj = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
            except Exception as e:
                tqdm.write(f"Skipping due to: {e}")
                continue

# Third step: process the raw, Kazakh-centred part of the "KazParC" dataset
print("Loading 'KazParC' dataset...")
kazparc = load_dataset("issai/kazparc", "kazparc_raw", split="train")
with open(output_path, "a", encoding="utf-8") as out_file:
    for entry in tqdm(kazparc, desc="Entries in KazParC"):
        try:
            if "kk" in entry and isinstance(entry["kk"], str):
                cyr_text = entry["kk"].strip()
                lat_text = convert_to_latin(cyr_text).strip()
                if cyr_text and lat_text:
                    obj = {"transliteration": {"src": cyr_text, "tgt": lat_text}}
                    out_file.write(json.dumps(obj, ensure_ascii=False) + "\n")
        except Exception as e:
            tqdm.write(f"Skipping due to: {e}")
            continue
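
# Optional: a sanity check plus a faster alternative, sketched under the
# assumption that behavior should match convert_to_latin exactly. The
# character-by-character join above is easy to follow, but over millions of
# corpus lines a precompiled str.translate table is typically faster
# (str.maketrans accepts single-character keys mapped to multi-character
# strings such as "Ts"). The expected output below is derived from the
# mapping table at the top of this script.
_latin_table = str.maketrans(cyrillic_to_latin)


def convert_to_latin_fast(text: str) -> str:
    """Equivalent to convert_to_latin, using a precompiled translation table."""
    return text.translate(_latin_table)


# Both converters should agree on a known phrase from the mapping above
for fn in (convert_to_latin, convert_to_latin_fast):
    assert fn("Қазақстан Республикасы") == "Qazaqstan Respublikasy"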