import gradio as gr
import os
from PIL import Image
import tempfile
from gradio_client import Client, handle_file
import torch
from transformers import VitsModel, AutoTokenizer, pipeline
import scipy.io.wavfile as wavfile
import traceback
import random
import time

# =========================
# Параметры
# =========================
# Identifier of the remote Space that make_talking_head passes to gradio_client.Client.
TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"

# Place the local models on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device: " + device)

# =========================
# Загрузка моделей
# =========================
# Device argument shared by both transformers pipelines below:
# 0 when running on CUDA, -1 otherwise.
_PIPELINE_DEVICE = 0 if device == "cuda" else -1
# Checkpoint used for both the TTS model and its tokenizer.
_TTS_CHECKPOINT = "facebook/mms-tts-kaz"

try:
    # Text-to-speech model (Kazakh) and the matching tokenizer.
    tts_model = VitsModel.from_pretrained(_TTS_CHECKPOINT).to(device)
    tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)

    # Translation pipeline; synthesize_audio calls it with rus_Cyrl -> kaz_Cyrl.
    translator = pipeline(
        "translation",
        model="facebook/nllb-200-distilled-600M",
        device=_PIPELINE_DEVICE,
    )

    # Text-to-text model that generate_quiz prompts for a question and answers.
    qa_model = pipeline(
        "text2text-generation",
        model="google/flan-t5-small",
        device=_PIPELINE_DEVICE,
    )

    print("✅ Все модели успешно загружены!")

except Exception as e:
    # Abort start-up with a single clear error, keeping the original
    # exception attached as the explicit cause for the traceback.
    raise RuntimeError(f"❌ Ошибка загрузки моделей: {e}") from e


# =========================
# Вспомогательные функции
# =========================
def _parse_quiz_output(out):
    """Split raw model output into ``(question, correct, wrong)`` strings."""
    import re

    match = re.search(
        r"Вопрос\s*[:\-]\s*(.*?)\s*Ответ1\s*[:\-]\s*(.*?)\s*Ответ2\s*[:\-]\s*(.*)",
        out,
        re.DOTALL | re.IGNORECASE,
    )
    if match:
        fields = list(match.groups())
    else:
        # Fallback 1: the first three non-empty lines.
        lines = [line.strip() for line in out.splitlines() if line.strip()]
        if len(lines) >= 3:
            fields = lines[:3]
        else:
            # Fallback 2: the first three non-empty "."-separated fragments.
            fields = [part.strip() for part in out.split(".") if part.strip()][:3]

    fields = [field.strip() for field in fields]
    fields += [""] * (3 - len(fields))
    # Never hand the UI an empty question or answer: substitute placeholders.
    defaults = ("Вопрос", "Вариант 1", "Вариант 2")
    return tuple(value or default for value, default in zip(fields, defaults))


def generate_quiz(text: str):
    """Generate one quiz question with two answer options (in Russian).

    ``qa_model`` is prompted to answer in the layout
    ``Вопрос: ... Ответ1: ... Ответ2: ...``. The output is parsed leniently
    rather than as strict JSON: by those markers first, then by lines, then
    by sentences; missing or empty pieces are replaced with placeholders.

    Args:
        text: Lecture text the question is generated from.

    Returns:
        A ``(question, options, correct)`` tuple. ``options`` is a list with
        the correct and the wrong answer in random order; ``correct`` is the
        correct answer string.

    Raises:
        ValueError: If the model call or the parsing fails. The message
            includes the raw model output (empty when the call itself failed).
    """
    prompt = (
        "Сгенерируй один учебный вопрос по этому тексту и два варианта ответа (правильный и неправильный). "
        "Верни в формате: Вопрос: ... Ответ1: ... Ответ2: ... Текст: " + text
    )
    # Bound before the try so the error message can reference it even when
    # the model call raises before producing any output.
    out = ""
    try:
        out = qa_model(prompt, max_length=256)[0]["generated_text"]
        question, correct, wrong = _parse_quiz_output(out)
        options = [correct, wrong]
        random.shuffle(options)
        return question, options, correct
    except Exception as e:
        raise ValueError(f"Ошибка генерации вопроса:\n{str(e)}\nМодель вернула: {out}") from e


def synthesize_audio(text_ru: str):
    """Translate Russian text to Kazakh, synthesize speech and save it as WAV.

    Args:
        text_ru: Russian text to be spoken.

    Returns:
        Path to a temporary ``.wav`` file. The file is not deleted
        automatically; the caller is responsible for removing it.

    Raises:
        Exception: Whatever the translation pipeline, the TTS model or the
            WAV writer raise. The temporary file is removed if writing fails.
    """
    translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
    text_kk = translation[0]["translation_text"]

    inputs = tts_tokenizer(text_kk, return_tensors="pt").to(device)
    with torch.no_grad():
        output = tts_model(**inputs)

    waveform = output.waveform.squeeze().cpu().numpy()
    # Clip before scaling: a sample outside [-1, 1] does not fit into int16
    # after multiplication and would be corrupted by the cast.
    audio = (waveform.clip(-1.0, 1.0) * 32767).astype('int16')
    sampling_rate = getattr(tts_model.config, 'sampling_rate', 22050)

    # Only reserve a unique file name here and close the handle right away;
    # writing by name while the handle is still open fails on some platforms.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmpf:
        wav_path = tmpf.name
    try:
        wavfile.write(wav_path, sampling_rate, audio)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        raise
    return wav_path


def _extract_video_path(result):
    """Pull the video path or URL out of the value returned by the Space."""
    if isinstance(result, tuple):
        # Prefer the first element; an empty tuple falls through to the error.
        first = result[0] if result else None
        if isinstance(first, dict) and "video" in first:
            return first["video"]
        if isinstance(first, str):
            return first
        # Otherwise look for any string in the tuple that looks like a video
        # file or is an existing path.
        for item in result:
            if isinstance(item, str) and (
                item.endswith(('.mp4', '.webm')) or os.path.exists(item)
            ):
                return item
        raise ValueError(f"Не удалось найти видео в результате: {result}")
    if isinstance(result, dict) and "video" in result:
        return result["video"]
    if isinstance(result, str):
        return result
    raise ValueError(f"Unexpected talking head result: {type(result)}, value: {result}")


def make_talking_head(image_path: str, audio_path: str, max_retries=3):
    """Call the talking-head Space and return the path or URL of the video.

    Args:
        image_path: Path to the lecturer image.
        audio_path: Path to the speech audio file.
        max_retries: Total number of attempts; must be at least 1. Failed
            attempts are retried after a 2 second pause.

    Returns:
        The video path/URL extracted from the Space result.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously the
            function silently returned ``None`` in that case).
        Exception: If every attempt failed; the last error is attached as
            the cause.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            client = Client(TALKING_HEAD_SPACE)
            result = client.predict(
                image_path=handle_file(image_path),
                audio_path=handle_file(audio_path),
                guidance_scale=3.0,
                steps=10,
                api_name="/process_image_audio"
            )

            # Debug output of the raw result.
            print(f"Result type: {type(result)}")
            print(f"Result content: {result}")

            return _extract_video_path(result)

        except Exception as e:
            if attempt == max_retries:
                raise Exception(f"Ошибка после {max_retries} попыток: {str(e)}") from e
            print(f"Попытка {attempt} не удалась: {e}. Повторяю через 2 секунды...")
            time.sleep(2)


# =========================
# Основные обработчики для Gradio
# =========================
def start_lesson(image: Image.Image, text: str, state):
    """Step 1: the lecturer reads the lecture text aloud.

    Saves the image to a temporary PNG, generates the quiz in advance (it is
    not voiced yet), synthesizes the lecture audio and requests the video.

    Args:
        image: Lecturer photo, or ``None`` when nothing was uploaded.
        text: Lecture text; must be non-blank and at most 500 characters.
        state: Current lesson state; returned unchanged on validation
            failure or error.

    Returns:
        A 4-tuple ``(video, status_message, button_update, state)`` matching
        the outputs wired to the start button. On success ``state`` is a new
        dict with keys ``image_path``, ``correct``, ``options``, ``question``
        and ``text``.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if image is None or not text or not text.strip() or len(text) > 500:
        return None, "Пожалуйста, загрузите фото и введите текст лекции (до 500 символов)", gr.update(visible=False), state

    image_path = None
    audio_path = None
    try:
        # Reserve a temp file name, close the handle, then save the image.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmpimg:
            image_path = tmpimg.name
        if image.mode != 'RGB':
            image = image.convert('RGB')
        image.save(image_path)

        # Generate the question up front (it is voiced later, in step 2).
        question, options, correct = generate_quiz(text)

        # The lecturer reads the lecture text.
        audio_path = synthesize_audio(text)
        video_path = make_talking_head(image_path, audio_path)

        # The image is kept on disk: later steps reuse it via the state.
        state_data = {
            'image_path': image_path,
            'correct': correct,
            'options': options,
            'question': question,
            'text': text
        }

        return video_path, "✅ Лекция прочитана. Нажмите 'Задать вопрос' для проверки знаний.", gr.update(visible=True), state_data

    except Exception as e:
        traceback.print_exc()
        # The lesson did not start, so no state will reference the new image.
        if image_path is not None:
            try:
                os.remove(image_path)
            except OSError:
                pass
        return None, f"❌ Ошибка: {e}", gr.update(visible=False), state

    finally:
        # Best-effort removal of the temporary audio, on success and failure.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass


def ask_question(state):
    """Step 2: the lecturer asks the question and reads both answer options.

    Args:
        state: Lesson state dict produced by ``start_lesson`` (keys
            ``image_path``, ``question``, ``options``).

    Returns:
        A 4-tuple ``(video, status_message, option1_update, option2_update)``
        matching the outputs wired to the question button. On error the
        video is ``None`` and both option buttons are hidden.
    """
    if not state:
        return None, "❌ Ошибка: сначала запустите урок", gr.update(visible=False), gr.update(visible=False)

    audio_path = None
    try:
        image_path = state.get('image_path')
        question = state.get('question')
        options = state.get('options', [])
        if len(options) < 2:
            # Report a readable message instead of "list index out of range".
            raise ValueError("в состоянии урока нет двух вариантов ответа")

        # Build the spoken text: the question followed by both options.
        quiz_text = f"{question}. Первый вариант: {options[0]}. Второй вариант: {options[1]}"

        # Generate the audio and the video for the question.
        audio_path = synthesize_audio(quiz_text)
        video_path = make_talking_head(image_path, audio_path)

        return (
            video_path,
            f"**Вопрос:** {question}",
            gr.update(value=options[0], visible=True),
            gr.update(value=options[1], visible=True)
        )

    except Exception as e:
        traceback.print_exc()
        return None, f"❌ Ошибка: {e}", gr.update(visible=False), gr.update(visible=False)

    finally:
        # Best-effort removal of the temporary audio, on success and failure.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass


def answer_selected(selected_option: str, state):
    """Step 3: the user picked an option; generate the lecturer's reaction.

    Args:
        selected_option: Text of the option the user chose.
        state: Lesson state dict (keys ``correct`` and ``image_path``).

    Returns:
        A ``(reaction_video, message)`` tuple. On error the video is
        ``None`` and the message describes the error.
    """
    if not state:
        return None, "❌ Ошибка: отсутствует состояние урока"

    audio_path = None
    try:
        correct = state.get('correct')
        image_path = state.get('image_path')

        # The spoken reaction is written in Russian (synthesize_audio
        # translates it); the on-screen message is already in Kazakh.
        if selected_option == correct:
            reaction_ru = "Правильно! Молодец!"
            display_message = "✅ Дұрыс! Жарайсың!"
        else:
            reaction_ru = f"Неправильно. Правильный ответ: {correct}"
            display_message = f"❌ Қате. Дұрыс жауап: {correct}"

        audio_path = synthesize_audio(reaction_ru)
        reaction_video = make_talking_head(image_path, audio_path)

        return reaction_video, display_message

    except Exception as e:
        traceback.print_exc()
        return None, f"❌ Ошибка: {e}"

    finally:
        # Best-effort removal of the temporary audio, on success and failure.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass


# =========================
# Gradio UI
# =========================
# Page heading and usage instructions rendered as Markdown at the top of the UI.
title = "🎓 Интерактивный бейне-лектор"
description = "\n".join([
    "**Как работает:**",
    "1. Загрузите фото лектора и введите текст лекции (русский, до 500 символов)",
    "2. Нажмите 'Запустить урок' — лектор прочитает текст",
    "3. Нажмите 'Задать вопрос' — лектор задаст вопрос с двумя вариантами ответа",
    "4. Выберите правильный ответ — лектор отреагирует на қазақша",
])

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        # Left column: inputs and the two step buttons.
        with gr.Column(scale=1):
            inp_image = gr.Image(type='pil', label='📸 Фото лектора')
            inp_text = gr.Textbox(
                lines=5,
                label='📝 Текст лекции (рус.)',
                placeholder='Введите текст лекции...',
                info="Максимум 500 символов"
            )
            btn_start = gr.Button("🚀 Запустить урок", variant="primary")
            # Hidden until start_lesson succeeds.
            btn_question = gr.Button("❓ Задать вопрос", visible=False, variant="secondary")

        # Right column: lecturer video, status, answer buttons and reaction.
        with gr.Column(scale=1):
            out_video = gr.Video(label='🎬 Видео лектора')
            out_status = gr.Markdown("ℹ️ Загрузите фото и текст, затем нажмите 'Запустить урок'")

            with gr.Row():
                # Hidden until ask_question fills in the option texts.
                btn_opt1 = gr.Button("Вариант 1", visible=False, size="lg")
                btn_opt2 = gr.Button("Вариант 2", visible=False, size="lg")

            out_reaction_video = gr.Video(label='🎥 Реакция лектора', visible=False)
            out_result = gr.Markdown("")

    # State carried between the three steps.
    lesson_state = gr.State({})

    # Step 1: start the lesson (the lecturer reads the text).
    btn_start.click(
        fn=start_lesson,
        inputs=[inp_image, inp_text, lesson_state],
        outputs=[out_video, out_status, btn_question, lesson_state]
    )

    # Step 2: ask the question.
    btn_question.click(
        fn=ask_question,
        inputs=[lesson_state],
        outputs=[out_video, out_status, btn_opt1, btn_opt2]
    )

    # Step 3: handle the answers.
    def _make_answer_handler(option_index):
        """Build the click handler for the answer button at ``option_index``."""
        def handle_answer(state):
            options = state.get('options', []) if state else []
            # Fall back to '' when the option is missing, so both buttons
            # behave the same way instead of one raising IndexError.
            option = options[option_index] if option_index < len(options) else ''
            video, msg = answer_selected(option, state)
            # The third value makes the reaction video component visible.
            return video, msg, gr.update(visible=True)
        return handle_answer

    handle_answer_1 = _make_answer_handler(0)
    handle_answer_2 = _make_answer_handler(1)

    btn_opt1.click(
        fn=handle_answer_1,
        inputs=[lesson_state],
        outputs=[out_reaction_video, out_result, out_reaction_video]
    )

    btn_opt2.click(
        fn=handle_answer_2,
        inputs=[lesson_state],
        outputs=[out_reaction_video, out_result, out_reaction_video]
    )

if __name__ == '__main__':
    # Start the Gradio server only when the file is run as a script.
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )