Spaces:

AlserFurma
/

LipSyncAI

Running

App Files Community

AlserFurma committed 21 days ago

Commit

4adabcc

verified ·

1 Parent(s): c06b1cb

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -372

app.py CHANGED Viewed

@@ -2,391 +2,93 @@ import gradio as gr
 import os
 from PIL import Image
 import tempfile
-from gradio_client import Client, handle_file
 import torch
 from transformers import VitsModel, AutoTokenizer
 import scipy.io.wavfile as wavfile
 import traceback
-import base64
-import random
-# Принудительно CPU и минимальное использование памяти
-os.environ['CUDA_VISIBLE_DEVICES'] = ''
-torch.set_num_threads(2) # Ограничение потоков CPU
-device = "cpu"
-print(f"Using device: {device} (optimized mode)")
-# Глобальные переменные
-tts_model = None
-tts_tokenizer = None
-TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
-def load_tts_model():
-    """Загрузка только TTS модели"""
-    global tts_model, tts_tokenizer
-    if tts_model is None:
-        print("Загрузка TTS модели (казахский)...")
-        tts_model = VitsModel.from_pretrained(
-            "facebook/mms-tts-kaz",
-            torch_dtype=torch.float32,
-            low_cpu_mem_usage=True
-        )
-        tts_model.eval() # Режим инференса
-        tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
-        print("✓ TTS модель загружена")
     return True
-def simple_translate_to_kazakh(russian_text):
-    """
-    Упрощенная транслитерация/перевод без тяжелых моделей
-    Для реального использования нужна легкая модель или API
-    """
-    # Простая замена для базовых слов (демо)
-    translations = {
-        'привет': 'сәлем',
-        'здравствуйте': 'сәлеметсіздер ме',
-        'спасибо': 'рахмет',
-        'пожалуйста': 'өтінемін',
-        'да': 'иә',
-        'нет': 'жоқ',
-        'сегодня': 'бүгін',
-        'завтра': 'ертең',
-        'математика': 'математика',
-        'физика': 'физика',
-        'урок': 'сабақ',
-        'лекция': 'дәріс',
-        'студент': 'студент',
-        'учитель': 'мұғалім',
-        'школа': 'мектеп',
-        'университет': 'университет',
-        'знание': 'білім',
-        'книга': 'кітап',
-        'вопрос': 'сұрақ',
-        'ответ': 'жауап'
     }
-    text_lower = russian_text.lower()
-    result = russian_text
-    for ru, kk in translations.items():
-        result = result.replace(ru, kk)
-        result = result.replace(ru.capitalize(), kk.capitalize())
-    return result
-def inference(image: Image.Image, text: str):
-    error_msg = ""
-    video_path = None
-    audio_path = None
-    img_path = None
     try:
-        # Загрузка TTS
-        if not load_tts_model():
-            raise RuntimeError("Не удалось загрузить TTS модель")
-        # Валидация
-        if image is None:
-            raise ValueError("Загрузите изображение лектора!")
-        if not text or not text.strip():
-            raise ValueError("Введите текст лекции!")
-        if len(text) > 500:
-            raise ValueError("Текст слишком длинный! Максимум 500 символов.")
-        print(f"Входной текст: '{text[:50]}...'")
-        # Простой перевод на казахский
-        translated_text = simple_translate_to_kazakh(text)
-        print(f"Переведенный текст: '{translated_text[:50]}...'")
-        # Генерация аудио с оптимизацией памяти
-        print("Генерация аудио...")
         with torch.no_grad():
-            inputs = tts_tokenizer(translated_text, return_tensors="pt", truncation=True, max_length=512)
-            # Освобождение памяти перед генерацией
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            output = tts_model(**inputs)
-            waveform = output.waveform.squeeze().cpu().numpy()
-            # Очистка
-            del inputs, output
-        if waveform.size == 0:
-            raise ValueError("TTS сгенерировал пустое аудио!")
-        # Сохранение аудио
-        audio = (waveform * 32767).astype("int16")
-        sampling_rate = tts_model.config.sampling_rate
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
-            wavfile.write(audio_file.name, sampling_rate, audio)
-            audio_path = audio_file.name
-        print(f"✓ Аудио: {audio_path} ({len(waveform)/sampling_rate:.1f} сек)")
-        # Оптимизация изображения
-        print("Обработка изображения...")
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-        # Уменьшаем размер если слишком большое (экономия памяти)
-        max_size = 1024
-        if max(image.size) > max_size:
-            ratio = max_size / max(image.size)
-            new_size = tuple(int(dim * ratio) for dim in image.size)
-            image = image.resize(new_size, Image.Resampling.LANCZOS)
-            print(f"Изображение уменьшено до {new_size}")
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as img_file:
-            image.save(img_file.name, format='PNG', optimize=True)
-            img_path = img_file.name
-        print(f"✓ Изображение: {img_path}")
-        # Вызов Talking Head API
-        print(f"Подключение к {TALKING_HEAD_SPACE}...")
-        client = Client(TALKING_HEAD_SPACE, verbose=False)
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
-            guidance_scale=2.5, # Снижено для скорости
-            steps=8, # Меньше шагов = быстрее
             api_name="/process_image_audio"
         )
-        # Обработка результата
-        if isinstance(result, tuple) and len(result) > 0:
-            video_data = result[0]
-            if isinstance(video_data, dict):
-                video_path = video_data.get('video') or video_data.get('path')
-            elif isinstance(video_data, str):
-                video_path = video_data
-            else:
-                video_path = str(video_data)
-        elif isinstance(result, str):
-            video_path = result
-        else:
-            raise ValueError("Неизвестный формат результата от API")
-        if not video_path or not os.path.exists(video_path):
-            raise ValueError("Видео не сгенерировано!")
-        print(f"✓ Видео: {video_path}")
-        error_msg = "✅ Бейне сәтті жасалды!"
-    except Exception as e:
-        error_msg = f"❌ Қате: {str(e)}"
-        print(f"ОШИБКА: {error_msg}")
-        traceback.print_exc()
-    finally:
-        # Очистка временных файлов
-        for path in [audio_path, img_path]:
-            if path and os.path.exists(path):
-                try:
-                    os.remove(path)
-                except:
-                    pass
-    return video_path, error_msg
-def generate_interactive_lesson(text, video_path):
-    """Упрощенная версия без тяжелых моделей QA"""
-    try:
-        if not video_path or not os.path.exists(video_path):
-            return "<p style='color: red;'>❌ Алдымен бейнені жасаңыз!</p>"
-        # Простая генерация вопросов без ML моделей
-        sentences = text.split('.')[:3] # Первые 3 предложения
-        questions = []
-        for i, sent in enumerate(sentences):
-            sent = sent.strip()
-            if len(sent) < 10:
-                continue
-            # Простые шаблоны вопросов
-            words = sent.split()
-            if len(words) < 3:
-                continue
-            # Генерируем вопрос на основе шаблона
-            question_templates = [
-                f"Не сказано о {words[0].lower()}?",
-                f"Что упоминается в тексте о {words[1].lower() if len(words) > 1 else 'теме'}?",
-                f"Какая информация дана о {words[2].lower() if len(words) > 2 else 'содержании'}?"
-            ]
-            question = random.choice(question_templates)
-            # Правильный ответ - часть предложения
-            correct = ' '.join(words[:min(5, len(words))])
-            # Неправильные ответы
-            wrong_options = [
-                "Бұл туралы айтылмаған",
-                "Мәтінде жоқ",
-                "Дұрыс емес жауап"
-            ]
-            wrong = random.choice(wrong_options)
-            questions.append({
-                "question": question,
-                "correct": correct,
-                "wrong": wrong
-            })
-        if not questions:
-            # Создаем хотя бы один вопрос
-            questions.append({
-                "question": "Дәрістің негізгі тақырыбы не?",
-                "correct": text.split('.')[0][:50] if text else "Білім",
-                "wrong": "Спорт туралы"
-            })
-        # Base64 видео (оптимизировано)
-        print("Кодирование видео в base64...")
-        with open(video_path, 'rb') as f:
-            video_data = f.read()
-            # Проверка размера
-            if len(video_data) > 50 * 1024 * 1024: # 50MB
-                return "<p style='color: orange;'>⚠️ Видео слишком большое для встраивания. Скачайте его отдельно.</p>"
-            video_base64 = base64.b64encode(video_data).decode('utf-8')
-        # Минимальный HTML
-        html = f"""<!DOCTYPE html>
-<html>
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Интерактивті сабақ</title>
-    <style>
-        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
-        body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 15px; background: #f5f5f5; }}
-        h1 {{ color: #333; text-align: center; margin: 20px 0; font-size: 24px; }}
-        video {{ width: 100%; max-width: 600px; display: block; margin: 20px auto; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
-        .text {{ background: white; padding: 15px; margin: 20px 0; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
-        .q {{ background: white; padding: 15px; margin: 15px 0; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
-        button {{ background: #4CAF50; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; margin-top: 10px; }}
-        button:hover {{ background: #45a049; }}
-        .fb {{ margin-top: 10px; padding: 8px; border-radius: 5px; font-weight: bold; }}
-        label {{ cursor: pointer; }}
-    </style>
-</head>
-<body>
-    <h1>📚 Интерактивті сабақ</h1>
-    <video controls><source src="data:video/mp4;base64,{video_base64}" type="video/mp4"></video>
-    <div class="text"><strong>Дәріс мәтіні:</strong> {text[:500]}</div>
-    <h2 style="text-align:center; margin: 20px 0;">Тесттер:</h2>
-"""
-        for i, q in enumerate(questions):
-            ca = q['correct'].replace("'", "\\'").replace('"', '&quot;')
-            html += f"""
-    <div class="q">
-        <p><strong>Сұрақ {i+1}:</strong> {q['question']}</p>
-        <div style="margin: 10px 0;">
-            <input type="radio" name="q{i}" value="c" id="c{i}">
-            <label for="c{i}">{q['correct']}</label><br>
-            <input type="radio" name="q{i}" value="w" id="w{i}">
-            <label for="w{i}">{q['wrong']}</label>
-        </div>
-        <button onclick="check({i},'{ca}')">Тексеру</button>
-        <div class="fb" id="fb{i}"></div>
-    </div>
-"""
-        html += """
-    <script>
-    function check(i, c) {
-        var s = document.querySelector('input[name="q'+i+'"]:checked');
-        var f = document.getElementById('fb'+i);
-        if(!s) { f.innerHTML='⚠️ Жауап таңдаңыз!'; f.style.background='#fff3cd'; f.style.color='#856404'; return; }
-        if(s.value==='c') { f.innerHTML='✅ Дұрыс!'; f.style.background='#d4edda'; f.style.color='#155724'; }
-        else { f.innerHTML='❌ Қате. Дұрыс: '+c; f.style.background='#f8d7da'; f.style.color='#721c24'; }
-    }
-    </script>
-</body>
-</html>"""
-        escaped = html.replace('\\', '\\\\').replace('`', '\\`').replace('${', '\\${')
-        return f"""
-<div style="text-align:center; padding: 20px; background: white; border-radius: 8px;">
-    <h3 style="color: #2c3e50;">✅ Интерактивті сабақ дайын!</h3>
-    <button onclick="var w=window.open('','_blank');w.document.write(`{escaped}`);w.document.close();"
-            style="background: #27ae60; color: white; padding: 15px 30px; font-size: 16px; border: none;
-            border-radius: 8px; cursor: pointer; margin-top: 15px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
-        📖 Интерактивті сабақты ашу
-    </button>
-</div>
-"""
     except Exception as e:
         traceback.print_exc()
-        return f"<p style='color: red;'>❌ Қате: {str(e)}</p>"
-# Интерфейс
-with gr.Blocks(theme=gr.themes.Soft(), title="Бейне Оқытушы", css="""
-    .gradio-container {max-width: 1200px !important;}
-    footer {display: none !important;}
-""") as iface:
-    gr.Markdown("""
-    # 🎓 Бейне Оқытушы (CPU Оптимизацияланған)
-    **Қалай пайдалану:**
-    1. 📸 Суретіңізді жүктеңіз (бет анық көрінетін)
-    2. 📝 Дәріс мәтінін орыс тілінде енгізіңіз (500 таңбаға дейін)
-    3. 🎬 "Бейнені жасау" батырмасын басыңыз
-    4. 📚 Дайын болғаннан кейін "Интерактивті сабақ" жасай аласыз
-    ⚡ **Ескерту:** CPU режимінде жұмыс істейді, генерация 1-3 минут алуы мүмкін.
-    """)
     with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="📸 Дәріскер суреті")
-            text_input = gr.Textbox(
-                lines=6,
-                placeholder="Мысалы: Сәлеметсіздер ме! Бүгін біз математика туралы сөйлесеміз...",
-                label="📝 Дәріс мәтіні (орыс тілінде)"
-            )
-            generate_btn = gr.Button("🎬 Бейнені жасау", variant="primary", size="lg")
-        with gr.Column(scale=1):
-            video_output = gr.Video(label="🎬 Дайын бейне")
-            status = gr.Textbox(label="ℹ️ Мәртебе", interactive=False)
-    interactive_btn = gr.Button("📚 Интерактивті сабақ жасау", visible=False, variant="secondary")
-    lesson_output = gr.HTML(value="", label="Интерактивті сабақ", visible=False)
-    def show_lesson_btn(video, status_msg):
-        return gr.update(visible=bool(video and "✅" in status_msg))
-    generate_btn.click(
-        inference,
-        inputs=[image_input, text_input],
-        outputs=[video_output, status]
-    ).then(
-        show_lesson_btn,
-        inputs=[video_output, status],
-        outputs=interactive_btn
-    )
-    interactive_btn.click(
-        generate_interactive_lesson,
-        inputs=[text_input, video_output],
-        outputs=lesson_output
-    ).then(
-        lambda: gr.update(visible=True),
-        outputs=lesson_output
-    )
-if __name__ == "__main__":
-    try:
-        iface.launch(
-            server_name="0.0.0.0",
-            server_port=7860
-        )
-    except Exception as e:
-        print(f"Launch error: {str(e)}")

 import os
 from PIL import Image
 import tempfile
 import torch
 from transformers import VitsModel, AutoTokenizer
 import scipy.io.wavfile as wavfile
+from gradio_client import Client, handle_file
 import traceback
+# Только CPU
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+torch.set_num_threads(4)
+TALKING_HEAD = "Skywork/skyreels-a1-talking-head"
+model = None
+tokenizer = None
+def load_tts():
+    global model, tokenizer
+    if model is None:
+        print("Загружаем TTS (каз)…")
+        model = VitsModel.from_pretrained("facebook/mms-tts-kaz")
+        tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
+        print("TTS готова")
     return True
+def ru_to_kz_simple(text: str) -> str:
+    rep = {
+        "привет": "сәлем", "здравствуйте": "сәлеметсіз бе", "спасибо": "рахмет",
+        "да": "иә", "нет": "жоқ", "сегодня": "бүгін", "завтра": "ертең",
+        "урок": "сабақ", "лекция": "дәріс", "учитель": "мұғалім", "школа": "мектеп"
     }
+    for ru, kz in rep.items():
+        text = text.replace(ru, kz).replace(ru.capitalize(), kz.capitalize())
+    return text
+def create_video(image: Image.Image, text: str):
+    if not image or not text.strip():
+        return None, "Загрузите фото и введите текст!"
+    load_tts()
+    text_kz = ru_to_kz_simple(text.strip())
     try:
+        # TTS
+        inputs = tokenizer(text_kz, return_tensors="pt")
         with torch.no_grad():
+            waveform = model(**inputs).waveform.squeeze().cpu().numpy()
+        rate = model.config.sampling_rate
+        audio_path = "/tmp/audio.wav"
+        wavfile.write(audio_path, rate, (waveform * 32767).astype("int16"))
+        # Изображение
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        img_path = "/tmp/img.png"
+        image.save(img_path)
+        # Talking head
+        client = Client(TALKING_HEAD)
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
+            guidance_scale=2.0,
+            steps=8,
             api_name="/process_image_audio"
         )
+        video_path = result[0] if isinstance(result, (list, tuple)) else result
+        return video_path, "Бейне дайын!"
     except Exception as e:
         traceback.print_exc()
+        return None, f"Қате: {e}"
+# === Интерфейс ===
+with gr.Blocks(title="Бейне-лектор қазақша") as app:
+    gr.Markdown("# Бейне-лектор қазақша\nФото + текст → говорящий видео-лектор")
     with gr.Row():
+        with gr.Column():
+            img_in = gr.Image(label="Фото лектора", type="pil")
+            txt_in = gr.Textbox(label="Текст лекции (русский)", lines=6, placeholder="Привет! Сегодня мы изучаем математику…")
+            btn = gr.Button("Сделать видео", variant="primary")
+        with gr.Column():
+            video_out = gr.Video(label="Готовое видео")
+            status = gr.Textbox(label="Статус", interactive=False)
+    btn.click(create_video, [img_in, txt_in], [video_out, status])
+app.launch(server_name="0.0.0.0", server_port=7860)