"""
Usage:
wget https://github.com/thewh1teagle/phonikud-chatterbox/releases/download/asset-files-v1/female1.wav -O example1.wav

# Run with default HF model
uv run src/infer.py

# Or run with local checkpoint
uv run src/infer.py --model ./whisper-heb-ipa/checkpoint-600

# Or with whisper small
uv run src/infer.py --model openai/whisper-small

# Or with thewh1teagle/whisper-heb-ipa
uv run src/infer.py --model thewh1teagle/whisper-heb-ipa
"""


import torch
from transformers import pipeline
import gradio as gr
import argparse
from pydub import AudioSegment
from pydub.effects import normalize
import tempfile
import os

def _discard_file(path):
    """Delete ``path`` on a best-effort basis.

    A file that is already gone is ignored; any other OS-level failure is
    printed as a warning instead of being raised, so cleanup never masks the
    result (or the error) of the transcription that preceded it.
    """
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"Warning: Could not delete temporary file {path}: {e}")


def _normalize_audio(file_path):
    """Write a volume-normalized WAV copy of ``file_path`` and return its path.

    Normalization is best-effort: if loading, normalizing or exporting fails,
    a warning is printed, any partially written temporary file is removed and
    the original ``file_path`` is returned so transcription can still be tried.

    Args:
        file_path: Path of the audio file to normalize.

    Returns:
        Path of a newly created temporary ``.wav`` file (the caller is
        responsible for deleting it), or ``file_path`` itself on failure.
    """
    temp_path = None
    try:
        audio = AudioSegment.from_file(file_path)
        normalized_audio = normalize(audio)

        # Reserve a unique path and close our descriptor *before* exporting,
        # so the export is not writing to a file we still hold open.
        fd, temp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        # export() hands back the open output handle; close it explicitly so
        # the file can be deleted later without relying on garbage collection.
        normalized_audio.export(temp_path, format="wav").close()
        return temp_path
    except Exception as e:
        # Deliberately broad: decoding/export can fail in many ways and the
        # original file is always an acceptable fallback.
        print(f"Warning: Audio normalization failed: {e}")
        if temp_path is not None:
            # Do not leak the temp file when the failure happened mid-export.
            _discard_file(temp_path)
        return file_path


def main():
    """Parse CLI arguments, load the Whisper pipeline and serve the Gradio demo.

    The ``--model`` argument selects the checkpoint (name or local path). The
    UI offers two tabs (file upload and microphone) that share the same
    transcription function, and is served on 0.0.0.0:7860.
    """
    parser = argparse.ArgumentParser(description="Whisper Transcription Demo")
    parser.add_argument(
        "--model",
        type=str,
        default="openai/whisper-small",
        help="Model name or path for Whisper (default: openai/whisper-small)",
    )
    args = parser.parse_args()

    model_name = args.model
    batch_size = 8

    # First CUDA device when available, otherwise CPU.
    device = 0 if torch.cuda.is_available() else "cpu"

    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=device,
    )

    def transcribe(file, task):
        """Return the text the pipeline produces for ``file`` using ``task``.

        Args:
            file: Path of the recorded/uploaded audio, or ``None`` when the
                user submitted the form without providing audio.
            task: Either ``"transcribe"`` or ``"translate"``; forwarded to the
                pipeline's generation kwargs.

        Raises:
            gr.Error: If no audio was provided.
        """
        if not file:
            # Surface a clear message in the UI instead of an opaque
            # pipeline failure on a missing input.
            raise gr.Error("No audio provided. Please record or upload an audio file first.")

        normalized_file = _normalize_audio(file)
        try:
            outputs = pipe(normalized_file, batch_size=batch_size, generate_kwargs={"task": task})
            return outputs["text"]
        finally:
            # Only remove the file we created; never the user's original input.
            if normalized_file != file:
                _discard_file(normalized_file)

    def task_selector():
        # A new Radio per call so the two interfaces do not share one component.
        return gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")

    def transcription_box():
        # A new Textbox per call so the two interfaces do not share one component.
        return gr.Textbox(
            label="Transcription",
            lines=6,
            max_lines=15,
            min_width=400,
            show_copy_button=True,
            placeholder="Transcribed text will appear here...",
            elem_classes=["large-textbox"],
        )

    title = "Whisper Demo: Transcribe Audio"
    description = (
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{model_name}](https://huggingface.co/{model_name}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    )

    # The example clip is an optional download (see the usage notes at the top
    # of the file); without it the demo still starts, just without examples.
    example_path = "./example1.wav"
    if os.path.isfile(example_path):
        examples = [[example_path, "transcribe"]]
    else:
        print(f"Warning: Example file {example_path} not found; starting without examples.")
        examples = None

    demo = gr.Blocks(
        css="""
        .large-textbox textarea {
            font-size: 20px !important;
            line-height: 1.6 !important;
        }
        """
    )

    mic_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(sources=["microphone", "upload"], type="filepath"),
            task_selector(),
        ],
        outputs=transcription_box(),
        theme="huggingface",
        title=title,
        description=description,
        allow_flagging="never",
    )

    file_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(sources=["upload"], label="Audio file", type="filepath"),
            task_selector(),
        ],
        outputs=transcription_box(),
        theme="huggingface",
        title=title,
        description=description,
        examples=examples,
        # Caching only makes sense when there is an example to cache.
        cache_examples=examples is not None,
        allow_flagging="never",
    )

    with demo:
        gr.TabbedInterface([file_transcribe, mic_transcribe], ["Transcribe Audio File", "Transcribe Microphone"])

    # NOTE(review): binds to all network interfaces; confirm this exposure is intended.
    demo.launch(server_name="0.0.0.0", server_port=7860)


# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()