# whisper-heb-ipa / app.py
# thewh1teagle
# latest
# 24a8315
"""
Usage:
wget https://github.com/thewh1teagle/phonikud-chatterbox/releases/download/asset-files-v1/female1.wav -O example1.wav
# Run with default HF model
uv run src/infer.py
# Or run with local checkpoint
uv run src/infer.py --model ./whisper-heb-ipa/checkpoint-600
# Or with whisper small
uv run src/infer.py --model openai/whisper-small
# Or with thewh1teagle/whisper-heb-ipa
uv run src/infer.py --model thewh1teagle/whisper-heb-ipa
"""
import torch
from transformers import pipeline
import gradio as gr
import argparse
from pydub import AudioSegment
from pydub.effects import normalize
import tempfile
import os
# Batch size passed to the ASR pipeline on every transcription call.
_BATCH_SIZE = 8

# Example clip offered in the "upload" tab (see the wget command in the module docstring).
_EXAMPLE_AUDIO = "./example1.wav"

# Enlarges the text inside components tagged with the "large-textbox" class.
_TEXTBOX_CSS = """
.large-textbox textarea {
    font-size: 20px !important;
    line-height: 1.6 !important;
}
"""


def _delete_temp_file(path):
    """Best-effort removal of a temporary file; failures are reported, never raised."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        # Already gone: nothing to clean up.
        pass
    except OSError as e:
        print(f"Warning: Could not delete temporary file {path}: {e}")


def _normalize_audio(file_path):
    """Write a volume-normalized WAV copy of ``file_path`` and return its path.

    Normalization is best-effort: on any failure a warning is printed and the
    original ``file_path`` is returned unchanged. When a new path is returned,
    the caller owns that temporary file and is responsible for deleting it.
    """
    temp_path = None
    try:
        audio = AudioSegment.from_file(file_path)
        normalized_audio = normalize(audio)
        # Reserve a unique name, then close the handle before exporting so the
        # file is not held open twice (reopening an open temp file by name
        # fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_path = temp_file.name
        # export() hands back the open output file; close it right away.
        normalized_audio.export(temp_path, format="wav").close()
        return temp_path
    except Exception as e:  # deliberate catch-all: fall back to the raw input
        print(f"Warning: Audio normalization failed: {e}")
        if temp_path is not None:
            # Do not leak the reserved temp file when the export failed.
            _delete_temp_file(temp_path)
        return file_path


def _build_task_radio():
    """Return a fresh transcribe/translate selector (one instance per interface)."""
    return gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")


def _build_output_textbox():
    """Return a fresh output textbox (one instance per interface)."""
    return gr.Textbox(
        label="Transcription",
        lines=6,
        max_lines=15,
        min_width=400,
        show_copy_button=True,
        placeholder="Transcribed text will appear here...",
        elem_classes=["large-textbox"],
    )


def main():
    """Parse CLI arguments, load the Whisper pipeline and serve the Gradio demo.

    Command-line options:
        --model: model name or local path handed to the transformers
            ASR pipeline (default: ``openai/whisper-small``).

    The demo has two tabs (file upload and microphone) and is served on
    0.0.0.0:7860.
    """
    parser = argparse.ArgumentParser(description="Whisper Transcription Demo")
    parser.add_argument(
        "--model",
        type=str,
        default="openai/whisper-small",
        help="Model name or path for Whisper (default: openai/whisper-small)",
    )
    args = parser.parse_args()
    model_name = args.model

    # Use the first CUDA device when available, otherwise run on CPU.
    device = 0 if torch.cuda.is_available() else "cpu"

    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=device,
    )

    def transcribe(file, task):
        """Transcribe (or translate) the audio at ``file`` and return the text.

        Raises:
            gr.Error: if no audio was provided, so the UI shows a clear
                message instead of an opaque pipeline failure.
        """
        if file is None:
            raise gr.Error(
                "No audio file submitted! Please upload or record an audio file "
                "before submitting your request."
            )

        normalized_file = _normalize_audio(file)
        try:
            outputs = pipe(
                normalized_file,
                batch_size=_BATCH_SIZE,
                generate_kwargs={"task": task},
            )
            return outputs["text"]
        finally:
            # Only remove the file we created; never the user's input.
            if normalized_file != file:
                _delete_temp_file(normalized_file)

    description = (
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{model_name}](https://huggingface.co/{model_name}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    )

    # Offer the example clip only when it is actually present: example caching
    # runs at start-up and a missing file would otherwise abort the launch.
    if os.path.exists(_EXAMPLE_AUDIO):
        examples = [[_EXAMPLE_AUDIO, "transcribe"]]
    else:
        print(f"Warning: Example audio {_EXAMPLE_AUDIO} not found; starting without examples.")
        examples = None

    demo = gr.Blocks(css=_TEXTBOX_CSS)

    mic_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(sources=["microphone", "upload"], type="filepath"),
            _build_task_radio(),
        ],
        outputs=_build_output_textbox(),
        theme="huggingface",
        title="Whisper Demo: Transcribe Audio",
        description=description,
        allow_flagging="never",
    )

    file_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(sources=["upload"], label="Audio file", type="filepath"),
            _build_task_radio(),
        ],
        outputs=_build_output_textbox(),
        theme="huggingface",
        title="Whisper Demo: Transcribe Audio",
        description=description,
        examples=examples,
        cache_examples=examples is not None,
        allow_flagging="never",
    )

    with demo:
        gr.TabbedInterface(
            [file_transcribe, mic_transcribe],
            ["Transcribe Audio File", "Transcribe Microphone"],
        )

    demo.launch(server_name="0.0.0.0", server_port=7860)
# Script entry point: start the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()