Spaces:

thewh1teagle
/

whisper-heb-ipa

Running

whisper-heb-ipa / app.py

thewh1teagle

latest

24a8315 3 months ago

5.09 kB


	"""
	Usage:
	wget https://github.com/thewh1teagle/phonikud-chatterbox/releases/download/asset-files-v1/female1.wav -O example1.wav

	# Run with the default model (openai/whisper-small)
	uv run src/infer.py

	# Or run with local checkpoint
	uv run src/infer.py --model ./whisper-heb-ipa/checkpoint-600

	# Or pass Whisper small explicitly (same as the default)
	uv run src/infer.py --model openai/whisper-small

	# Or with thewh1teagle/whisper-heb-ipa
	uv run src/infer.py --model thewh1teagle/whisper-heb-ipa
	"""


	import torch
	from transformers import pipeline
	import gradio as gr
	import argparse
	from pydub import AudioSegment
	from pydub.effects import normalize
	import tempfile
	import os

	def main():
	    """Build and launch the Gradio speech-transcription demo.

	    Reads ``--model`` from the command line, creates a Transformers
	    ``automatic-speech-recognition`` pipeline for that checkpoint, and
	    serves a two-tab Gradio UI (file upload and microphone) on
	    ``0.0.0.0:7860``.
	    """
	    parser = argparse.ArgumentParser(description="Whisper Transcription Demo")
	    parser.add_argument(
	        "--model",
	        type=str,
	        default="openai/whisper-small",
	        help="Model name or path for Whisper (default: openai/whisper-small)",
	    )
	    args = parser.parse_args()

	    MODEL_NAME = args.model
	    BATCH_SIZE = 8

	    # Use the first CUDA device when one is available, otherwise the CPU.
	    device = 0 if torch.cuda.is_available() else "cpu"

	    pipe = pipeline(
	        task="automatic-speech-recognition",
	        model=MODEL_NAME,
	        chunk_length_s=30,
	        device=device,
	    )

	    def remove_temp_file(path):
	        """Delete *path*, printing a warning instead of raising on failure."""
	        try:
	            os.unlink(path)
	        except FileNotFoundError:
	            # Already gone; nothing left to clean up.
	            pass
	        except OSError as e:
	            print(f"Warning: Could not delete temporary file {path}: {e}")

	    def normalize_audio(file_path):
	        """Write a volume-normalized WAV copy of *file_path* and return its path.

	        Normalization is best-effort: if loading, normalizing, or exporting
	        fails, a warning is printed and *file_path* is returned unchanged.
	        On success the caller is responsible for deleting the returned
	        temporary file.
	        """
	        temp_path = None
	        try:
	            audio = AudioSegment.from_file(file_path)
	            normalized_audio = normalize(audio)

	            # Reserve a unique name, then close the handle before exporting:
	            # reopening a still-open NamedTemporaryFile by name is
	            # platform-dependent.
	            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
	                temp_path = temp_file.name
	            normalized_audio.export(temp_path, format="wav")
	            return temp_path
	        except Exception as e:
	            print(f"Warning: Audio normalization failed: {e}")
	            # delete=False means nothing else removes a partially written
	            # file, so drop it here before falling back to the original.
	            if temp_path is not None:
	                remove_temp_file(temp_path)
	            return file_path

	    def transcribe(file, task):
	        """Run the pipeline on *file* and return the recognized text.

	        *file* is the audio path supplied by the Gradio ``Audio`` component
	        and *task* is forwarded to generation as ``generate_kwargs["task"]``.
	        Raises ``gr.Error`` when no audio was supplied.
	        """
	        if file is None:
	            # Submitting without recording or uploading yields None; fail
	            # early with a readable message instead of an opaque error.
	            raise gr.Error("No audio provided. Please record or upload an audio file first.")

	        normalized_file = normalize_audio(file)
	        try:
	            outputs = pipe(normalized_file, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
	            return outputs["text"]
	        finally:
	            # Only remove the file if normalization actually created one;
	            # never delete the caller's original input.
	            if normalized_file != file:
	                remove_temp_file(normalized_file)

	    demo = gr.Blocks(
	        css="""
	        .large-textbox textarea {
	            font-size: 20px !important;
	            line-height: 1.6 !important;
	        }
	        """
	    )

	    # Shared by both tabs so their wording cannot drift apart.
	    description = (
	        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
	        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
	        " of arbitrary length."
	    )

	    def make_task_selector():
	        """Return a fresh transcribe/translate selector for one interface."""
	        return gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")

	    def make_output_box():
	        """Return a fresh output textbox styled by the ``large-textbox`` CSS class."""
	        return gr.Textbox(
	            label="Transcription",
	            lines=6,
	            max_lines=15,
	            min_width=400,
	            show_copy_button=True,
	            placeholder="Transcribed text will appear here...",
	            elem_classes=["large-textbox"],
	        )

	    mic_transcribe = gr.Interface(
	        fn=transcribe,
	        inputs=[
	            gr.Audio(sources=["microphone", "upload"], type="filepath"),
	            make_task_selector(),
	        ],
	        outputs=make_output_box(),
	        theme="huggingface",
	        title="Whisper Demo: Transcribe Audio",
	        description=description,
	        allow_flagging="never",
	    )

	    file_transcribe = gr.Interface(
	        fn=transcribe,
	        inputs=[
	            gr.Audio(sources=["upload"], label="Audio file", type="filepath"),
	            make_task_selector(),
	        ],
	        outputs=make_output_box(),
	        theme="huggingface",
	        title="Whisper Demo: Transcribe Audio",
	        description=description,
	        examples=[
	            ["./example1.wav", "transcribe"],
	        ],
	        cache_examples=True,
	        allow_flagging="never",
	    )

	    with demo:
	        gr.TabbedInterface([file_transcribe, mic_transcribe], ["Transcribe Audio File", "Transcribe Microphone"])

	    demo.launch(server_name="0.0.0.0", server_port=7860)


	# Script entry point: start the demo only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()