Spaces:

Successmove
/

XTTS

Runtime error

App Files Files Community

XTTS / app.py

Successmove

Create app.py

08a0d1e verified 4 months ago

raw

history blame contribute delete

4.32 kB

	import gradio as gr
	import torch
	import tempfile
	import os
	from TTS.api import TTS

	# Pick the compute device: "cuda" when torch reports CUDA as available, else "cpu"
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"Using device: {device}")

	# Load the XTTS v2 model by name through the TTS API, then move it to that device
	tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
	tts = tts.to(device)

	# Hard-coded language codes offered in the UI's language dropdown
	supported_languages = [
	    "en", "es", "fr", "de",
	    "it", "pt", "pl", "tr",
	    "ru", "nl", "cs", "ar",
	    "zh-cn", "ja", "hu", "ko",
	]

	def _resolve_builtin_speaker(voice_preset):
	    """Return a built-in speaker name to use without reference audio, or None."""
	    # Only a non-empty string can name a speaker; any other value is ignored
	    # so that a stray non-string argument cannot be forwarded as a speaker.
	    if isinstance(voice_preset, str) and voice_preset.strip():
	        return voice_preset.strip()
	    try:
	        available = list(tts.speakers or [])
	    except AttributeError:
	        # NOTE(review): some TTS releases appear to expose no speaker list
	        # for XTTS -- confirm against the installed version.
	        available = []
	    return available[0] if available else None


	def generate_speech(
	    text,
	    language,
	    speaker_wav=None,
	    voice_preset=None,
	    speed=1.0,
	    temperature=0.7
	):
	    """Synthesize ``text`` with the module-level XTTS model into a WAV file.

	    Args:
	        text: Text to speak; must contain a non-whitespace character.
	        language: Language code forwarded to ``tts.tts_to_file``.
	        speaker_wav: Optional path to a reference recording. When given it
	            is forwarded as ``speaker_wav`` (voice cloning).
	        voice_preset: Optional built-in speaker name, used only when
	            ``speaker_wav`` is None. If omitted, the first entry of
	            ``tts.speakers`` is used when one is available.
	        speed: Forwarded to ``tts.tts_to_file`` as ``speed``.
	        temperature: Forwarded to ``tts.tts_to_file`` as ``temperature``.

	    Returns:
	        Path of the temporary ``.wav`` file that was written. The file is
	        created with ``delete=False``, so it is not removed here on success.

	    Raises:
	        gr.Error: If ``text`` is blank or synthesis fails.
	    """
	    if not text or not str(text).strip():
	        raise gr.Error("Please enter some text to convert to speech.")

	    synthesis_kwargs = {
	        "text": text,
	        "language": language,
	        "speed": speed,
	        "temperature": temperature,
	    }
	    if speaker_wav is not None:
	        # A reference recording takes priority over any preset voice.
	        synthesis_kwargs["speaker_wav"] = speaker_wav
	    else:
	        # NOTE(review): XTTS v2 is multi-speaker and the Coqui API is
	        # expected to reject a call with neither `speaker_wav` nor
	        # `speaker` -- confirm. When no built-in speaker can be resolved
	        # the call proceeds without one, as before.
	        speaker = _resolve_builtin_speaker(voice_preset)
	        if speaker is not None:
	            synthesis_kwargs["speaker"] = speaker

	    # Reserve an output path; the handle is closed before the model writes.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	        output_path = tmp_file.name

	    try:
	        tts.tts_to_file(file_path=output_path, **synthesis_kwargs)
	    except Exception as e:
	        # Do not leave an empty/partial temp file behind on failure.
	        if os.path.exists(output_path):
	            os.unlink(output_path)
	        raise gr.Error(f"Error generating speech: {e}") from e
	    return output_path

	def _generate_from_ui(text, language, speaker_wav, speed, temperature):
	    """Adapt the five UI inputs to ``generate_speech`` using keywords.

	    ``generate_speech`` takes ``voice_preset`` as its fourth positional
	    parameter, so passing the five inputs positionally would put the speed
	    slider into ``voice_preset`` and the temperature slider into ``speed``.
	    """
	    return generate_speech(
	        text,
	        language,
	        speaker_wav=speaker_wav,
	        speed=speed,
	        temperature=temperature,
	    )


	# Create Gradio interface
	with gr.Blocks(title="XTTS Text-to-Speech") as demo:
	    gr.Markdown("# XTTS Text-to-Speech Generator")
	    gr.Markdown("Generate speech from text with voice cloning capabilities using XTTS v2")

	    with gr.Row():
	        # Left column: inputs and controls
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="Input Text",
	                placeholder="Enter text to convert to speech...",
	                lines=3
	            )

	            language_input = gr.Dropdown(
	                label="Language",
	                # Plain strings: label and value are the same code.
	                choices=supported_languages,
	                value="en",
	                info="Select the language for synthesis"
	            )

	            # gr.Audio has no `info` keyword (passing it raises TypeError),
	            # so the hint is rendered as a Markdown line below instead.
	            speaker_wav_input = gr.Audio(
	                label="Reference Voice (Optional)",
	                type="filepath"
	            )
	            gr.Markdown("Upload a 3-10 second audio sample for voice cloning")

	            with gr.Accordion("Advanced Settings", open=False):
	                speed_input = gr.Slider(
	                    label="Speed",
	                    minimum=0.5,
	                    maximum=2.0,
	                    value=1.0,
	                    step=0.1,
	                    info="Speech speed (0.5 = slow, 2.0 = fast)"
	                )

	                temperature_input = gr.Slider(
	                    label="Temperature",
	                    minimum=0.1,
	                    maximum=1.0,
	                    value=0.7,
	                    step=0.1,
	                    info="Voice variability (lower = more deterministic)"
	                )

	            generate_btn = gr.Button("Generate Speech", variant="primary")

	        # Right column: synthesized result
	        with gr.Column():
	            audio_output = gr.Audio(
	                label="Generated Speech",
	                type="filepath"
	            )

	    gr.Examples(
	        examples=[
	            ["Hello, world! This is a sample text to speech generation.", "en"],
	            ["Bonjour, comment allez-vous aujourd'hui?", "fr"],
	            ["Hola, ¿cómo estás?", "es"],
	        ],
	        inputs=[text_input, language_input],
	        outputs=audio_output,
	        fn=generate_speech,
	        # Eager caching would run three syntheses while the UI is being
	        # built, so any synthesis failure would stop the app from starting.
	        # Generate on click instead.
	        cache_examples=False,
	        run_on_click=True
	    )

	    generate_btn.click(
	        fn=_generate_from_ui,
	        inputs=[
	            text_input,
	            language_input,
	            speaker_wav_input,
	            speed_input,
	            temperature_input
	        ],
	        outputs=audio_output
	    )

	# Start the Gradio server only when this file is run as a script
	if __name__ == "__main__":
	    # Host and port handed to demo.launch: 0.0.0.0 on port 7860
	    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    demo.launch(**launch_options)