# XTTS / app.py
# Successmove's picture
# Create app.py
# 08a0d1e verified
import gradio as gr
import torch
import tempfile
import os
from TTS.api import TTS
# Run on the GPU when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the multilingual XTTS v2 model, then move it onto the selected device.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts = tts.to(device)

# Language codes offered in the UI's language dropdown.
supported_languages = [
    "en", "es", "fr", "de",
    "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar",
    "zh-cn", "ja", "hu", "ko",
]
def generate_speech(
    text,
    language,
    speaker_wav=None,
    voice_preset=None,
    speed=1.0,
    temperature=0.7
):
    """
    Generate speech from text using the XTTS model.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        language: Language code forwarded to the model.
        speaker_wav: Optional path to a reference recording used for voice
            cloning. Takes precedence over ``voice_preset``.
        voice_preset: Optional name of a built-in speaker, used only when no
            reference recording is supplied.
        speed: Speed value forwarded to the model.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        Path of the temporary ``.wav`` file holding the generated audio.

    Raises:
        gr.Error: If ``text`` is empty or synthesis fails.
    """
    # Reject empty input before a temporary file is created for it.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    synthesis_kwargs = {
        "text": text,
        "language": language,
        "speed": speed,
        "temperature": temperature,
    }
    if speaker_wav is not None:
        # A reference recording was supplied: clone that voice.
        synthesis_kwargs["speaker_wav"] = speaker_wav
    elif isinstance(voice_preset, str) and voice_preset.strip():
        # Only a real, non-blank name is treated as a speaker preset.
        synthesis_kwargs["speaker"] = voice_preset
    else:
        # NOTE(review): XTTS v2 is expected to be multi-speaker, and the Coqui
        # API is expected to refuse synthesis when neither a speaker name nor
        # a reference clip is given -- confirm against the installed TTS
        # version. Fall back to the first built-in voice when one is exposed;
        # otherwise the call is made exactly as before.
        builtin_speakers = list(getattr(tts, "speakers", None) or [])
        if builtin_speakers:
            synthesis_kwargs["speaker"] = builtin_speakers[0]

    # delete=False keeps the file on disk after the handle closes so the
    # model can write to the path and the caller can read it afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_path = tmp_file.name

    try:
        tts.tts_to_file(file_path=output_path, **synthesis_kwargs)
    except Exception as e:
        # Clean up the temporary file so failed requests do not accumulate.
        if os.path.exists(output_path):
            os.unlink(output_path)
        raise gr.Error(f"Error generating speech: {str(e)}") from e
    return output_path
def _generate_from_ui(text, language, speaker_wav, speed, temperature):
    """Forward the five UI inputs to generate_speech by keyword.

    Gradio passes component values positionally. generate_speech has a
    ``voice_preset`` parameter between ``speaker_wav`` and ``speed``, so
    passing the sliders positionally would shift them into the wrong
    parameters.
    """
    return generate_speech(
        text,
        language,
        speaker_wav=speaker_wav,
        speed=speed,
        temperature=temperature,
    )


# Create Gradio interface
with gr.Blocks(title="XTTS Text-to-Speech") as demo:
    gr.Markdown("# XTTS Text-to-Speech Generator")
    gr.Markdown("Generate speech from text with voice cloning capabilities using XTTS v2")

    with gr.Row():
        # Left column: synthesis inputs and settings.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=3
            )
            language_input = gr.Dropdown(
                label="Language",
                choices=[(lang, lang) for lang in supported_languages],
                value="en",
                info="Select the language for synthesis"
            )
            speaker_wav_input = gr.Audio(
                label="Reference Voice (Optional)",
                type="filepath"
            )
            # gr.Audio has no ``info`` keyword, so the hint is rendered as
            # its own Markdown line under the component instead.
            gr.Markdown("Upload a 3-10 second audio sample for voice cloning")

            with gr.Accordion("Advanced Settings", open=False):
                speed_input = gr.Slider(
                    label="Speed",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    info="Speech speed (0.5 = slow, 2.0 = fast)"
                )
                temperature_input = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    info="Voice variability (lower = more deterministic)"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

    # Examples supply only text and language, which line up with the first
    # two parameters of generate_speech; the rest use their defaults.
    gr.Examples(
        examples=[
            ["Hello, world! This is a sample text to speech generation.", "en"],
            ["Bonjour, comment allez-vous aujourd'hui?", "fr"],
            ["Hola, ¿cómo estás?", "es"],
        ],
        inputs=[text_input, language_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True
    )

    generate_btn.click(
        fn=_generate_from_ui,
        inputs=[
            text_input,
            language_input,
            speaker_wav_input,
            speed_input,
            temperature_input
        ],
        outputs=audio_output
    )
if __name__ == "__main__":
    # Start the web server only when run as a script: host 0.0.0.0, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)