# Ringg-TTS-v1.0 / app.py
# Last commit: "added counter and name" (673efce) by utkarshshukla2912
# File size: 11.5 kB
import json
import os
import tempfile
import uuid
from pathlib import Path

import gradio as gr
import requests
# gr.NO_RELOAD = False

# API base URL, read from the BASE_URL environment variable. Trailing slashes
# are stripped because request paths are built as f"{BASE_URL}/<endpoint>";
# a value such as "https://host/" would otherwise produce "https://host//voices".
BASE_URL = os.environ.get("BASE_URL", "").rstrip("/")

# Counter persistence file (relative path, so it resolves against the current
# working directory of the process).
COUNTER_FILE = Path("generation_counter.json")

# Example texts: English, Hindi, and mixed Hindi/English.
EXAMPLE_TEXT_ENGLISH = "Welcome to Ringg TTS! This is a text to speech system that can convert your text into natural-sounding audio. Try it out with your own content!"
EXAMPLE_TEXT_HINDI = "नमस्ते! मैं रिंग टीटीएस हूँ। मैं आपके टेक्स्ट को प्राकृतिक आवाज़ में बदल सकता हूँ। कृपया अपना टेक्स्ट यहाँ लिखें और सुनें।"
EXAMPLE_TEXT_MIXED = "Hello दोस्तों! Welcome to Ringg TTS. यह एक बहुत ही शानदार text to speech system है जो Hindi और English दोनों languages को support करता है।"
def load_counter():
    """Load the persisted generation counter.

    Returns:
        The non-negative integer stored under ``"count"`` in ``COUNTER_FILE``.
        ``0`` is returned when the file does not exist, cannot be read, is not
        valid JSON, or does not contain an integer count.
    """
    try:
        with open(COUNTER_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # Nothing has been persisted yet; this is the normal first-run case.
        return 0
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"Error loading counter: {e}")
        return 0

    count = data.get("count", 0) if isinstance(data, dict) else None
    # bool is a subclass of int; reject it so a stored `true` is not read as 1.
    if isinstance(count, bool) or not isinstance(count, int) or count < 0:
        print(f"Error loading counter: unexpected contents {data!r}")
        return 0
    return count
def save_counter(count):
    """Persist the generation counter to ``COUNTER_FILE``.

    The value is serialised before any file is touched and then written to a
    sibling temporary file that is moved over the target, so a failed or
    interrupted save never leaves ``COUNTER_FILE`` empty or truncated.

    Args:
        count: Counter value to store under the ``"count"`` key.

    Errors are reported on stdout and otherwise ignored (best effort).
    """
    try:
        # Serialise first: if this raises, the existing file is left untouched.
        payload = json.dumps({"count": count})
        tmp_path = COUNTER_FILE.with_name(COUNTER_FILE.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        # os.replace swaps the file in a single step on the same filesystem.
        os.replace(tmp_path, COUNTER_FILE)
    except (OSError, TypeError, ValueError) as e:
        print(f"Error saving counter: {e}")
def get_voices():
    """Fetch the voice catalogue from the API.

    Returns:
        A list of ``(display_name, voice_id)`` tuples sorted by display name,
        where the display name is ``"<name> (<gender>)"``. An empty list is
        returned when the API does not answer with HTTP 200 or when any error
        occurs (the error is printed).
    """
    try:
        resp = requests.get(f"{BASE_URL}/voices", timeout=10)
        if resp.status_code != 200:
            return []

        catalogue = resp.json().get("voices", {})
        # One (label, id) pair per voice; missing fields fall back to placeholders.
        pairs = [
            (f"{info.get('name', 'Unknown')} ({info.get('gender', 'N/A')})", vid)
            for vid, info in catalogue.items()
        ]
        pairs.sort(key=lambda pair: pair[0])
        return pairs
    except Exception as e:
        print(f"Error fetching voices: {e}")
        return []
def synthesize_speech(text, voice_id):
    """Ask the API to synthesize ``text`` with the voice ``voice_id``.

    Always returns an 8-tuple::

        (audio_url, status, total_time, rtf, wav_duration,
         vocoder_time, no_vocoder_time, rtf_no_vocoder)

    On success ``audio_url`` is the URL reported by the API and the six metric
    fields are formatted strings. On any failure ``audio_url`` is ``None``,
    ``status`` describes the problem, and the six metric fields are ``""``.
    """
    blank_metrics = ("",) * 6

    # Validate the inputs before doing any network work.
    if not (text and text.strip()):
        return (None, "⚠️ Please enter some text", *blank_metrics)
    if not voice_id:
        return (None, "⚠️ Please select a voice", *blank_metrics)

    print(f"Input text length: {len(text)} characters")

    # (metrics key, format spec, unit suffix), in the order of the returned tuple.
    metric_layout = (
        ("t", ".3f", "s"),
        ("rtf", ".4f", ""),
        ("wav_seconds", ".2f", "s"),
        ("t_vocoder", ".3f", "s"),
        ("t_no_vocoder", ".3f", "s"),
        ("rtf_no_vocoder", ".4f", ""),
    )

    try:
        response = requests.post(
            f"{BASE_URL}/synthesize",
            headers={"Content-Type": "application/json"},
            json={"text": text, "voice_id": voice_id},
            timeout=30,
        )
        if response.status_code != 200:
            return (
                None,
                f"❌ API returned status code: {response.status_code}",
                *blank_metrics,
            )

        result = response.json()
        if not result.get("success"):
            reason = result.get("message", "Unknown error")
            return (None, f"❌ Synthesis failed: {reason}", *blank_metrics)

        audio_url = result.get("audio_url", "")
        metrics = result.get("metrics", {})
        formatted = tuple(
            format(metrics.get(key, 0), spec) + suffix
            for key, spec, suffix in metric_layout
        )
        return (audio_url, "✅ Audio generated successfully!", *formatted)
    except Exception as e:
        # Network errors, malformed JSON and unformattable metrics all end up here.
        return (None, f"❌ Error: {str(e)}", *blank_metrics)
# Load initial counter value
initial_counter = load_counter()

# Create Gradio interface.
# NOTE(review): the nesting of the layout below was reconstructed from statement
# order; confirm the placement of the metrics widgets and the generate button.
with gr.Blocks(
    theme=gr.themes.Base(
        font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
    ),
    css=".gradio-container {max-width: none !important;}",
) as demo:
    # Title and generation counter
    with gr.Row():
        with gr.Column(scale=4):
            audio_image = gr.HTML(
                value="""
<div style="display: flex; align-items: center; gap: 10px;">
<img style="width: 50px; height: 50px; background-color: white; border-radius: 10%;" src="https://storage.googleapis.com/desivocal-prod/desi-vocal/ringg.svg" alt="Logo">
<h1 style="margin: 0;">Ringg Squirrel TTS v1.0 🐿️</h1>
</div>
"""
            )
        with gr.Column(scale=1):
            generation_counter = gr.Markdown(
                f"**Generations:** {initial_counter}", elem_id="counter"
            )

    # Best Practices Section
    gr.Markdown("""
### 📝 Best Practices for Best Results
- **Supported Languages:** Hindi and English only
- **Check spelling carefully:** Misspelled words may be mispronounced
- **Punctuation matters:** Use proper punctuation for natural pauses and intonation
- **Technical terms:** Extremely rare or specialized technical terms might be mispronounced
- **Numbers & dates:** Write numbers as words for better pronunciation (e.g., "twenty-five" instead of "25")
""")

    # Text Input
    text_input = gr.Textbox(
        label="Text (max 500 characters)",
        placeholder="Type or paste your text here (max 500 characters)...",
        lines=6,
        max_lines=10,
        max_length=500,
    )

    # Character count display
    char_count = gr.Markdown("**Character count:** 0 / 500")

    with gr.Row():
        with gr.Column(scale=1):
            # Voice Selection: map the label shown in the dropdown to the voice id
            voices = get_voices()
            voice_choices = dict(voices)
            voice_dropdown = gr.Dropdown(
                choices=list(voice_choices),
                label="Choose a voice style",
                info=f"{len(voices)} voices available",
                value=next(iter(voice_choices), None),
            )
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Listen to your audio", type="filepath")
            # Hidden until a generation returns metrics
            metrics_header = gr.Markdown("### 📊 Generation Metrics", visible=False)
            metrics_output = gr.Code(
                label="Metrics", language="json", interactive=False, visible=False
            )

    generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")

    gr.Markdown("#### 🎯 Try these examples:")
    with gr.Row():
        example_btn1 = gr.Button("English Example", size="sm")
        example_btn2 = gr.Button("Hindi Example", size="sm")
        example_btn3 = gr.Button("Mixed Example", size="sm")

    # Footer
    gr.Markdown("---")
    gr.Markdown("# 🙏 Acknowledgements")
    # gr.Markdown("- Based on [ZipVoice](https://github.com/k2-fsa/ZipVoice)")
    gr.Markdown(
        "- Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for his contributions"
    )

    # Per-session copy of the generation counter
    gen_count_state = gr.State(value=initial_counter)

    # Event Handlers
    def update_char_count(text):
        """Return the character-count label for the current text."""
        count = len(text) if text else 0
        return f"**Character count:** {count} / 500"

    def load_example_text(example_text):
        """Return an example text together with its character-count label."""
        count = len(example_text)
        return example_text, f"**Character count:** {count} / 500"

    def clear_text():
        """Return an empty text and a zeroed character-count label."""
        return "", "**Character count:** 0 / 500"

    def refresh_counter():
        """Return the persisted counter as (state value, header label).

        The header label is otherwise baked in when the app starts, so a page
        opened later would keep showing the start-up value.
        """
        count = load_counter()
        return count, f"**Generations:** {count}"

    def on_generate(text, voice_display, gen_count):
        """Synthesize ``text``, download the audio, and update counter and metrics.

        Args:
            text: Text entered by the user.
            voice_display: Label selected in the voice dropdown.
            gen_count: This session's copy of the generation counter.

        Returns:
            ``(audio_file, metrics_header update, metrics_output update,
            gen_count, counter label)``. ``audio_file`` is ``None`` when no audio
            could be produced; the reason is then shown as a warning toast.
        """
        voice_id = voice_choices.get(voice_display)
        audio_url, status, *metric_values = synthesize_speech(text, voice_id)
        # synthesize_speech returns empty metric strings on every failure path.
        has_metrics = any(metric_values)

        audio_file = None
        failure = None
        if not audio_url:
            # With metrics present the synthesis itself succeeded but no URL came back.
            failure = "⚠️ Synthesis returned no audio URL" if has_metrics else status
        else:
            try:
                audio_response = requests.get(audio_url, timeout=30)
                if audio_response.status_code == 200:
                    target = os.path.join(
                        tempfile.gettempdir(), f"ringg_{uuid.uuid4()}.wav"
                    )
                    with open(target, "wb") as f:
                        f.write(audio_response.content)
                    # Only expose the path once the file is fully written.
                    audio_file = target
                else:
                    failure = (
                        "⚠️ Audio generated but download failed: "
                        f"HTTP {audio_response.status_code}"
                    )
            except Exception as e:
                failure = f"⚠️ Audio generated but download failed: {str(e)}"

        if audio_file is not None:
            # Count from the persisted value as well as the session copy, so one
            # session does not overwrite generations recorded by another.
            gen_count = max(gen_count, load_counter()) + 1
            save_counter(gen_count)

        if failure:
            # Without this a failed click is indistinguishable from doing nothing.
            gr.Warning(failure)

        # Format metrics as JSON string (only if available)
        metrics_json = ""
        if has_metrics:
            metric_keys = (
                "total_time",
                "rtf",
                "audio_duration",
                "vocoder_time",
                "no_vocoder_time",
                "rtf_no_vocoder",
            )
            metrics_json = json.dumps(dict(zip(metric_keys, metric_values)), indent=2)

        return (
            audio_file,
            gr.update(visible=has_metrics),
            gr.update(value=metrics_json, visible=has_metrics),
            gen_count,
            f"**Generations:** {gen_count}",
        )

    # Update character count on text input change
    text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])

    # Example button clicks
    example_btn1.click(
        fn=lambda: load_example_text(EXAMPLE_TEXT_ENGLISH),
        inputs=None,
        outputs=[text_input, char_count],
    )
    example_btn2.click(
        fn=lambda: load_example_text(EXAMPLE_TEXT_HINDI),
        inputs=None,
        outputs=[text_input, char_count],
    )
    example_btn3.click(
        fn=lambda: load_example_text(EXAMPLE_TEXT_MIXED),
        inputs=None,
        outputs=[text_input, char_count],
    )

    generate_btn.click(
        fn=on_generate,
        inputs=[text_input, voice_dropdown, gen_count_state],
        outputs=[
            audio_output,
            metrics_header,
            metrics_output,
            gen_count_state,
            generation_counter,
        ],
    )

    # Refresh the generation counter whenever the page is loaded
    demo.load(
        fn=refresh_counter,
        inputs=None,
        outputs=[gen_count_state, generation_counter],
    )
if __name__ == "__main__":
    # Cap the request queue at 5 pending jobs, then serve on all interfaces.
    demo.queue(max_size=5).launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
        debug=True,
    )