Spaces:

RinggAI
/

Ringg-TTS-v1.0

Running

App Files Community

utkarshshukla2912 committed 20 days ago

Commit

4806882

1 Parent(s): 9baf492

remove base inference

Browse files

Files changed (3) hide show

app.py +63 -126
generation_counter.json +1 -1
vertex_client.py +10 -122

app.py CHANGED Viewed

@@ -258,31 +258,17 @@ with gr.Blocks(
             show_label=False,
         )
-    # Side-by-side comparison of Base and Distill models
-    gr.Markdown("### 🎧 Audio Results Comparison")
-    with gr.Row():
-        with gr.Column(scale=1):
-            # gr.Markdown("#### Base Model")
-            audio_output_base = gr.Audio(label="Base Model Audio", type="filepath")
-            status_base = gr.Markdown("", visible=True)
-            metrics_header_base = gr.Markdown("**📊 Metrics**", visible=False)
-            metrics_output_base = gr.Code(
-                label="Base Metrics", language="json", interactive=False, visible=False
-            )
-        with gr.Column(scale=1):
-            # gr.Markdown("#### Distill Model")
-            audio_output_distill = gr.Audio(
-                label="Distill Model Audio", type="filepath"
-            )
-            status_distill = gr.Markdown("", visible=True)
-            metrics_header_distill = gr.Markdown("**📊 Metrics**", visible=False)
-            metrics_output_distill = gr.Code(
-                label="Distill Metrics",
-                language="json",
-                interactive=False,
-                visible=False,
-            )
     generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")
@@ -315,15 +301,11 @@ with gr.Blocks(
         return "", "Character count: 0 / 300"
     def on_generate(text, voice_display):
-        """Generate speech using both base and distill models in parallel."""
         # Validate inputs
         if not text or not text.strip():
             error_msg = "⚠️ Please enter some text"
             yield (
-                None,
-                error_msg,
-                gr.update(visible=False),
-                gr.update(visible=False),
                 None,
                 error_msg,
                 gr.update(visible=False),
@@ -336,10 +318,6 @@ with gr.Blocks(
         if not voice_id:
             error_msg = "⚠️ Please select a voice"
             yield (
-                None,
-                error_msg,
-                gr.update(visible=False),
-                gr.update(visible=False),
                 None,
                 error_msg,
                 gr.update(visible=False),
@@ -348,101 +326,64 @@ with gr.Blocks(
             )
             return
-        # Initialize state for both models
-        results = {
-            "base": {"audio": None, "status": "⏳ Loading...", "metrics": None},
-            "distill": {"audio": None, "status": "⏳ Loading...", "metrics": None},
-        }
         # Show loading state initially
         yield (
             None,
-            results["base"]["status"],
-            gr.update(visible=False),
-            gr.update(visible=False),
-            None,
-            results["distill"]["status"],
             gr.update(visible=False),
             gr.update(visible=False),
             f"**🌍 Generations:** {load_counter()}",
         )
-        # Use parallel synthesis
         vertex_client = get_vertex_client()
-        counter_incremented = False
-        for (
-            model_type,
-            success,
-            audio_bytes,
-            metrics,
-        ) in vertex_client.synthesize_parallel(text, voice_id):
-            if success and audio_bytes:
-                # Save audio file in system temp directory
-                temp_dir = tempfile.gettempdir()
-                audio_file = os.path.join(
-                    temp_dir, f"ringg_{model_type}_{str(uuid.uuid4())}.wav"
                 )
-                with open(audio_file, "wb") as f:
-                    f.write(audio_bytes)
-                # Increment counter only once (for the first successful result)
-                if not counter_incremented:
-                    new_count = increment_counter()
-                    counter_incremented = True
-                else:
-                    new_count = load_counter()
-                # Format metrics
-                metrics_json = ""
-                has_metrics = False
-                if metrics:
-                    has_metrics = True
-                    metrics_json = json.dumps(
-                        {
-                            "total_time": f"{metrics.get('t', 0):.3f}s",
-                            "rtf": f"{metrics.get('rtf', 0):.4f}",
-                            "audio_duration": f"{metrics.get('wav_seconds', 0):.2f}s",
-                            "vocoder_time": f"{metrics.get('t_vocoder', 0):.3f}s",
-                            "no_vocoder_time": f"{metrics.get('t_no_vocoder', 0):.3f}s",
-                            "rtf_no_vocoder": f"{metrics.get('rtf_no_vocoder', 0):.4f}",
-                        },
-                        indent=2,
-                    )
-                # Update the corresponding model result
-                results[model_type] = {
-                    "audio": audio_file,
-                    "status": "",
-                    "metrics": metrics_json,
-                    "has_metrics": has_metrics,
-                }
-            else:
-                # Update failed model
-                results[model_type] = {
-                    "audio": None,
-                    "status": "❌ Failed to generate",
-                    "metrics": "",
-                    "has_metrics": False,
-                }
-            # Yield updated state for both models
             yield (
-                results["base"]["audio"],
-                results["base"]["status"],
-                gr.update(visible=results["base"].get("has_metrics", False)),
-                gr.update(
-                    value=results["base"]["metrics"],
-                    visible=results["base"].get("has_metrics", False),
-                ),
-                results["distill"]["audio"],
-                results["distill"]["status"],
-                gr.update(visible=results["distill"].get("has_metrics", False)),
-                gr.update(
-                    value=results["distill"]["metrics"],
-                    visible=results["distill"].get("has_metrics", False),
-                ),
-                f"**🌍 Generations:** {new_count if counter_incremented else load_counter()}",
             )
     def refresh_counter_on_load():
@@ -475,14 +416,10 @@ with gr.Blocks(
         fn=on_generate,
         inputs=[text_input, voice_dropdown],
         outputs=[
-            audio_output_base,
-            status_base,
-            metrics_header_base,
-            metrics_output_base,
-            audio_output_distill,
-            status_distill,
-            metrics_header_distill,
-            metrics_output_distill,
             generation_counter,
         ],
         concurrency_limit=2,

             show_label=False,
         )
+    # Audio output section
+    gr.Markdown("### 🎧 Audio Result")
+    audio_output = gr.Audio(label="Generated Audio", type="filepath")
+    status = gr.Markdown("", visible=True)
+    metrics_header = gr.Markdown("**📊 Metrics**", visible=False)
+    metrics_output = gr.Code(
+        label="Performance Metrics",
+        language="json",
+        interactive=False,
+        visible=False,
+    )
     generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")
         return "", "Character count: 0 / 300"
     def on_generate(text, voice_display):
+        """Generate speech using the distill model."""
         # Validate inputs
         if not text or not text.strip():
             error_msg = "⚠️ Please enter some text"
             yield (
                 None,
                 error_msg,
                 gr.update(visible=False),
         if not voice_id:
             error_msg = "⚠️ Please select a voice"
             yield (
                 None,
                 error_msg,
                 gr.update(visible=False),
             )
             return
         # Show loading state initially
         yield (
             None,
+            "⏳ Loading...",
             gr.update(visible=False),
             gr.update(visible=False),
             f"**🌍 Generations:** {load_counter()}",
         )
+        # Synthesize speech
         vertex_client = get_vertex_client()
+        success, audio_bytes, metrics = vertex_client.synthesize(text, voice_id)
+        if success and audio_bytes:
+            # Save audio file in system temp directory
+            temp_dir = tempfile.gettempdir()
+            audio_file = os.path.join(
+                temp_dir, f"ringg_{str(uuid.uuid4())}.wav"
+            )
+            with open(audio_file, "wb") as f:
+                f.write(audio_bytes)
+            # Increment counter
+            new_count = increment_counter()
+            # Format metrics
+            metrics_json = ""
+            has_metrics = False
+            if metrics:
+                has_metrics = True
+                metrics_json = json.dumps(
+                    {
+                        "total_time": f"{metrics.get('t', 0):.3f}s",
+                        "rtf": f"{metrics.get('rtf', 0):.4f}",
+                        "audio_duration": f"{metrics.get('wav_seconds', 0):.2f}s",
+                        "vocoder_time": f"{metrics.get('t_vocoder', 0):.3f}s",
+                        "no_vocoder_time": f"{metrics.get('t_no_vocoder', 0):.3f}s",
+                        "rtf_no_vocoder": f"{metrics.get('rtf_no_vocoder', 0):.4f}",
+                    },
+                    indent=2,
                 )
+            # Yield success result
+            yield (
+                audio_file,
+                "",
+                gr.update(visible=has_metrics),
+                gr.update(value=metrics_json, visible=has_metrics),
+                f"**🌍 Generations:** {new_count}",
+            )
+        else:
+            # Yield failure result
             yield (
+                None,
+                "❌ Failed to generate",
+                gr.update(visible=False),
+                gr.update(visible=False),
+                f"**🌍 Generations:** {load_counter()}",
             )
     def refresh_counter_on_load():
         fn=on_generate,
         inputs=[text_input, voice_dropdown],
         outputs=[
+            audio_output,
+            status,
+            metrics_header,
+            metrics_output,
             generation_counter,
         ],
         concurrency_limit=2,

generation_counter.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"count": 10, "last_updated": 1762780862.430711}


1	+ {"count": 11, "last_updated": 1763749917.869355}

vertex_client.py CHANGED Viewed

@@ -5,8 +5,7 @@ import os
 import json
 import logging
 import requests
-from typing import Optional, Dict, Any, Tuple, Generator
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from google.cloud import aiplatform
 from google.oauth2 import service_account
 from dotenv import load_dotenv
@@ -25,7 +24,6 @@ class VertexAIClient:
     def __init__(self):
         """Initialize the Vertex AI client."""
         self.endpoint = None
-        self.endpoint_distill = None
         self.credentials = None
         self.initialized = False
@@ -59,7 +57,7 @@ class VertexAIClient:
     def initialize(self) -> bool:
         """
-        Initialize Vertex AI and find the zipvoice and zipvoice_base_distill endpoints.
         Returns:
             True if initialization successful, False otherwise
@@ -82,24 +80,18 @@ class VertexAIClient:
             )
             logger.info("Vertex AI initialized for project desivocalprod01")
-            # Find both endpoints
             for endpoint in aiplatform.Endpoint.list():
-                if endpoint.display_name == "zipvoice":
                     self.endpoint = endpoint
-                    logger.info(f"Found zipvoice endpoint: {endpoint.resource_name}")
-                elif endpoint.display_name == "zipvoice_base_distill":
-                    self.endpoint_distill = endpoint
                     logger.info(f"Found zipvoice_base_distill endpoint: {endpoint.resource_name}")
-            # Check if at least the base endpoint is found
             if not self.endpoint:
-                logger.error("zipvoice endpoint not found in Vertex AI")
                 return False
-            # Warn if distill endpoint is not found but continue
-            if not self.endpoint_distill:
-                logger.warning("zipvoice_base_distill endpoint not found - distill model will not be available")
             self.initialized = True
             return True
@@ -139,65 +131,6 @@ class VertexAIClient:
             return False, None
     def synthesize(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
-        """
-        Synthesize speech from text using Vertex AI endpoint.
-        Args:
-            text: Text to synthesize
-            voice_id: Voice ID to use
-            timeout: Request timeout in seconds
-        Returns:
-            Tuple of (success, audio_bytes, metrics)
-        """
-        if not self.initialized:
-            if not self.initialize():
-                return False, None, None
-        try:
-            logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id}")
-            response = self.endpoint.raw_predict(
-                body=json.dumps({
-                    "text": text,
-                    "voice_id": voice_id,
-                }),
-                headers={"Content-Type": "application/json"},
-            )
-            # Parse JSON response
-            result = json.loads(response.text) if hasattr(response, 'text') else response
-            logger.info(f"Vertex AI response: {result}")
-            # Check if synthesis was successful
-            if result.get("success"):
-                audio_url = result.get("audio_url")
-                metrics = result.get("metrics")
-                if not audio_url:
-                    logger.error("No audio_url in successful response")
-                    return False, None, None
-                # Download audio from URL
-                logger.info(f"Downloading audio from: {audio_url}")
-                audio_response = requests.get(audio_url, timeout=timeout)
-                if audio_response.status_code == 200:
-                    audio_data = audio_response.content
-                    logger.info(f"Successfully downloaded audio ({len(audio_data)} bytes)")
-                    return True, audio_data, metrics
-                else:
-                    logger.error(f"Failed to download audio: HTTP {audio_response.status_code}")
-                    return False, None, None
-            else:
-                error_msg = result.get("message", "Unknown error")
-                logger.error(f"Synthesis failed: {error_msg}")
-                return False, None, None
-        except Exception as e:
-            logger.error(f"Failed to synthesize speech with Vertex AI: {e}")
-            return False, None, None
-    def synthesize_distill(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
         """
         Synthesize speech from text using Vertex AI distill endpoint.
@@ -213,13 +146,9 @@ class VertexAIClient:
             if not self.initialize():
                 return False, None, None
-        if not self.endpoint_distill:
-            logger.error("Distill endpoint not available")
-            return False, None, None
         try:
             logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id} using distill model")
-            response = self.endpoint_distill.raw_predict(
                 body=json.dumps({
                     "text": text,
                     "voice_id": voice_id,
@@ -230,7 +159,7 @@ class VertexAIClient:
             # Parse JSON response
             result = json.loads(response.text) if hasattr(response, 'text') else response
-            logger.info(f"Vertex AI distill response: {result}")
             # Check if synthesis was successful
             if result.get("success"):
@@ -258,50 +187,9 @@ class VertexAIClient:
                 return False, None, None
         except Exception as e:
-            logger.error(f"Failed to synthesize speech with Vertex AI distill: {e}")
             return False, None, None
-    def synthesize_parallel(self, text: str, voice_id: str, timeout: int = 60) -> Generator[Tuple[str, bool, Optional[bytes], Optional[Dict[str, Any]]], None, None]:
-        """
-        Synthesize speech from text using both base and distill endpoints in parallel.
-        Yields results as they arrive (doesn't wait for both to complete).
-        Args:
-            text: Text to synthesize
-            voice_id: Voice ID to use
-            timeout: Request timeout in seconds
-        Yields:
-            Tuple of (model_type, success, audio_bytes, metrics)
-            model_type is either "base" or "distill"
-        """
-        if not self.initialized:
-            if not self.initialize():
-                logger.error("Failed to initialize client for parallel synthesis")
-                return
-        # Create executor for parallel execution
-        with ThreadPoolExecutor(max_workers=2) as executor:
-            # Submit both tasks
-            futures = {}
-            # Always submit base model
-            futures[executor.submit(self.synthesize, text, voice_id, timeout)] = "base"
-            # Submit distill model if available
-            if self.endpoint_distill:
-                futures[executor.submit(self.synthesize_distill, text, voice_id, timeout)] = "distill"
-            # Yield results as they complete
-            for future in as_completed(futures):
-                model_type = futures[future]
-                try:
-                    success, audio_bytes, metrics = future.result()
-                    yield model_type, success, audio_bytes, metrics
-                except Exception as e:
-                    logger.error(f"Error in parallel synthesis for {model_type}: {e}")
-                    yield model_type, False, None, None
 # Global instance

 import json
 import logging
 import requests
+from typing import Optional, Dict, Any, Tuple
 from google.cloud import aiplatform
 from google.oauth2 import service_account
 from dotenv import load_dotenv
     def __init__(self):
         """Initialize the Vertex AI client."""
         self.endpoint = None
         self.credentials = None
         self.initialized = False
     def initialize(self) -> bool:
         """
+        Initialize Vertex AI and find the zipvoice_base_distill endpoint.
         Returns:
             True if initialization successful, False otherwise
             )
             logger.info("Vertex AI initialized for project desivocalprod01")
+            # Find distill endpoint
             for endpoint in aiplatform.Endpoint.list():
+                if endpoint.display_name == "zipvoice_base_distill":
                     self.endpoint = endpoint
                     logger.info(f"Found zipvoice_base_distill endpoint: {endpoint.resource_name}")
+                    break
+            # Check if endpoint is found
             if not self.endpoint:
+                logger.error("zipvoice_base_distill endpoint not found in Vertex AI")
                 return False
             self.initialized = True
             return True
             return False, None
     def synthesize(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
         """
         Synthesize speech from text using Vertex AI distill endpoint.
             if not self.initialize():
                 return False, None, None
         try:
             logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id} using distill model")
+            response = self.endpoint.raw_predict(
                 body=json.dumps({
                     "text": text,
                     "voice_id": voice_id,
             # Parse JSON response
             result = json.loads(response.text) if hasattr(response, 'text') else response
+            logger.info(f"Vertex AI response: {result}")
             # Check if synthesis was successful
             if result.get("success"):
                 return False, None, None
         except Exception as e:
+            logger.error(f"Failed to synthesize speech with Vertex AI: {e}")
             return False, None, None
 # Global instance