| """ | |
| Pip's Voice - Text-to-speech with emotional tone matching. | |
| Uses ElevenLabs for high-quality, expressive speech. | |
| """ | |
| import asyncio | |
| from typing import Optional, AsyncGenerator | |
| from dataclasses import dataclass | |
| from services.elevenlabs_client import ElevenLabsClient | |


@dataclass
class VoiceResponse:
    """Audio response from Pip."""
    audio_bytes: bytes
    tone_used: str
    model_used: str
    error: Optional[str] = None


class PipVoice:
    """
    Pip's voice synthesis with emotional tone matching.
    """

    def __init__(self):
        self.client = ElevenLabsClient()

        # Emotion to tone mapping with fallbacks
        self._emotion_tone_map = {
            # Positive emotions
            "happy": "warm",
            "joy": "excited",
            "excited": "excited",
            "proud": "warm",
            "grateful": "warm",
            "love": "warm",
            "hopeful": "warm",
            # Negative emotions
            "sad": "gentle",
            "melancholy": "gentle",
            "grief": "gentle",
            "lonely": "gentle",
            "disappointed": "gentle",
            # Anxious emotions
            "anxious": "calm",
            "worried": "calm",
            "nervous": "calm",
            "overwhelmed": "calm",
            "stressed": "calm",
            # Other emotions
            "angry": "calm",
            "frustrated": "calm",
            "confused": "warm",
            "curious": "mysterious",
            "peaceful": "calm",
            "tired": "calm",
            "neutral": "warm",
        }

        # Action to tone mapping
        self._action_tone_map = {
            "reflect": "warm",
            "celebrate": "excited",
            "comfort": "gentle",
            "calm": "calm",
            "energize": "warm",
            "curiosity": "mysterious",
            "intervene": "gentle",
        }

    def get_tone_for_context(
        self,
        emotions: list[str],
        action: str,
        intensity: int = 5
    ) -> str:
        """
        Determine the best voice tone based on emotional context.

        Args:
            emotions: Detected emotions, most prominent first
            action: Pip's current action
            intensity: Emotional intensity (1-10)

        Returns:
            Tone name understood by the ElevenLabs client
        """
        # Action takes priority for tone
        action_tone = self._action_tone_map.get(action, "warm")

        if not emotions:
            return action_tone

        primary_emotion = emotions[0].lower()
        emotion_tone = self._emotion_tone_map.get(primary_emotion, "warm")

        # For high intensity, lean towards action tone.
        # For low intensity, lean towards emotion tone.
        if intensity >= 7:
            return action_tone
        return emotion_tone
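
    # Illustrative example (hypothetical values): with emotions=["anxious"] and
    # action="celebrate", intensity=8 resolves to "excited" (the action tone wins
    # at high intensity), while intensity=4 resolves to "calm" (the emotion tone wins).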

    async def speak(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5,
        use_fast_model: bool = True
    ) -> VoiceResponse:
        """
        Generate speech for text with appropriate emotional tone.

        Args:
            text: What Pip should say
            emotions: Detected emotions for tone matching
            action: Pip's current action
            intensity: Emotional intensity (1-10)
            use_fast_model: Use the Flash model for speed

        Returns:
            VoiceResponse with audio bytes
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)

        audio_bytes = await self.client.speak(
            text=text,
            tone=tone,
            use_fast_model=use_fast_model
        )

        if audio_bytes:
            return VoiceResponse(
                audio_bytes=audio_bytes,
                tone_used=tone,
                model_used="flash" if use_fast_model else "expressive"
            )

        return VoiceResponse(
            audio_bytes=b"",
            tone_used=tone,
            model_used="none",
            error="Failed to generate speech"
        )

    async def speak_stream(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5
    ) -> AsyncGenerator[bytes, None]:
        """
        Stream speech generation for lower latency.
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)

        async for chunk in self.client.speak_stream(text, tone):
            yield chunk

    async def speak_acknowledgment(self, ack_text: str) -> VoiceResponse:
        """
        Quick speech for acknowledgments (uses the fast model and the default reflect tone).
        """
        return await self.speak(
            text=ack_text,
            action="reflect",
            use_fast_model=True
        )

    async def speak_intervention(self, text: str) -> VoiceResponse:
        """
        Speech for intervention scenarios - gentle and calming.
        """
        return await self.speak(
            text=text,
            action="intervene",
            use_fast_model=False  # Use expressive model for nuance
        )

    async def get_voices(self) -> list[dict]:
        """Get available voices for potential customization."""
        return await self.client.get_available_voices()
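

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not called anywhere in this module). It assumes an
# ElevenLabsClient configured with valid credentials, and only shows how the
# speak/speak_stream APIs above fit together; the text and emotion values are
# hypothetical.
# ---------------------------------------------------------------------------
async def _example_pip_voice_usage() -> None:
    voice = PipVoice()

    # One-shot synthesis: tone is derived from emotions + action + intensity.
    response = await voice.speak(
        text="That sounds like it was a really good day.",
        emotions=["happy"],
        action="reflect",
        intensity=4,
    )
    if response.error is None:
        print(f"Got {len(response.audio_bytes)} bytes using tone '{response.tone_used}'")

    # Streaming synthesis: consume chunks as they arrive for lower-latency playback.
    chunks: list[bytes] = []
    async for chunk in voice.speak_stream(
        text="Take a slow breath with me.",
        emotions=["anxious"],
        action="calm",
    ):
        chunks.append(chunk)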


class PipEars:
    """
    Pip's hearing - Speech-to-text for voice input.

    Uses OpenAI Whisper.
    """

    def __init__(self):
        from services.openai_client import OpenAIClient
        self.client = OpenAIClient()

    async def listen(self, audio_file_path: str) -> str:
        """
        Transcribe an audio file to text.

        Args:
            audio_file_path: Path to the audio file

        Returns:
            Transcribed text
        """
        return await self.client.transcribe_audio(audio_file_path)

    async def listen_bytes(self, audio_bytes: bytes, filename: str = "audio.wav") -> str:
        """
        Transcribe audio bytes to text.

        Args:
            audio_bytes: Raw audio bytes
            filename: Filename hint for format detection

        Returns:
            Transcribed text
        """
        return await self.client.transcribe_audio_bytes(audio_bytes, filename)
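

if __name__ == "__main__":
    # Minimal smoke-test sketch, assuming valid ElevenLabs and OpenAI credentials
    # are configured for the underlying clients. The audio path below is a
    # hypothetical placeholder.
    async def _demo() -> None:
        ears = PipEars()
        transcript = await ears.listen("sample_input.wav")  # hypothetical file
        print(f"Heard: {transcript}")

        voice = PipVoice()
        reply = await voice.speak(text=f"You said: {transcript}", emotions=["neutral"])
        print(f"Synthesized {len(reply.audio_bytes)} bytes with tone '{reply.tone_used}'")

    asyncio.run(_demo())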