"""
Pip's Voice - Text-to-speech with emotional tone matching.
Uses ElevenLabs for high-quality, expressive speech.
"""

import asyncio
from typing import Optional, AsyncGenerator
from dataclasses import dataclass

from services.elevenlabs_client import ElevenLabsClient


@dataclass
class VoiceResponse:
    """Audio response from Pip."""

    # Synthesized audio; empty bytes when synthesis produced nothing.
    audio_bytes: bytes
    # Tone name that was passed to the speech client.
    tone_used: str
    # "flash", "expressive", or "none" when no audio was produced.
    model_used: str
    # Human-readable failure description; None on success.
    error: Optional[str] = None


class PipVoice:
    """
    Pip's voice synthesis with emotional tone matching.

    Maps detected emotions and Pip's current action onto a tone name and
    forwards the text plus that tone to the ElevenLabs client.
    """

    # Tone used whenever an action or emotion has no mapping.
    DEFAULT_TONE = "warm"

    # Intensity (on the 1-10 scale) at or above which the action's tone
    # overrides the emotion's tone.
    HIGH_INTENSITY_THRESHOLD = 7

    def __init__(self):
        """Create the speech client and the emotion/action tone tables."""
        self.client = ElevenLabsClient()

        # Emotion to tone mapping with fallbacks
        self._emotion_tone_map = {
            # Positive emotions
            "happy": "warm",
            "joy": "excited",
            "excited": "excited",
            "proud": "warm",
            "grateful": "warm",
            "love": "warm",
            "hopeful": "warm",
            # Negative emotions
            "sad": "gentle",
            "melancholy": "gentle",
            "grief": "gentle",
            "lonely": "gentle",
            "disappointed": "gentle",
            # Anxious emotions
            "anxious": "calm",
            "worried": "calm",
            "nervous": "calm",
            "overwhelmed": "calm",
            "stressed": "calm",
            # Other emotions
            "angry": "calm",
            "frustrated": "calm",
            "confused": "warm",
            "curious": "mysterious",
            "peaceful": "calm",
            "tired": "calm",
            "neutral": "warm",
        }

        # Action to tone mapping
        self._action_tone_map = {
            "reflect": "warm",
            "celebrate": "excited",
            "comfort": "gentle",
            "calm": "calm",
            "energize": "warm",
            "curiosity": "mysterious",
            "intervene": "gentle",
        }

    def get_tone_for_context(
        self,
        emotions: list[str],
        action: str,
        intensity: int = 5
    ) -> str:
        """
        Determine the best voice tone based on emotional context.

        Precedence:
          1. No emotions given -> the action's tone.
          2. ``intensity`` >= ``HIGH_INTENSITY_THRESHOLD`` -> the action's tone.
          3. Otherwise -> the tone of the first (primary) emotion.

        Unknown actions and unknown emotions both fall back to
        ``DEFAULT_TONE``.

        Args:
            emotions: Detected emotions; only the first entry is consulted.
                It is matched case-insensitively, ignoring surrounding
                whitespace.
            action: Pip's current action (matched exactly as given).
            intensity: Emotional intensity (1-10).

        Returns:
            The tone name to hand to the speech client.
        """
        action_tone = self._action_tone_map.get(action, self.DEFAULT_TONE)

        if not emotions:
            return action_tone

        # Normalize the label so " Sad " and "sad" select the same tone
        # instead of silently dropping to the default.
        primary_emotion = emotions[0].strip().lower()
        emotion_tone = self._emotion_tone_map.get(
            primary_emotion, self.DEFAULT_TONE
        )

        # For high intensity, lean towards action tone
        # For low intensity, lean towards emotion tone
        if intensity >= self.HIGH_INTENSITY_THRESHOLD:
            return action_tone

        return emotion_tone

    async def speak(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5,
        use_fast_model: bool = True
    ) -> VoiceResponse:
        """
        Generate speech for text with appropriate emotional tone.

        Exceptions raised by the underlying client are not caught here and
        propagate to the caller.

        Args:
            text: What Pip should say
            emotions: Detected emotions for tone matching (None means none)
            action: Pip's current action
            intensity: Emotional intensity (1-10)
            use_fast_model: Use Flash model for speed

        Returns:
            VoiceResponse with audio bytes. When the client returns no
            audio, the response carries empty bytes, ``model_used="none"``
            and a populated ``error``.
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)

        audio_bytes = await self.client.speak(
            text=text,
            tone=tone,
            use_fast_model=use_fast_model
        )

        if audio_bytes:
            return VoiceResponse(
                audio_bytes=audio_bytes,
                tone_used=tone,
                model_used="flash" if use_fast_model else "expressive"
            )

        # A falsy result (None or empty bytes) is reported as a failure
        # rather than raised, so callers can inspect ``error``.
        return VoiceResponse(
            audio_bytes=b"",
            tone_used=tone,
            model_used="none",
            error="Failed to generate speech"
        )

    async def speak_stream(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5
    ) -> AsyncGenerator[bytes, None]:
        """
        Stream speech generation for lower latency.

        Args:
            text: What Pip should say
            emotions: Detected emotions for tone matching (None means none)
            action: Pip's current action
            intensity: Emotional intensity (1-10)

        Yields:
            Audio chunks exactly as produced by the client's stream.
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)

        async for chunk in self.client.speak_stream(text, tone):
            yield chunk

    async def speak_acknowledgment(self, ack_text: str) -> VoiceResponse:
        """
        Quick speech for acknowledgments (uses fastest model + tone).

        No emotions are supplied, so the tone is the one mapped to the
        "reflect" action.
        """
        return await self.speak(
            text=ack_text,
            action="reflect",
            use_fast_model=True
        )

    async def speak_intervention(self, text: str) -> VoiceResponse:
        """
        Speech for intervention scenarios - gentle and calming.

        No emotions are supplied, so the tone is the one mapped to the
        "intervene" action.
        """
        return await self.speak(
            text=text,
            action="intervene",
            use_fast_model=False  # Use expressive model for nuance
        )

    async def get_voices(self) -> list[dict]:
        """Get available voices for potential customization."""
        return await self.client.get_available_voices()


class PipEars:
    """
    Pip's hearing - Speech-to-text for voice input.
    Uses OpenAI Whisper.
    """

    def __init__(self):
        """Create the transcription client."""
        # Imported here rather than at module level, so loading this module
        # does not require the OpenAI client unless PipEars is instantiated.
        from services.openai_client import OpenAIClient
        self.client = OpenAIClient()

    async def listen(self, audio_file_path: str) -> str:
        """
        Transcribe audio file to text.

        Args:
            audio_file_path: Path to audio file

        Returns:
            Transcribed text
        """
        return await self.client.transcribe_audio(audio_file_path)

    async def listen_bytes(self, audio_bytes: bytes, filename: str = "audio.wav") -> str:
        """
        Transcribe audio bytes to text.

        Args:
            audio_bytes: Raw audio bytes
            filename: Filename hint for format detection

        Returns:
            Transcribed text
        """
        return await self.client.transcribe_audio_bytes(audio_bytes, filename)