"""
Pip's Voice - Text-to-speech with emotional tone matching.
Uses ElevenLabs for high-quality, expressive speech.
"""
import asyncio
from typing import Optional, AsyncGenerator
from dataclasses import dataclass

from services.elevenlabs_client import ElevenLabsClient


@dataclass
class VoiceResponse:
    """Audio response from Pip."""

    audio_bytes: bytes
    tone_used: str
    model_used: str
    error: Optional[str] = None


class PipVoice:
    """
    Pip's voice synthesis with emotional tone matching.
    """

    def __init__(self):
        self.client = ElevenLabsClient()

        # Emotion-to-tone mapping with fallbacks.
        self._emotion_tone_map = {
            # Positive emotions
            "happy": "warm",
            "joy": "excited",
            "excited": "excited",
            "proud": "warm",
            "grateful": "warm",
            "love": "warm",
            "hopeful": "warm",
            # Negative emotions
            "sad": "gentle",
            "melancholy": "gentle",
            "grief": "gentle",
            "lonely": "gentle",
            "disappointed": "gentle",
            # Anxious emotions
            "anxious": "calm",
            "worried": "calm",
            "nervous": "calm",
            "overwhelmed": "calm",
            "stressed": "calm",
            # Other emotions
            "angry": "calm",
            "frustrated": "calm",
            "confused": "warm",
            "curious": "mysterious",
            "peaceful": "calm",
            "tired": "calm",
            "neutral": "warm",
        }

        # Action-to-tone mapping.
        self._action_tone_map = {
            "reflect": "warm",
            "celebrate": "excited",
            "comfort": "gentle",
            "calm": "calm",
            "energize": "warm",
            "curiosity": "mysterious",
            "intervene": "gentle",
        }

    def get_tone_for_context(
        self,
        emotions: list[str],
        action: str,
        intensity: int = 5
    ) -> str:
        """
        Determine the best voice tone based on emotional context.

        The action provides the default tone. When emotions are present,
        the primary (first) emotion's tone is used at low-to-moderate
        intensity; at high intensity (>= 7) the action tone takes priority.
        """
        action_tone = self._action_tone_map.get(action, "warm")
        if not emotions:
            return action_tone

        primary_emotion = emotions[0].lower()
        emotion_tone = self._emotion_tone_map.get(primary_emotion, "warm")

        # High intensity: lean towards the action tone.
        # Low intensity: lean towards the emotion tone.
        if intensity >= 7:
            return action_tone
        return emotion_tone
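    # Worked example (illustrative): with emotions=["anxious"] and
    # action="celebrate", intensity=8 returns the action tone "excited"
    # (intensity >= 7), while intensity=4 returns "calm", the tone mapped
    # to the primary emotion "anxious".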

    async def speak(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5,
        use_fast_model: bool = True
    ) -> VoiceResponse:
        """
        Generate speech for text with the appropriate emotional tone.

        Args:
            text: What Pip should say.
            emotions: Detected emotions for tone matching.
            action: Pip's current action.
            intensity: Emotional intensity (1-10).
            use_fast_model: Use the Flash model for speed.

        Returns:
            VoiceResponse with audio bytes.
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)
        audio_bytes = await self.client.speak(
            text=text,
            tone=tone,
            use_fast_model=use_fast_model
        )

        if audio_bytes:
            return VoiceResponse(
                audio_bytes=audio_bytes,
                tone_used=tone,
                model_used="flash" if use_fast_model else "expressive"
            )
        return VoiceResponse(
            audio_bytes=b"",
            tone_used=tone,
            model_used="none",
            error="Failed to generate speech"
        )

    async def speak_stream(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5
    ) -> AsyncGenerator[bytes, None]:
        """
        Stream speech generation for lower latency.

        Yields audio chunks as they arrive from the client.
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)
        async for chunk in self.client.speak_stream(text, tone):
            yield chunk
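    # Typical streaming consumption (sketch; `audio_out` is a hypothetical
    # sink with a synchronous write() method, not part of this module):
    #
    #     async for chunk in pip_voice.speak_stream("Hello!", emotions=["happy"]):
    #         audio_out.write(chunk)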

    async def speak_acknowledgment(self, ack_text: str) -> VoiceResponse:
        """
        Quick speech for acknowledgments (uses the fastest model with the
        default "reflect" action's warm tone).
        """
        return await self.speak(
            text=ack_text,
            action="reflect",
            use_fast_model=True
        )

    async def speak_intervention(self, text: str) -> VoiceResponse:
        """
        Speech for intervention scenarios: gentle and calming.
        """
        return await self.speak(
            text=text,
            action="intervene",
            use_fast_model=False  # Use the expressive model for nuance.
        )

    async def get_voices(self) -> list[dict]:
        """Get available voices for potential customization."""
        return await self.client.get_available_voices()


class PipEars:
    """
    Pip's hearing: speech-to-text for voice input.

    Uses OpenAI Whisper.
    """

    def __init__(self):
        # Local import: OpenAIClient is only needed when PipEars is used.
        from services.openai_client import OpenAIClient

        self.client = OpenAIClient()

    async def listen(self, audio_file_path: str) -> str:
        """
        Transcribe an audio file to text.

        Args:
            audio_file_path: Path to the audio file.

        Returns:
            Transcribed text.
        """
        return await self.client.transcribe_audio(audio_file_path)

    async def listen_bytes(self, audio_bytes: bytes, filename: str = "audio.wav") -> str:
        """
        Transcribe raw audio bytes to text.

        Args:
            audio_bytes: Raw audio bytes.
            filename: Filename hint for format detection.

        Returns:
            Transcribed text.
        """
        return await self.client.transcribe_audio_bytes(audio_bytes, filename)
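

# A minimal end-to-end sketch, not part of the module's public API. It
# assumes ElevenLabsClient is configured (e.g. via environment variables);
# adjust to your actual client setup.
if __name__ == "__main__":

    async def _demo() -> None:
        voice = PipVoice()
        response = await voice.speak(
            "That sounds like a lot to carry. I'm here with you.",
            emotions=["overwhelmed"],
            action="comfort",
            intensity=6
        )
        if response.error:
            print(f"Speech failed: {response.error}")
        else:
            # "overwhelmed" maps to "calm"; since intensity < 7, the emotion
            # tone wins over the "comfort" action tone ("gentle").
            print(f"Got {len(response.audio_bytes)} bytes, tone={response.tone_used}")

    asyncio.run(_demo())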