| """ | |
| Pip's Voice - Text-to-speech with emotional tone matching. | |
| Uses ElevenLabs for high-quality, expressive speech. | |
| """ | |
| import asyncio | |
| from typing import Optional, AsyncGenerator | |
| from dataclasses import dataclass | |
| from services.elevenlabs_client import ElevenLabsClient | |


@dataclass
class VoiceResponse:
    """Audio response from Pip."""
    audio_bytes: bytes
    tone_used: str
    model_used: str
    error: Optional[str] = None


class PipVoice:
    """
    Pip's voice synthesis with emotional tone matching.
    """

    def __init__(self):
        self.client = ElevenLabsClient()

        # Emotion to tone mapping with fallbacks
        self._emotion_tone_map = {
            # Positive emotions
            "happy": "warm",
            "joy": "excited",
            "excited": "excited",
            "proud": "warm",
            "grateful": "warm",
            "love": "warm",
            "hopeful": "warm",
            # Negative emotions
            "sad": "gentle",
            "melancholy": "gentle",
            "grief": "gentle",
            "lonely": "gentle",
            "disappointed": "gentle",
            # Anxious emotions
            "anxious": "calm",
            "worried": "calm",
            "nervous": "calm",
            "overwhelmed": "calm",
            "stressed": "calm",
            # Other emotions
            "angry": "calm",
            "frustrated": "calm",
            "confused": "warm",
            "curious": "mysterious",
            "peaceful": "calm",
            "tired": "calm",
            "neutral": "warm",
        }

        # Action to tone mapping
        self._action_tone_map = {
            "reflect": "warm",
            "celebrate": "excited",
            "comfort": "gentle",
            "calm": "calm",
            "energize": "warm",
            "curiosity": "mysterious",
            "intervene": "gentle",
        }

    def get_tone_for_context(
        self,
        emotions: list[str],
        action: str,
        intensity: int = 5
    ) -> str:
        """
        Determine the best voice tone based on emotional context.

        Args:
            emotions: Detected emotions, most prominent first
            action: Pip's current action
            intensity: Emotional intensity (1-10)

        Returns:
            Tone name understood by the ElevenLabs client
        """
        # Action takes priority for tone
        action_tone = self._action_tone_map.get(action, "warm")

        if not emotions:
            return action_tone

        primary_emotion = emotions[0].lower()
        emotion_tone = self._emotion_tone_map.get(primary_emotion, "warm")

        # For high intensity, lean towards action tone.
        # For low intensity, lean towards emotion tone.
        if intensity >= 7:
            return action_tone
        return emotion_tone
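
    # Illustrative example (hypothetical values): with emotions=["anxious"] and
    # action="celebrate", intensity=8 resolves to "excited" (the action tone wins
    # at high intensity), while intensity=4 resolves to "calm" (the emotion tone wins).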

    async def speak(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5,
        use_fast_model: bool = True
    ) -> VoiceResponse:
        """
        Generate speech for text with appropriate emotional tone.

        Args:
            text: What Pip should say
            emotions: Detected emotions for tone matching
            action: Pip's current action
            intensity: Emotional intensity (1-10)
            use_fast_model: Use the Flash model for speed

        Returns:
            VoiceResponse with audio bytes
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)

        audio_bytes = await self.client.speak(
            text=text,
            tone=tone,
            use_fast_model=use_fast_model
        )

        if audio_bytes:
            return VoiceResponse(
                audio_bytes=audio_bytes,
                tone_used=tone,
                model_used="flash" if use_fast_model else "expressive"
            )

        return VoiceResponse(
            audio_bytes=b"",
            tone_used=tone,
            model_used="none",
            error="Failed to generate speech"
        )

    async def speak_stream(
        self,
        text: str,
        emotions: Optional[list[str]] = None,
        action: str = "reflect",
        intensity: int = 5
    ) -> AsyncGenerator[bytes, None]:
        """
        Stream speech generation for lower latency.
        """
        tone = self.get_tone_for_context(emotions or [], action, intensity)

        async for chunk in self.client.speak_stream(text, tone):
            yield chunk

    async def speak_acknowledgment(self, ack_text: str) -> VoiceResponse:
        """
        Quick speech for acknowledgments (uses the fast model and the default reflect tone).
        """
        return await self.speak(
            text=ack_text,
            action="reflect",
            use_fast_model=True
        )

    async def speak_intervention(self, text: str) -> VoiceResponse:
        """
        Speech for intervention scenarios - gentle and calming.
        """
        return await self.speak(
            text=text,
            action="intervene",
            use_fast_model=False  # Use expressive model for nuance
        )

    async def get_voices(self) -> list[dict]:
        """Get available voices for potential customization."""
        return await self.client.get_available_voices()
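

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not called anywhere in this module). It assumes an
# ElevenLabsClient configured with valid credentials, and only shows how the
# speak/speak_stream APIs above fit together; the text and emotion values are
# hypothetical.
# ---------------------------------------------------------------------------
async def _example_pip_voice_usage() -> None:
    voice = PipVoice()

    # One-shot synthesis: tone is derived from emotions + action + intensity.
    response = await voice.speak(
        text="That sounds like it was a really good day.",
        emotions=["happy"],
        action="reflect",
        intensity=4,
    )
    if response.error is None:
        print(f"Got {len(response.audio_bytes)} bytes using tone '{response.tone_used}'")

    # Streaming synthesis: consume chunks as they arrive for lower-latency playback.
    chunks: list[bytes] = []
    async for chunk in voice.speak_stream(
        text="Take a slow breath with me.",
        emotions=["anxious"],
        action="calm",
    ):
        chunks.append(chunk)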


class PipEars:
    """
    Pip's hearing - Speech-to-text for voice input.

    Uses OpenAI Whisper.
    """

    def __init__(self):
        from services.openai_client import OpenAIClient
        self.client = OpenAIClient()

    async def listen(self, audio_file_path: str) -> str:
        """
        Transcribe an audio file to text.

        Args:
            audio_file_path: Path to the audio file

        Returns:
            Transcribed text
        """
        return await self.client.transcribe_audio(audio_file_path)

    async def listen_bytes(self, audio_bytes: bytes, filename: str = "audio.wav") -> str:
        """
        Transcribe audio bytes to text.

        Args:
            audio_bytes: Raw audio bytes
            filename: Filename hint for format detection

        Returns:
            Transcribed text
        """
        return await self.client.transcribe_audio_bytes(audio_bytes, filename)
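

if __name__ == "__main__":
    # Minimal smoke-test sketch, assuming valid ElevenLabs and OpenAI credentials
    # are configured for the underlying clients. The audio path below is a
    # hypothetical placeholder.
    async def _demo() -> None:
        ears = PipEars()
        transcript = await ears.listen("sample_input.wav")  # hypothetical file
        print(f"Heard: {transcript}")

        voice = PipVoice()
        reply = await voice.speak(text=f"You said: {transcript}", emotions=["neutral"])
        print(f"Synthesized {len(reply.audio_bytes)} bytes with tone '{reply.tone_used}'")

    asyncio.run(_demo())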