Spaces:
Sleeping
Sleeping
Ko-TTS-Arena Contributors
committed on
Commit
·
b0bdfc9
1
Parent(s):
407795c
fix: Add ffmpeg to Docker, disable Gemini TTS (requires OAuth2), keep REST API code for future
Browse files
- Dockerfile +1 -0
- models.py +9 -9
- requirements.txt +1 -2
- tts.py +31 -28
Dockerfile
CHANGED
|
@@ -6,6 +6,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 6 |
wget \
|
| 7 |
curl \
|
| 8 |
git \
|
|
|
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
# Create non-root user
|
|
|
|
| 6 |
wget \
|
| 7 |
curl \
|
| 8 |
git \
|
| 9 |
+
ffmpeg \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
# Create non-root user
|
models.py
CHANGED
|
@@ -649,15 +649,15 @@ def insert_initial_models():
|
|
| 649 |
is_active=has_typecast,
|
| 650 |
model_url="https://typecast.ai/",
|
| 651 |
),
|
| 652 |
-
# Gemini TTS (Google Cloud - 다국어 지원) -
|
| 653 |
-
Model(
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
),
|
| 661 |
]
|
| 662 |
|
| 663 |
for model in tts_models:
|
|
|
|
| 649 |
is_active=has_typecast,
|
| 650 |
model_url="https://typecast.ai/",
|
| 651 |
),
|
| 652 |
+
# Gemini TTS (Google Cloud - 다국어 지원) - OAuth2 인증 필요, 현재 비활성화
|
| 653 |
+
# Model(
|
| 654 |
+
# id="gemini-tts-aoede",
|
| 655 |
+
# name="Gemini TTS (Aoede)",
|
| 656 |
+
# model_type=ModelType.TTS,
|
| 657 |
+
# is_open=False,
|
| 658 |
+
# is_active=has_gemini_tts,
|
| 659 |
+
# model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
|
| 660 |
+
# ),
|
| 661 |
]
|
| 662 |
|
| 663 |
for model in tts_models:
|
requirements.txt
CHANGED
|
@@ -14,5 +14,4 @@ huggingface-hub
|
|
| 14 |
scipy
|
| 15 |
numpy
|
| 16 |
pydub
|
| 17 |
-
typecast-python
|
| 18 |
-
google-cloud-texttospeech
|
|
|
|
| 14 |
scipy
|
| 15 |
numpy
|
| 16 |
pydub
|
| 17 |
+
typecast-python
|
|
|
tts.py
CHANGED
|
@@ -448,47 +448,50 @@ def predict_typecast_tts(text: str, voice_id: str = "tc_612ed01c7eb720fddd3ddedf
|
|
| 448 |
|
| 449 |
|
| 450 |
def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
|
| 451 |
-
"""Gemini TTS API 호출 (
|
| 452 |
api_key = GEMINI_TTS_API_KEY
|
| 453 |
if not api_key:
|
| 454 |
raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")
|
| 455 |
|
| 456 |
try:
|
| 457 |
-
|
| 458 |
-
|
| 459 |
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
|
|
|
| 471 |
)
|
|
|
|
| 472 |
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
text=text,
|
| 477 |
-
prompt='친절하고 자연스러운 톤으로 말해주세요'
|
| 478 |
-
),
|
| 479 |
-
voice=voice_params,
|
| 480 |
-
audio_config=texttospeech.AudioConfig(
|
| 481 |
-
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
|
| 482 |
-
sample_rate_hertz=24000
|
| 483 |
-
),
|
| 484 |
-
)
|
| 485 |
|
|
|
|
| 486 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 487 |
-
f.write(
|
| 488 |
return f.name
|
| 489 |
|
| 490 |
-
except
|
| 491 |
-
raise ValueError("
|
| 492 |
except Exception as e:
|
| 493 |
raise ValueError(f"Gemini TTS API 오류: {str(e)}")
|
| 494 |
|
|
|
|
| 448 |
|
| 449 |
|
| 450 |
def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
    """Synthesize speech with Gemini TTS through the Cloud Text-to-Speech REST API.

    Posts a ``text:synthesize`` request (Korean voice, LINEAR16 at 24 kHz),
    base64-decodes the ``audioContent`` field of the JSON response and writes
    the bytes to a temporary ``.wav`` file.

    Args:
        text: The text to be spoken.
        voice: Voice name sent as ``voice.name`` in the request.
        model: Model name sent as ``voice.modelName`` in the request.

    Returns:
        Path of the temporary ``.wav`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If ``GEMINI_TTS_API_KEY`` is not set, if the HTTP request
            fails, or if the response cannot be turned into audio.

    NOTE(review): the model registry comment says this provider needs OAuth2
    and is currently disabled, so API-key auth may be rejected by the
    service -- confirm before re-enabling.
    """
    api_key = GEMINI_TTS_API_KEY
    if not api_key:
        raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")

    try:
        # REST endpoint. The key is deliberately NOT placed in the query
        # string: requests' HTTP error messages include the full URL, and
        # those messages are re-raised below, which would leak the secret.
        url = "https://texttospeech.googleapis.com/v1beta1/text:synthesize"

        payload = {
            "input": {
                "text": text,
                "prompt": "친절하고 자연스러운 톤으로 말해주세요",
            },
            "voice": {
                "languageCode": "ko-kr",
                "name": voice,
                "modelName": model,
            },
            "audioConfig": {
                "audioEncoding": "LINEAR16",
                "sampleRateHertz": 24000,
            },
        }

        response = requests.post(
            url,
            headers={
                "Content-Type": "application/json",
                # Header-based API key keeps the credential out of URLs/logs.
                "X-Goog-Api-Key": api_key,
            },
            json=payload,
            timeout=60,
        )
        response.raise_for_status()

        audio_content = response.json().get("audioContent")
        if not audio_content:
            raise ValueError("Gemini TTS API가 오디오를 반환하지 않았습니다.")

        # The API returns the audio as a base64 string; the decoded bytes are
        # written as-is under a .wav suffix.
        audio_bytes = base64.b64decode(audio_content)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(audio_bytes)
            return f.name

    except requests.exceptions.RequestException as e:
        # Transport/HTTP-level failures; chained so the original traceback survives.
        raise ValueError(f"Gemini TTS API 요청 오류: {str(e)}") from e
    except Exception as e:
        # Anything else (bad JSON, missing audio, decode errors) is reported
        # as a ValueError, matching the other failure paths of this function.
        raise ValueError(f"Gemini TTS API 오류: {str(e)}") from e
|
| 497 |
|