Update melotts module
Browse files
ax_speech_translate_demo_qwen_api.py
CHANGED
|
@@ -48,7 +48,7 @@ def intersperse(lst, item):
|
|
| 48 |
result[1::2] = lst
|
| 49 |
return result
|
| 50 |
|
| 51 |
-
|
| 52 |
def get_text_for_tts_infer(text, language_str, symbol_to_id=None):
|
| 53 |
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
| 54 |
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, symbol_to_id)
|
|
@@ -64,6 +64,75 @@ def get_text_for_tts_infer(text, language_str, symbol_to_id=None):
|
|
| 64 |
word2ph[0] += 1
|
| 65 |
|
| 66 |
return phone, tone, language, norm_text, word2ph
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
def audio_numpy_concat(segment_data_list, sr, speed=1.):
|
|
|
|
| 48 |
result[1::2] = lst
|
| 49 |
return result
|
| 50 |
|
| 51 |
+
"""
|
| 52 |
def get_text_for_tts_infer(text, language_str, symbol_to_id=None):
|
| 53 |
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
| 54 |
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, symbol_to_id)
|
|
|
|
| 64 |
word2ph[0] += 1
|
| 65 |
|
| 66 |
return phone, tone, language, norm_text, word2ph
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
# Handle phonemes/characters that cannot be recognized (filter them out before symbol lookup)
|
| 70 |
+
# Phones that are always dropped before symbol lookup, even if the symbol
# table happens to contain them. Only membership is needed, so this is a
# frozenset rather than a dict of empty replacement strings.
_UNSUPPORTED_PHONES = frozenset({
    'ɛ', 'æ', 'ʌ', 'ʊ', 'ɔ', 'ɪ', 'ɝ', 'ɚ', 'ɑ',
    'ʒ', 'θ', 'ð', 'ŋ', 'ʃ', 'ʧ', 'ʤ', 'ː', 'ˈ',
    'ˌ', 'ʰ', 'ʲ', 'ʷ', 'ʔ', 'ɾ', 'ɹ', 'ɫ', 'ɡ',
})


def get_text_for_tts_infer(text, language_str, symbol_to_id=None):
    """Convert text into phone/tone/language id arrays for TTS inference.

    The text is cleaned with ``clean_text``; phones that are in the
    unsupported set, or that are missing from ``symbol_to_id``, are dropped
    together with their tones so that both sequences stay the same length.
    The surviving phones are mapped through ``cleaned_text_to_sequence`` and
    every sequence is interspersed with ``0``.

    Args:
        text: Input text to synthesize.
        language_str: Language tag forwarded to ``clean_text`` and
            ``cleaned_text_to_sequence``.
        symbol_to_id: Optional symbol table used for the vocabulary
            membership check. When ``None`` the vocabulary check is skipped
            (only the fixed unsupported set is filtered) and ``None`` is
            forwarded to ``cleaned_text_to_sequence`` unchanged.

    Returns:
        Tuple ``(phone, tone, language, norm_text, word2ph)`` where ``phone``,
        ``tone``, ``language`` and ``word2ph`` are ``np.int32`` arrays and
        ``norm_text`` is the normalized text returned by ``clean_text``.

    Raises:
        Exception: Any error from the helpers is logged with its traceback
            and re-raised unchanged.
    """
    try:
        norm_text, phone, tone, word2ph = clean_text(text, language_str)

        # Filter phone and tone in lockstep so they always stay aligned.
        kept_phone = []
        kept_tone = []
        removed_symbols = set()

        for p, t in zip(phone, tone):
            # The original evaluated `p in symbol_to_id` unconditionally,
            # which raises TypeError with the default symbol_to_id=None.
            unknown = symbol_to_id is not None and p not in symbol_to_id
            if p in _UNSUPPORTED_PHONES or unknown:
                removed_symbols.add(p)
            else:
                kept_phone.append(p)
                kept_tone.append(t)

        # Report what was filtered out.
        if removed_symbols:
            print(f"[音素过滤] 删除了 {len(removed_symbols)} 个特殊音素: {sorted(removed_symbols)}")
            print(f"[音素过滤] 处理后音素序列长度: {len(kept_phone)}")
            print(f"[音素过滤] 处理后音调序列长度: {len(kept_tone)}")

        # Nothing survived the filter: fall back to a fixed default sequence.
        # NOTE(review): the fallback tones are strings while clean_text
        # presumably yields ints, and 'ni'/'hao' may not exist in the symbol
        # table — confirm against cleaned_text_to_sequence.
        if not kept_phone:
            print("[警告] 没有有效音素,使用默认中文音素")
            kept_phone = ['ni', 'hao']
            kept_tone = ['1', '3']
            word2ph = [1, 1]

        # If any phone was dropped, the per-word phone counts no longer add
        # up, so word2ph is rebuilt as one phone per entry.
        # NOTE(review): this discards the original word alignment — confirm
        # downstream consumers only rely on the total length.
        if len(kept_phone) != len(phone):
            print(f"[警告] 音素序列长度变化: {len(phone)} -> {len(kept_phone)}")
            word2ph = [1] * len(kept_phone)

        phone, tone, language = cleaned_text_to_sequence(
            kept_phone, kept_tone, language_str, symbol_to_id
        )

        phone = intersperse(phone, 0)
        tone = intersperse(tone, 0)
        language = intersperse(language, 0)

        phone = np.array(phone, dtype=np.int32)
        tone = np.array(tone, dtype=np.int32)
        language = np.array(language, dtype=np.int32)
        # Doubling plus one extra on the first entry presumably accounts for
        # the separators added by intersperse.
        word2ph = np.array(word2ph, dtype=np.int32) * 2
        word2ph[0] += 1
        return phone, tone, language, norm_text, word2ph

    except Exception as e:
        print(f"[错误] 文本处理失败: {e}")
        import traceback
        traceback.print_exc()
        # Bare raise re-raises the active exception with its original traceback.
        raise
|
| 136 |
|
| 137 |
|
| 138 |
def audio_numpy_concat(segment_data_list, sr, speed=1.):
|