Spaces:
Runtime error
Runtime error
| import os | |
| import spaces | |
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import torch | |
| from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan | |
# Hub id of the SpeechT5 text-to-speech checkpoint served by this app.
checkpoint = "techiaith/microsoft_speecht5_finetuned_bu_tts_cy_en"

# The text processor and the acoustic model are both loaded from the same
# checkpoint (processor first, then model).
processor, model = (
    loader.from_pretrained(checkpoint)
    for loader in (SpeechT5Processor, SpeechT5ForTextToSpeech)
)

# The vocoder is loaded separately from its own checkpoint.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Three-letter speaker code -> path of the stored speaker embedding (.npy).
# Paths are relative, so they resolve against the current working directory.
speaker_embeddings = dict(
    GGP="spkemb/speaker0.npy",
    BGP="spkemb/speaker1.npy",
    BDP="spkemb/speaker2.npy",
)
def predict(text, speaker, mic_audio=None):
    """Synthesize speech for ``text`` using the selected speaker embedding.

    Args:
        text: Text to speak. ``None``, an empty string, or whitespace-only
            text produces an empty clip without running the model.
        speaker: Speaker choice label. Only its first three characters are
            used, as the key into ``speaker_embeddings``.
        mic_audio: Unused. Accepted so the signature matches the three
            inputs declared on the interface.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.

    Raises:
        KeyError: If the first three characters of ``speaker`` are not a key
            of ``speaker_embeddings``.
    """
    sample_rate = 16000

    # Guard against None as well as blank text so an empty request never
    # reaches the model.
    if not text or not text.strip():
        return (sample_rate, np.zeros(0, dtype=np.int16))

    embedding_path = speaker_embeddings[speaker[:3]]
    # unsqueeze(0) adds a leading dimension to the loaded embedding before it
    # is handed to the model.
    speaker_embedding = torch.tensor(np.load(embedding_path)).unsqueeze(0)

    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embedding, vocoder=vocoder
    )

    # detach()/cpu() are no-ops for a plain CPU tensor but keep the conversion
    # working if the model output is on another device or tracks gradients.
    waveform = speech.detach().cpu().numpy()
    # Clip to [-1, 1] before scaling: samples outside that range would
    # otherwise overflow the int16 cast and come out as loud artifacts.
    waveform = np.clip(waveform, -1.0, 1.0)
    return (sample_rate, (waveform * 32767).astype(np.int16))
# Heading and blurb passed to the interface.
title = "Techiaith Finetune Microsoft/SpeechT5: Speech Synthesis"

description = """
Lleisiau TTS microsoft_speech_T5_finetune_bu_tts_cy_en
"""

# Every example row uses the same sentence, paired with one speaker label
# each, so the rows differ only in the selected speaker.
_example_sentence = (
    "Rhyfeddod neu ffenomenon optegol a meteorolegol yw enfys, "
    "pan fydd sbectrwm o olau yn ymddangos yn yr awyr "
    "pan fo'r haul yn disgleirio ar ddiferion o leithder "
    "yn atmosffer y ddaear."
)

examples = [
    [_example_sentence, speaker_label]
    for speaker_label in (
        "GGP (gwryw-gogledd-pro)",
        "BGP (benyw-gogledd-pro)",
        "BDP (benyw-de-pro)",
    )
]
def _microphone_input():
    """Build the microphone audio input across Gradio major versions.

    The legacy ``source="microphone"`` keyword is tried first so behaviour is
    unchanged where it is accepted. If the installed Gradio rejects it with a
    ``TypeError`` (newer releases take ``sources=[...]`` instead), the
    component is built with the newer keyword.
    """
    shared = {"label": "Record Speech", "type": "numpy"}
    try:
        return gr.Audio(source="microphone", **shared)
    except TypeError:
        return gr.Audio(sources=["microphone"], **shared)


# Build the UI and start serving it as soon as the module is executed.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=[
                "GGP (gwryw-gogledd-pro)",
                "BGP (benyw-gogledd-pro)",
                "BDP (benyw-de-pro)",
            ],
            value="GGP (gwryw-gogledd-pro)",
        ),
        # Passed to predict() as its third argument, which predict() ignores.
        _microphone_input(),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
).launch()