Spaces:
Build error
```python
from transformers import pipeline
import gradio as gr
import numpy as np
import librosa
import noisereduce

model_id = "fydhfzh/hubert-classifier-aug-fold-3"
pipe = pipeline("audio-classification", model=model_id)
```
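Optionally, if the Space runs on GPU hardware, the pipeline can be pinned to it. This variant is standard `transformers` pipeline usage, not something from the original code:

```python
import torch

# Use the first GPU when available, otherwise fall back to CPU (-1).
pipe = pipeline(
    "audio-classification",
    model=model_id,
    device=0 if torch.cuda.is_available() else -1,
)
```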
```python
def get_binary_values():
    """Build the 84 class labels: 28 letters x 3 harakat, encoded as a
    5-bit letter index followed by a 2-bit harakah index."""
    binary_values = []
    for letter in range(1, 29):
        binary_rep = format(letter, '05b')
        for harakah in range(1, 4):
            binary_harakat = format(harakah, '02b')
            binary_values.append(binary_rep + binary_harakat)
    return binary_values

binary_values = get_binary_values()

arabic_letters = [
    "ุฃู", "ุฅู", "ุฃู",
    "ุจู", "ุจู", "ุจู",
    "ุชู", "ุชู", "ุชู",
    "ุซู", "ุซู", "ุซู",
    "ุฌู", "ุฌู", "ุฌู",
    "ุญู", "ุญู", "ุญู",
    "ุฎู", "ุฎู", "ุฎู",
    "ุฏู", "ุฏู", "ุฏู",
    "ุฐู", "ุฐู", "ุฐู",
    "ุฑู", "ุฑู", "ุฑู",
    "ุฒู", "ุฒู", "ุฒู",
    "ุณู", "ุณู", "ุณู",
    "ุดู", "ุดู", "ุดู",
    "ุตู", "ุตู", "ุตู",
    "ุถู", "ุถู", "ุถู",
    "ุทู", "ุทู", "ุทู",
    "ุธู", "ุธู", "ุธู",
    "ุนู", "ุนู", "ุนู",
    "ุบู", "ุบู", "ุบู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ู ู", "ู ู", "ู ู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู"
]

# Map each binary class label to its Arabic letter + harakah.
arabic_representation = dict(zip(binary_values, arabic_letters))
```
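As a sanity check on the label scheme (assuming the model's label names really are these 7-bit strings, which the lookup in `classify_utterances` below depends on), the mapping can be inspected directly:

```python
# 28 letters x 3 harakat -> 84 labels; a length mismatch here would make
# dict(zip(...)) silently drop entries.
assert len(binary_values) == len(arabic_letters) == 84

print(binary_values[:3])                 # ['0000101', '0000110', '0000111']
print(arabic_representation['0000101'])  # -> ุฃู
```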
```python
def split_input(raw_input):
    """Split a recording into per-utterance segments using an adaptive
    energy threshold, then pad or trim each segment to 1 s (16000 samples)."""
    # Frame-wise energy in dB relative to the quietest frame.
    mse = librosa.feature.rms(y=raw_input, frame_length=2048, hop_length=512) ** 2
    mse_db = librosa.power_to_db(mse.squeeze(), ref=np.min, top_db=None)
    mse_db = mse_db[mse_db != 0]  # drop frames sitting at the reference floor

    # Adaptive threshold: the 10th percentile of the remaining frame energies.
    percentile_param = 10
    extra_db_param = 0
    threshold = np.percentile(mse_db, percentile_param) + extra_db_param
    print(threshold)  # debug: inspect the computed split threshold

    intervals = librosa.effects.split(y=raw_input, top_db=threshold)

    segments = []
    for start, end in intervals:
        # Extend each interval by ~0.125 s on both sides so the split does
        # not clip utterance onsets and offsets.
        overlap = 2000
        start = max(start - overlap, 0)
        end = min(end + overlap, len(raw_input))
        segment = raw_input[start:end]
        if len(segment) < 16000:
            # Center-pad with zeros to exactly 16000 samples; give the right
            # side the extra sample when the deficit is odd.
            pad_left = (16000 - len(segment)) // 2
            pad_right = 16000 - len(segment) - pad_left
            segment = np.pad(segment, (pad_left, pad_right), mode='constant')
        else:
            segment = segment[:16000]
        segments.append(segment)
    return segments
```
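A minimal smoke test for the splitter, reusing the imports above: two 1 s tones separated by silence should come back as two 16000-sample segments. The adaptive threshold makes the interval count input-dependent, so treat the expected output as a plausible result rather than a guarantee:

```python
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.3 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
silence = np.zeros(sr // 2, dtype=np.float32)
test_signal = np.concatenate([silence, tone, silence, tone, silence])

segments = split_input(test_signal)
print(len(segments), {len(s) for s in segments})  # expected: 2 {16000}
```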
```python
def process_audio(filepath):
    # Load at the 16 kHz rate the HuBERT feature extractor expects,
    # denoise, peak-normalize, then split into per-utterance segments.
    audio, sr = librosa.load(filepath, sr=16000)
    audio = noisereduce.reduce_noise(y=audio, sr=sr)
    audio = librosa.util.normalize(audio)
    return split_input(audio)

def classify_utterances(filepath):
    audios = process_audio(filepath)
    # Take the top prediction for each segment and map its binary label
    # to the corresponding Arabic letter + harakah.
    output = [pipe(audio)[0] for audio in audios]
    predictions = [arabic_representation[x['label']] for x in output]
    return ' '.join(predictions)
```
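A quick way to exercise the whole chain locally before wiring up the UI; `sample.wav` is a hypothetical placeholder, not a file from the original post:

```python
# "sample.wav" is a placeholder path -- substitute any short recording.
print(classify_utterances("sample.wav"))
```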
```python
demo = gr.Blocks()

mic_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['microphone'], type='filepath'),
    outputs=gr.Textbox()
)

file_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['upload'], type='filepath'),
    outputs=gr.Textbox()
)

with demo:
    gr.TabbedInterface(
        [mic_classification, file_classification],
        ['Classify Microphone', 'Classify Audio File']
    )

demo.launch()
```
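The Space status above only says "Build error" and no log excerpt is included, so the cause is a guess. A frequent culprit on Spaces is a missing or incomplete `requirements.txt`; a minimal file covering every import used above would be (contents inferred from the imports, not taken from the original Space):

```
transformers
torch
gradio
librosa
noisereduce
numpy
```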