Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,20 +5,18 @@ import numpy as np
|
|
| 5 |
import torch
|
| 6 |
import spaces
|
| 7 |
|
| 8 |
-
#
|
| 9 |
print("Loading model...")
|
| 10 |
model_id = "badrex/mms-300m-arabic-dialect-identifier"
|
| 11 |
classifier = pipeline("audio-classification", model=model_id, device='cuda')
|
| 12 |
print("Model loaded successfully")
|
| 13 |
print("Model moved to GPU successfully")
|
| 14 |
|
| 15 |
-
|
| 16 |
@spaces.GPU
|
| 17 |
def predict(audio_segment, sr=16000):
|
| 18 |
return classifier({"sampling_rate": sr, "raw": audio_segment})
|
| 19 |
|
| 20 |
-
|
| 21 |
-
# Define dialect mapping
|
| 22 |
dialect_mapping = {
|
| 23 |
"MSA": "Modern Standard Arabic (MSA) - العربية الفصحى الحديثة",
|
| 24 |
"Egyptian": "Egyptian Arabic - اللهجة المصرية العامية",
|
|
@@ -31,16 +29,12 @@ def predict_dialect(audio):
|
|
| 31 |
if audio is None:
|
| 32 |
return {"Error": 1.0}
|
| 33 |
|
| 34 |
-
# The audio input from Gradio is a tuple of (sample_rate, audio_array)
|
| 35 |
sr, audio_array = audio
|
| 36 |
|
| 37 |
-
# Process the audio input
|
| 38 |
if len(audio_array.shape) > 1:
|
| 39 |
-
audio_array = audio_array.mean(axis=1)
|
| 40 |
|
| 41 |
-
# Convert audio to float32 if it's not already (fix for Chrome recording issue)
|
| 42 |
if audio_array.dtype != np.float32:
|
| 43 |
-
# Normalize to [-1, 1] range as expected by the model
|
| 44 |
if audio_array.dtype == np.int16:
|
| 45 |
audio_array = audio_array.astype(np.float32) / 32768.0
|
| 46 |
else:
|
|
@@ -48,10 +42,8 @@ def predict_dialect(audio):
|
|
| 48 |
|
| 49 |
print(f"Processing audio: sample rate={sr}, shape={audio_array.shape}")
|
| 50 |
|
| 51 |
-
# Classify the dialect
|
| 52 |
predictions = predict(sr=sr, audio_segment=audio_array)
|
| 53 |
|
| 54 |
-
# Format results for display
|
| 55 |
results = {}
|
| 56 |
for pred in predictions:
|
| 57 |
dialect_name = dialect_mapping.get(pred['label'], pred['label'])
|
|
@@ -59,110 +51,36 @@ def predict_dialect(audio):
|
|
| 59 |
|
| 60 |
return results
|
| 61 |
|
| 62 |
-
#
|
| 63 |
examples = []
|
| 64 |
examples_dir = "examples"
|
| 65 |
if os.path.exists(examples_dir):
|
| 66 |
for filename in os.listdir(examples_dir):
|
| 67 |
if filename.endswith((".wav", ".mp3", ".ogg")):
|
| 68 |
examples.append([os.path.join(examples_dir, filename)])
|
| 69 |
-
|
| 70 |
print(f"Found {len(examples)} example files")
|
| 71 |
else:
|
| 72 |
print("Examples directory not found")
|
| 73 |
|
|
|
|
|
|
|
|
|
|
| 74 |
|
|
|
|
|
|
|
| 75 |
|
| 76 |
-
|
| 77 |
-
custom_css = """
|
| 78 |
-
<style>
|
| 79 |
-
.centered-content {
|
| 80 |
-
text-align: center;
|
| 81 |
-
max-width: 800px;
|
| 82 |
-
margin: 0 auto;
|
| 83 |
-
padding: 20px;
|
| 84 |
-
}
|
| 85 |
-
|
| 86 |
-
.logo-image {
|
| 87 |
-
width: 200px;
|
| 88 |
-
height: auto;
|
| 89 |
-
margin: 20px auto;
|
| 90 |
-
display: block;
|
| 91 |
-
}
|
| 92 |
-
|
| 93 |
-
.description-text {
|
| 94 |
-
font-size: 16px;
|
| 95 |
-
line-height: 1.6;
|
| 96 |
-
margin-bottom: 20px;
|
| 97 |
-
}
|
| 98 |
-
|
| 99 |
-
.dialect-list {
|
| 100 |
-
font-size: 15px;
|
| 101 |
-
line-height: 1.8;
|
| 102 |
-
text-align: left;
|
| 103 |
-
max-width: 600px;
|
| 104 |
-
margin: 0 auto;
|
| 105 |
-
}
|
| 106 |
-
|
| 107 |
-
.highlight-text {
|
| 108 |
-
font-size: 16px;
|
| 109 |
-
color: #2563eb;
|
| 110 |
-
margin: 20px 0;
|
| 111 |
-
}
|
| 112 |
-
|
| 113 |
-
.footer-text {
|
| 114 |
-
font-size: 13px;
|
| 115 |
-
color: #6b7280;
|
| 116 |
-
margin-top: 20px;
|
| 117 |
-
}
|
| 118 |
-
</style>
|
| 119 |
-
"""
|
| 120 |
-
|
| 121 |
-
"""
|
| 122 |
-
<p style="font-size: 15px; line-height: 1.8;">
|
| 123 |
-
<strong>The following Arabic language varieties are supported:</strong>
|
| 124 |
-
<br><br>
|
| 125 |
-
✦ <strong>Modern Standard Arabic (MSA)</strong> - The formal language of media and education
|
| 126 |
-
<br>
|
| 127 |
-
✦ <strong>Egyptian Arabic</strong> - The dialect of Cairo, Alexandria, and popular Arabic cinema
|
| 128 |
-
<br>
|
| 129 |
-
✦ <strong>peninsular Arabic</strong> - Spoken across Saudi Arabia, UAE, Kuwait, Qatar, Bahrain, and Oman
|
| 130 |
-
<br>
|
| 131 |
-
✦ <strong>Levantine Arabic</strong> - The dialect of Syria, Lebanon, Jordan, and Palestine
|
| 132 |
-
<br>
|
| 133 |
-
✦ <strong>Maghrebi Arabic</strong> - The distinctive varieties of Morocco, Algeria, Tunisia, and Libya
|
| 134 |
-
</p>
|
| 135 |
-
<br>
|
| 136 |
"""
|
| 137 |
|
| 138 |
-
# Create the Gradio interface
|
| 139 |
demo = gr.Interface(
|
| 140 |
fn=predict_dialect,
|
| 141 |
inputs=gr.Audio(),
|
| 142 |
outputs=gr.Label(num_top_classes=5, label="Predicted Dialect"),
|
| 143 |
title="Tamyïz 🍉 Arabic Dialect Identification in Speech",
|
| 144 |
-
description=
|
| 145 |
-
<div class="centered-content">
|
| 146 |
-
<div>
|
| 147 |
-
<p>
|
| 148 |
-
By <a href="https://badrex.github.io/" style="color: #2563eb;">Badr Alabsi</a> with ❤️🤍💚
|
| 149 |
-
</p>
|
| 150 |
-
<br>
|
| 151 |
-
<p style="font-size: 15px; line-height: 1.8;">
|
| 152 |
-
This is a demo for the accurate and robust Transformer-based <a href="https://huggingface.co/badrex/mms-300m-arabic-dialect-identifier" style="color: #FF5349;">model</a> for Spoken Arabic Dialect Identification (ADI).
|
| 153 |
-
From just a short audio clip (5-10 seconds), the model can identify Modern Standard Arabic (<strong>MSA</strong>) as well as four major regional Arabic varieties: <strong>Egyptian</strong> Arabic, <strong>Peninsular</strong> Arabic (Gulf, Yemeni, and Iraqi), <strong>Levantine</strong> Arabic, and <strong>Maghrebi</strong> Arabic.
|
| 154 |
-
<br>
|
| 155 |
-
<p style="font-size: 15px; line-height: 1.8;">
|
| 156 |
-
Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
|
| 157 |
-
</p>
|
| 158 |
-
</div>
|
| 159 |
-
</div>
|
| 160 |
-
""",
|
| 161 |
examples=examples if examples else None,
|
| 162 |
-
cache_examples=False,
|
| 163 |
-
#theme=gr.themes.Default(font=[gr.themes.GoogleFont("Amiri"), "Arial", "serif"]),
|
| 164 |
flagging_mode=None
|
| 165 |
)
|
| 166 |
|
| 167 |
-
# Launch the app
|
| 168 |
demo.launch(share=True)
|
|
|
|
| 5 |
import torch
|
| 6 |
import spaces
|
| 7 |
|
| 8 |
+
# load the model
|
| 9 |
print("Loading model...")
|
| 10 |
model_id = "badrex/mms-300m-arabic-dialect-identifier"
|
| 11 |
classifier = pipeline("audio-classification", model=model_id, device='cuda')
|
| 12 |
print("Model loaded successfully")
|
| 13 |
print("Model moved to GPU successfully")
|
| 14 |
|
|
|
|
| 15 |
@spaces.GPU
|
| 16 |
def predict(audio_segment, sr=16000):
|
| 17 |
return classifier({"sampling_rate": sr, "raw": audio_segment})
|
| 18 |
|
| 19 |
+
# define dialect mapping
|
|
|
|
| 20 |
dialect_mapping = {
|
| 21 |
"MSA": "Modern Standard Arabic (MSA) - العربية الفصحى الحديثة",
|
| 22 |
"Egyptian": "Egyptian Arabic - اللهجة المصرية العامية",
|
|
|
|
| 29 |
if audio is None:
|
| 30 |
return {"Error": 1.0}
|
| 31 |
|
|
|
|
| 32 |
sr, audio_array = audio
|
| 33 |
|
|
|
|
| 34 |
if len(audio_array.shape) > 1:
|
| 35 |
+
audio_array = audio_array.mean(axis=1)
|
| 36 |
|
|
|
|
| 37 |
if audio_array.dtype != np.float32:
|
|
|
|
| 38 |
if audio_array.dtype == np.int16:
|
| 39 |
audio_array = audio_array.astype(np.float32) / 32768.0
|
| 40 |
else:
|
|
|
|
| 42 |
|
| 43 |
print(f"Processing audio: sample rate={sr}, shape={audio_array.shape}")
|
| 44 |
|
|
|
|
| 45 |
predictions = predict(sr=sr, audio_segment=audio_array)
|
| 46 |
|
|
|
|
| 47 |
results = {}
|
| 48 |
for pred in predictions:
|
| 49 |
dialect_name = dialect_mapping.get(pred['label'], pred['label'])
|
|
|
|
| 51 |
|
| 52 |
return results
|
| 53 |
|
| 54 |
+
# prepare examples
|
| 55 |
examples = []
|
| 56 |
examples_dir = "examples"
|
| 57 |
if os.path.exists(examples_dir):
|
| 58 |
for filename in os.listdir(examples_dir):
|
| 59 |
if filename.endswith((".wav", ".mp3", ".ogg")):
|
| 60 |
examples.append([os.path.join(examples_dir, filename)])
|
|
|
|
| 61 |
print(f"Found {len(examples)} example files")
|
| 62 |
else:
|
| 63 |
print("Examples directory not found")
|
| 64 |
|
| 65 |
+
# clean description without problematic HTML
|
| 66 |
+
description = """
|
| 67 |
+
By <a href="https://badrex.github.io/">Badr Alabsi</a> with ❤️🤍💚
|
| 68 |
|
| 69 |
+
This is a demo for the accurate and robust Transformer-based <a href="https://huggingface.co/badrex/mms-300m-arabic-dialect-identifier">model</a> for Spoken Arabic Dialect Identification (ADI).
|
| 70 |
+
From just a short audio clip (5-10 seconds), the model can identify Modern Standard Arabic (MSA) as well as four major regional Arabic varieties: Egyptian Arabic, Peninsular Arabic (Gulf, Yemeni, and Iraqi), Levantine Arabic, and Maghrebi Arabic.
|
| 71 |
|
| 72 |
+
Simply **upload an audio file** 📤 or **record yourself speaking** 🎙️⏺️ to try out the model!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
"""
|
| 74 |
|
|
|
|
| 75 |
demo = gr.Interface(
|
| 76 |
fn=predict_dialect,
|
| 77 |
inputs=gr.Audio(),
|
| 78 |
outputs=gr.Label(num_top_classes=5, label="Predicted Dialect"),
|
| 79 |
title="Tamyïz 🍉 Arabic Dialect Identification in Speech",
|
| 80 |
+
description=description,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
examples=examples if examples else None,
|
| 82 |
+
cache_examples=False,
|
|
|
|
| 83 |
flagging_mode=None
|
| 84 |
)
|
| 85 |
|
|
|
|
| 86 |
demo.launch(share=True)
|