Spaces:
Runtime error
Runtime error
Ceyda Cinarel
committed on
Commit
·
8af4bd8
1
Parent(s):
3eecce7
first commit
Browse files- README.md +18 -2
- common_voice.py +164 -0
- requirements.txt +6 -0
README.md
CHANGED
|
@@ -1,2 +1,18 @@
|
|
| 1 |
-
#
|
| 2 |
-
Common Voice Dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Common Voice Dataset Explorer
|
| 2 |
+
The [Common Voice Dataset](https://commonvoice.mozilla.org/en/datasets) is published by Mozilla.
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
Made during [huggingface finetuning week](https://discuss.huggingface.co/t/open-to-the-community-xlsr-wav2vec2-fine-tuning-week-for-low-resource-languages/4467)
|
| 6 |
+
|
| 7 |
+
# Usage
|
| 8 |
+
`pip install -r requirements.txt`
|
| 9 |
+
`streamlit run common_voice.py`
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Details
|
| 13 |
+
- Made using streamlit
|
| 14 |
+
- Using https://github.com/PablocFonseca/streamlit-aggrid for interactivity, because you can't click plots yet.
|
| 15 |
+
|
| 16 |
+
I put this together as quickly as I could, so it is not perfect.
|
| 17 |
+
Open a PR or issue~
|
| 18 |
+
|
common_voice.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
# os.environ['TRANSFORMERS_CACHE'] = '/mnt/hf_cache'
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode, JsCode
|
| 10 |
+
import plotly.express as px
|
| 11 |
+
|
| 12 |
+
# pd.options.plotting.backend = "plotly"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# TODO show average sentence length
|
| 16 |
+
# TODO show audio stats
|
| 17 |
+
# TODO speed things up with better caching
|
| 18 |
+
|
| 19 |
+
# hide_menu_style = """
|
| 20 |
+
# <style>
|
| 21 |
+
# #MainMenu {visibility: hidden;}
|
| 22 |
+
# </style>
|
| 23 |
+
# """
|
| 24 |
+
# st.markdown(hide_menu_style, unsafe_allow_html=True)
|
| 25 |
+
|
| 26 |
+
@st.cache(suppress_st_warning=True)
def cache_graph(dat, y, x, color=None):
    """Return a Plotly Express bar chart of ``dat``, wrapped in ``st.cache``.

    Args:
        dat: Data handed straight to ``px.bar``.
        y: Column plotted on the y axis.
        x: Column plotted on the x axis.
        color: Optional column used to colour the bars.
    """
    # NOTE(review): the original author doubted that caching is effective
    # for this function; behaviour is kept as-is.
    figure = px.bar(dat, x=x, y=y, color=color)
    return figure
|
| 30 |
+
|
| 31 |
+
@st.cache(suppress_st_warning=True)
def cache_dataset(language, split=None):
    """Load the ``common_voice`` dataset for ``language``, wrapped in ``st.cache``.

    Args:
        language: Configuration name passed to ``load_dataset``.
        split: Split expression forwarded to ``load_dataset``.

    Returns:
        A ``pandas.DataFrame`` built from the loaded data when ``split`` is
        truthy; otherwise the object returned by ``load_dataset`` unchanged.
    """
    loaded = load_dataset("common_voice", language, split=split)
    return pd.DataFrame(loaded) if split else loaded
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Language codes offered in the sidebar "Language code:" selector.
language_codes = [
    'ab', 'ar', 'as', 'br', 'ca', 'cnh', 'cs', 'cv', 'cy', 'de',
    'dv', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr',
    'fy-NL', 'ga-IE', 'hi', 'hsb', 'hu', 'ia', 'id', 'it', 'ja', 'ka',
    'kab', 'ky', 'lg', 'lt', 'lv', 'mn', 'mt', 'nl', 'or', 'pa-IN',
    'pl', 'pt', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sl', 'sv-SE',
    'ta', 'th', 'tr', 'tt', 'uk', 'vi', 'vot', 'zh-CN', 'zh-HK', 'zh-TW',
]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Settings passed to the AgGrid calls later in this script:
# `update_mode` and `data_return_mode` respectively.
update_mode_value = GridUpdateMode.SELECTION_CHANGED
return_mode_value = DataReturnMode.AS_INPUT
|
| 45 |
+
|
| 46 |
+
def configure_grid_stat(df):
    """Build the AgGrid options used for the aggregated statistics table.

    Columns are configured as non-editable, selection mode is "multiple"
    without checkboxes (multi-select on click enabled, row deselection not
    suppressed), and the grid uses the 'normal' DOM layout.

    Args:
        df: DataFrame the grid options are derived from.

    Returns:
        The result of ``GridOptionsBuilder.build()``.
    """
    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(editable=False)
    builder.configure_selection(
        "multiple",
        use_checkbox=False,
        rowMultiSelectWithClick=True,
        suppressRowDeselection=False,
    )
    builder.configure_grid_options(domLayout='normal')
    return builder.build()
|
| 58 |
+
|
| 59 |
+
def configure_grid_detail(df):
    """Build the AgGrid options used for the per-recording detail table.

    Columns are non-editable, the "sentence" column is initially pinned to
    the left, selection mode is "single" without checkboxes, pagination uses
    an automatic page size, and the grid uses the 'normal' DOM layout.

    Args:
        df: DataFrame the grid options are derived from.

    Returns:
        The result of ``GridOptionsBuilder.build()``.
    """
    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(editable=False)
    builder.configure_column("sentence", initialPinned="left")
    builder.configure_selection("single", use_checkbox=False)
    builder.configure_pagination(paginationAutoPageSize=True)
    builder.configure_grid_options(domLayout='normal')
    return builder.build()
|
| 74 |
+
|
| 75 |
+
# ---------------------------------------------------------------------------
# Sidebar: language and split selection, plus a row-count chart per split.
# ---------------------------------------------------------------------------
st.sidebar.markdown("# Common Voice Explorer")
st.sidebar.markdown('[Common Voice](https://commonvoice.mozilla.org/en/datasets) dataset by Mozilla')

language = st.sidebar.selectbox("Language code:", language_codes)

# The placeholder first shows a loading notice and is then overwritten by the
# split selector once the dataset has been loaded.
placeholder = st.sidebar.empty()
placeholder.markdown('Loading for the first time may take a while...downloading dataset :hourglass_flowing_sand:')
dat = cache_dataset(language, split=None)
selected_splits = placeholder.multiselect("Split:", list(dat.keys()), default=["train"])

split_stat = pd.DataFrame(list(dat.num_rows.items()), columns=['split', 'num_rows'])
fig = cache_graph(split_stat, y='split', x='num_rows')
st.sidebar.plotly_chart(fig, use_container_width=True, config=dict(displayModeBar=False))
st.sidebar.markdown("Dataset Explorer by [Ceyda Cinarel](https://github.com/cceyda/common-voice-explorer)")

# Without this guard an empty selection would be forwarded to cache_dataset,
# which then returns a non-DataFrame object and breaks the groupby below.
if not selected_splits:
    st.warning("Select at least one split in the sidebar.")
    st.stop()

# Several splits are combined with "+"; a single split joins to itself.
split = "+".join(selected_splits)
chart_data = cache_dataset(language, split=split)

# Columns offered for grouping, and the remaining per-recording columns.
cols = ["accent", "age", "down_votes", "gender", "locale", "segment", "up_votes"]
cols_other = ["sentence", "path", "client_id"]

# ---------------------------------------------------------------------------
# Main page: distribution chart, statistics table and detail/audio view.
# ---------------------------------------------------------------------------
st.markdown("# Stats")
st.markdown("## Distribution")

# The first selected column becomes the x axis, the second (if any) the colour.
attributes = st.multiselect("Columns:", cols)

if attributes:
    stats = chart_data.groupby(attributes).size().reset_index(name='counts')

    # `st.beta_columns` is missing from current Streamlit releases; prefer
    # `st.columns` and fall back to the beta name on old installations.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns(2)

    color = attributes[1] if len(attributes) > 1 else None
    fig = cache_graph(stats, x=attributes[0], y='counts', color=color)
    col1.plotly_chart(fig, use_container_width=True)

    gridOptions = configure_grid_stat(stats)
    with col2:
        selection = AgGrid(
            stats,
            data_return_mode=return_mode_value,
            update_mode=update_mode_value,
            fit_columns_on_grid_load=True,
            gridOptions=gridOptions,
        )
        st.write(":point_up: Click on the table to see details")

    selected_rows = selection['selected_rows']
    if selected_rows:
        # Keep every recording that matches ANY selected stats row; a row
        # matches when ALL grouping attributes are equal. Only the grouping
        # columns are compared, so the returned row dicts are not mutated and
        # any extra keys they carry are ignored.
        condition = pd.Series(False, index=chart_data.index)
        for row in selected_rows:
            sub_cond = pd.Series(True, index=chart_data.index)
            for a in attributes:
                sub_cond &= (chart_data[a] == row[a])
            condition |= sub_cond

        detail_frame = chart_data[condition]
        gridOptions = configure_grid_detail(detail_frame)

        detail_selection = AgGrid(
            detail_frame,
            data_return_mode=return_mode_value,
            update_mode=update_mode_value,
            gridOptions=gridOptions,
        )
        if detail_selection['selected_rows']:
            # Single-selection grid: play and show the one selected recording.
            example = detail_selection['selected_rows'][0]
            st.audio(example["path"])
            st.write(example["sentence"])
        else:
            st.write(":point_up: Click on the table to listen")

else:
    st.write(":point_up: Select a column or two")
|
| 163 |
+
|
| 164 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
streamlit
|
| 5 |
+
streamlit-aggrid
|
| 6 |
+
plotly
|