Spaces:
Running
Running
David Dale
committed on
Commit
·
baeda9f
1
Parent(s):
b9fc216
add language names and best systems in the leaderboard; code linting
Browse files- app.py +20 -12
- leaderboard.py +91 -37
app.py
CHANGED
|
@@ -4,13 +4,15 @@
|
|
| 4 |
# This source code is licensed under the license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
import gradio as gr
|
| 8 |
-
import os
|
| 9 |
import csv
|
| 10 |
-
import
|
| 11 |
from collections import defaultdict
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
from leaderboard import leaderboard_tab
|
|
|
|
| 14 |
# from data_samples import data_browse_tab
|
| 15 |
|
| 16 |
DLA = """
|
|
@@ -68,9 +70,11 @@ Meta may make changes to this Agreement at any time with notice to You and the o
|
|
| 68 |
No failure to exercise and no delay in exercising any right, remedy or power hereunder will operate as a waiver thereof, nor will any single or partial exercise of any right, remedy or power hereunder will operate as a waiver thereof, or the exercise of any other right, remedy or power provided herein or by law or in equity.
|
| 69 |
"""
|
| 70 |
|
|
|
|
| 71 |
def guidelines_tab():
|
| 72 |
with gr.Tab("Translation guidelines"):
|
| 73 |
-
gr.Markdown(
|
|
|
|
| 74 |
# Contributor Guidelines
|
| 75 |
|
| 76 |
# 0\\. Objective
|
|
@@ -195,16 +199,19 @@ Considering that some languages frequently resort to code-mixing, it is allowed
|
|
| 195 |
It may be the case that some words or expressions of the source language have more than one translation (e.g., in English, both "Bombay" and "Mumbai" refer to the same place). When making your choice, please ensure that:
|
| 196 |
* The translation is culturally informed; i.e., please refrain from using a negatively connotated or dispreferred translation;
|
| 197 |
* Recurring items are consistently translated throughout the dataset; i.e., please do not alternate between translation options if not necessary.
|
| 198 |
-
"""
|
|
|
|
| 199 |
|
| 200 |
|
| 201 |
def dla_tab():
|
| 202 |
with gr.Tab("Dataset License"):
|
| 203 |
gr.Markdown(DLA)
|
| 204 |
|
|
|
|
| 205 |
def intro_tab():
|
| 206 |
with gr.Tab("Intro"):
|
| 207 |
-
gr.Markdown(
|
|
|
|
| 208 |
## Let’s make machine translation available for any written language!
|
| 209 |
|
| 210 |
Please take part in shaping the future - your help will be greatly appreciated.
|
|
@@ -237,7 +244,8 @@ If you want to contribute dataset translations for a new language or validate ex
|
|
| 237 |
* \\[Omnilingual MT Team et al., 2025\\] Omnilingual MT Team, BOUQuET 💐 : dataset, Benchmark and Open initiative for Universal Quality Evaluation in Translation, ArXiv, 2025
|
| 238 |
|
| 239 |
|
| 240 |
-
"""
|
|
|
|
| 241 |
|
| 242 |
|
| 243 |
with gr.Blocks(
|
|
@@ -248,16 +256,16 @@ with gr.Blocks(
|
|
| 248 |
font-size: 0.8em;
|
| 249 |
}
|
| 250 |
""",
|
| 251 |
-
theme=gr.themes.Glass(
|
| 252 |
-
font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]
|
| 253 |
-
),
|
| 254 |
) as demo:
|
| 255 |
with gr.Blocks(
|
| 256 |
elem_id="root",
|
| 257 |
):
|
| 258 |
-
gr.Markdown(
|
|
|
|
| 259 |
# Welcome to BOUQuET 💐 , Benchmark and Open-initiative for Universal Quality Evaluation in Translation.
|
| 260 |
-
"""
|
|
|
|
| 261 |
intro_tab()
|
| 262 |
leaderboard_tab()
|
| 263 |
# data_browse_tab()
|
|
|
|
| 4 |
# This source code is licensed under the license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
|
|
|
|
|
|
| 7 |
import csv
|
| 8 |
+
import os
|
| 9 |
from collections import defaultdict
|
| 10 |
|
| 11 |
+
import gradio as gr
|
| 12 |
+
import pandas as pd
|
| 13 |
+
|
| 14 |
from leaderboard import leaderboard_tab
|
| 15 |
+
|
| 16 |
# from data_samples import data_browse_tab
|
| 17 |
|
| 18 |
DLA = """
|
|
|
|
| 70 |
No failure to exercise and no delay in exercising any right, remedy or power hereunder will operate as a waiver thereof, nor will any single or partial exercise of any right, remedy or power hereunder will operate as a waiver thereof, or the exercise of any other right, remedy or power provided herein or by law or in equity.
|
| 71 |
"""
|
| 72 |
|
| 73 |
+
|
| 74 |
def guidelines_tab():
|
| 75 |
with gr.Tab("Translation guidelines"):
|
| 76 |
+
gr.Markdown(
|
| 77 |
+
"""
|
| 78 |
# Contributor Guidelines
|
| 79 |
|
| 80 |
# 0\\. Objective
|
|
|
|
| 199 |
It may be the case that some words or expressions of the source language have more than one translation (e.g., in English, both "Bombay" and "Mumbai" refer to the same place). When making your choice, please ensure that:
|
| 200 |
* The translation is culturally informed; i.e., please refrain from using a negatively connotated or dispreferred translation;
|
| 201 |
* Recurring items are consistently translated throughout the dataset; i.e., please do not alternate between translation options if not necessary.
|
| 202 |
+
"""
|
| 203 |
+
)
|
| 204 |
|
| 205 |
|
| 206 |
def dla_tab():
|
| 207 |
with gr.Tab("Dataset License"):
|
| 208 |
gr.Markdown(DLA)
|
| 209 |
|
| 210 |
+
|
| 211 |
def intro_tab():
|
| 212 |
with gr.Tab("Intro"):
|
| 213 |
+
gr.Markdown(
|
| 214 |
+
"""
|
| 215 |
## Let’s make machine translation available for any written language!
|
| 216 |
|
| 217 |
Please take part in shaping the future - your help will be greatly appreciated.
|
|
|
|
| 244 |
* \\[Omnilingual MT Team et al., 2025\\] Omnilingual MT Team, BOUQuET 💐 : dataset, Benchmark and Open initiative for Universal Quality Evaluation in Translation, ArXiv, 2025
|
| 245 |
|
| 246 |
|
| 247 |
+
"""
|
| 248 |
+
)
|
| 249 |
|
| 250 |
|
| 251 |
with gr.Blocks(
|
|
|
|
| 256 |
font-size: 0.8em;
|
| 257 |
}
|
| 258 |
""",
|
| 259 |
+
theme=gr.themes.Glass(font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]),
|
|
|
|
|
|
|
| 260 |
) as demo:
|
| 261 |
with gr.Blocks(
|
| 262 |
elem_id="root",
|
| 263 |
):
|
| 264 |
+
gr.Markdown(
|
| 265 |
+
"""
|
| 266 |
# Welcome to BOUQuET 💐 , Benchmark and Open-initiative for Universal Quality Evaluation in Translation.
|
| 267 |
+
"""
|
| 268 |
+
)
|
| 269 |
intro_tab()
|
| 270 |
leaderboard_tab()
|
| 271 |
# data_browse_tab()
|
leaderboard.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import gradio as gr
|
| 4 |
import csv
|
| 5 |
from collections import defaultdict
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def strip_colname(x):
|
| 9 |
-
if x.startswith(
|
| 10 |
return x[6:]
|
| 11 |
return x
|
| 12 |
|
|
@@ -46,8 +46,8 @@ Descriptions of the implementation of the systems will come out later.
|
|
| 46 |
def leaderboard_tab():
|
| 47 |
stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
|
| 48 |
stats.columns = [strip_colname(c) for c in stats.columns]
|
| 49 |
-
|
| 50 |
-
metrics = [
|
| 51 |
systems = sorted(set(stats["system"]))
|
| 52 |
levels = ["sentence_level", "paragraph_level"]
|
| 53 |
ALL = "ALL"
|
|
@@ -65,8 +65,29 @@ def leaderboard_tab():
|
|
| 65 |
lang_tgt2src[tgt_lang].add(src_lang)
|
| 66 |
langs_src.add(src_lang)
|
| 67 |
langs_tgt.add(tgt_lang)
|
| 68 |
-
|
| 69 |
langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
with gr.Tab("Leaderboard"):
|
| 72 |
gr.Markdown("# BOUQuET translation leaderboard")
|
|
@@ -75,8 +96,12 @@ def leaderboard_tab():
|
|
| 75 |
gr.Markdown("## Systems ranking")
|
| 76 |
# Inputs
|
| 77 |
gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
|
| 78 |
-
gr_src_lang = gr.Dropdown(
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
# Interactivity
|
| 82 |
inputs = [gr_level, gr_src_lang, gr_tgt_lang]
|
|
@@ -87,39 +112,52 @@ def leaderboard_tab():
|
|
| 87 |
filtered = filtered[filtered["src_lang"].eq(src_lang)]
|
| 88 |
if tgt_lang != ALL:
|
| 89 |
filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
|
| 90 |
-
means =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
means.columns = [strip_colname(c) for c in means.columns]
|
| 92 |
styler = means.style.background_gradient().format(precision=4)
|
| 93 |
return styler
|
| 94 |
|
| 95 |
df_all = get_lb(*[inp.value for inp in inputs])
|
| 96 |
gr_df = gr.Dataframe(df_all)
|
| 97 |
-
|
| 98 |
for inp in inputs:
|
| 99 |
inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)
|
| 100 |
-
|
| 101 |
# Interdependency of the controls
|
| 102 |
def src2tgt(src_lang, tgt_lang):
|
| 103 |
if src_lang == ALL:
|
| 104 |
-
choices = [ALL] + sorted(langs_tgt)
|
| 105 |
else:
|
| 106 |
-
choices = [ALL] + sorted(lang_src2tgt[src_lang])
|
| 107 |
-
|
| 108 |
return gr.update(choices=choices, value=tgt_lang)
|
| 109 |
-
|
| 110 |
def tgt2src(src_lang, tgt_lang):
|
| 111 |
if tgt_lang == ALL:
|
| 112 |
-
choices = [ALL] + sorted(langs_src)
|
| 113 |
else:
|
| 114 |
-
choices = [ALL] + sorted(lang_tgt2src[tgt_lang])
|
| 115 |
return gr.update(choices=choices, value=src_lang)
|
| 116 |
-
|
| 117 |
-
gr_src_lang.input(
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
gr.Markdown("## Languages difficulty")
|
| 121 |
-
gr_system = gr.Dropdown(
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
|
| 124 |
gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
|
| 125 |
bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]
|
|
@@ -127,42 +165,58 @@ def leaderboard_tab():
|
|
| 127 |
def get_hist(system, direction, metric, level):
|
| 128 |
# decide on the data to process
|
| 129 |
if direction == EN2XX:
|
| 130 |
-
direction_filter = stats[
|
| 131 |
lang_col = "tgt_lang"
|
| 132 |
else:
|
| 133 |
-
direction_filter = stats[
|
| 134 |
lang_col = "src_lang"
|
| 135 |
if system in (MEAN, BEST):
|
| 136 |
system_filter = stats["system"].astype(bool)
|
| 137 |
else:
|
| 138 |
-
system_filter = stats[
|
| 139 |
subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
|
| 140 |
|
| 141 |
# Compute the means and update the plot
|
| 142 |
grouped = subset.groupby(lang_col)[metric]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
if system == BEST:
|
| 144 |
-
|
| 145 |
-
means = grouped.min()
|
| 146 |
-
else:
|
| 147 |
-
means = grouped.max()
|
| 148 |
else:
|
| 149 |
means = grouped.mean()
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
)
|
| 153 |
-
|
|
|
|
|
|
|
| 154 |
return gr.update(
|
| 155 |
-
value=
|
|
|
|
|
|
|
|
|
|
| 156 |
height=500,
|
| 157 |
-
sort="y",
|
|
|
|
| 158 |
)
|
| 159 |
-
|
| 160 |
default_bar = get_hist(*[x.value for x in bar_controls])
|
| 161 |
gr_barplot = gr.BarPlot(**default_bar)
|
| 162 |
|
| 163 |
for inp in bar_controls:
|
| 164 |
inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)
|
| 165 |
-
|
| 166 |
gr.Markdown(METRICS_EXPLANATION)
|
| 167 |
gr.Markdown(SYSTEMS_EXPLANATION)
|
| 168 |
gr.Markdown(LANGS_EXPLANATION)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import csv
|
| 2 |
from collections import defaultdict
|
| 3 |
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
|
| 8 |
def strip_colname(x):
|
| 9 |
+
if x.startswith("score_"):
|
| 10 |
return x[6:]
|
| 11 |
return x
|
| 12 |
|
|
|
|
| 46 |
def leaderboard_tab():
|
| 47 |
stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
|
| 48 |
stats.columns = [strip_colname(c) for c in stats.columns]
|
| 49 |
+
|
| 50 |
+
metrics = ["metricx_both", "xcomet_both", "CHRFpp", "glotlid_ref"]
|
| 51 |
systems = sorted(set(stats["system"]))
|
| 52 |
levels = ["sentence_level", "paragraph_level"]
|
| 53 |
ALL = "ALL"
|
|
|
|
| 65 |
lang_tgt2src[tgt_lang].add(src_lang)
|
| 66 |
langs_src.add(src_lang)
|
| 67 |
langs_tgt.add(tgt_lang)
|
| 68 |
+
|
| 69 |
langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
|
| 70 |
+
lang2name = {}
|
| 71 |
+
for i, row in langs_df.iterrows():
|
| 72 |
+
code = row["ISO 639-3"] + "_" + row["ISO 15924"]
|
| 73 |
+
if isinstance(row["Glottocode"], str) and len(row["Glottocode"]) > 0:
|
| 74 |
+
code = code + "_" + row["Glottocode"]
|
| 75 |
+
lang2name[code] = row["Language"]
|
| 76 |
+
|
| 77 |
+
if isinstance(row["Secondary ISO 639-3"], str) and len(
|
| 78 |
+
row["Secondary ISO 639-3"]
|
| 79 |
+
):
|
| 80 |
+
code = row["Secondary ISO 639-3"] + code[3:]
|
| 81 |
+
lang2name[code] = row["Language"]
|
| 82 |
+
for lang in langs_src.union(langs_tgt):
|
| 83 |
+
if lang not in lang2name:
|
| 84 |
+
print(f"Name not found for {lang}")
|
| 85 |
+
|
| 86 |
+
def named_langs(langs_list):
|
| 87 |
+
return [
|
| 88 |
+
(f"{lang} — {lang2name[lang]}", lang) if lang in lang2name else lang
|
| 89 |
+
for lang in langs_list
|
| 90 |
+
]
|
| 91 |
|
| 92 |
with gr.Tab("Leaderboard"):
|
| 93 |
gr.Markdown("# BOUQuET translation leaderboard")
|
|
|
|
| 96 |
gr.Markdown("## Systems ranking")
|
| 97 |
# Inputs
|
| 98 |
gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
|
| 99 |
+
gr_src_lang = gr.Dropdown(
|
| 100 |
+
[ALL] + named_langs(sorted(langs_src)), value=ALL, label="Source lang"
|
| 101 |
+
)
|
| 102 |
+
gr_tgt_lang = gr.Dropdown(
|
| 103 |
+
[ALL] + named_langs(sorted(langs_tgt)), value=ALL, label="Target lang"
|
| 104 |
+
)
|
| 105 |
|
| 106 |
# Interactivity
|
| 107 |
inputs = [gr_level, gr_src_lang, gr_tgt_lang]
|
|
|
|
| 112 |
filtered = filtered[filtered["src_lang"].eq(src_lang)]
|
| 113 |
if tgt_lang != ALL:
|
| 114 |
filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
|
| 115 |
+
means = (
|
| 116 |
+
filtered.groupby(["system"])[metrics]
|
| 117 |
+
.mean()
|
| 118 |
+
.reset_index()
|
| 119 |
+
.sort_values("metricx_both")
|
| 120 |
+
)
|
| 121 |
means.columns = [strip_colname(c) for c in means.columns]
|
| 122 |
styler = means.style.background_gradient().format(precision=4)
|
| 123 |
return styler
|
| 124 |
|
| 125 |
df_all = get_lb(*[inp.value for inp in inputs])
|
| 126 |
gr_df = gr.Dataframe(df_all)
|
| 127 |
+
|
| 128 |
for inp in inputs:
|
| 129 |
inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)
|
| 130 |
+
|
| 131 |
# Interdependency of the controls
|
| 132 |
def src2tgt(src_lang, tgt_lang):
|
| 133 |
if src_lang == ALL:
|
| 134 |
+
choices = [ALL] + named_langs(sorted(langs_tgt))
|
| 135 |
else:
|
| 136 |
+
choices = [ALL] + named_langs(sorted(lang_src2tgt[src_lang]))
|
| 137 |
+
|
| 138 |
return gr.update(choices=choices, value=tgt_lang)
|
| 139 |
+
|
| 140 |
def tgt2src(src_lang, tgt_lang):
|
| 141 |
if tgt_lang == ALL:
|
| 142 |
+
choices = [ALL] + named_langs(sorted(langs_src))
|
| 143 |
else:
|
| 144 |
+
choices = [ALL] + named_langs(sorted(lang_tgt2src[tgt_lang]))
|
| 145 |
return gr.update(choices=choices, value=src_lang)
|
| 146 |
+
|
| 147 |
+
gr_src_lang.input(
|
| 148 |
+
fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang
|
| 149 |
+
)
|
| 150 |
+
gr_tgt_lang.input(
|
| 151 |
+
fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang
|
| 152 |
+
)
|
| 153 |
|
| 154 |
gr.Markdown("## Languages difficulty")
|
| 155 |
+
gr_system = gr.Dropdown(
|
| 156 |
+
[MEAN, BEST] + systems, value=MEAN, label="Translation system"
|
| 157 |
+
)
|
| 158 |
+
gr_direction = gr.Dropdown(
|
| 159 |
+
[XX2EN, EN2XX], value=XX2EN, label="Translation direction"
|
| 160 |
+
)
|
| 161 |
gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
|
| 162 |
gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
|
| 163 |
bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]
|
|
|
|
| 165 |
def get_hist(system, direction, metric, level):
|
| 166 |
# decide on the data to process
|
| 167 |
if direction == EN2XX:
|
| 168 |
+
direction_filter = stats["src_lang"].eq("eng_Latn")
|
| 169 |
lang_col = "tgt_lang"
|
| 170 |
else:
|
| 171 |
+
direction_filter = stats["tgt_lang"].eq("eng_Latn")
|
| 172 |
lang_col = "src_lang"
|
| 173 |
if system in (MEAN, BEST):
|
| 174 |
system_filter = stats["system"].astype(bool)
|
| 175 |
else:
|
| 176 |
+
system_filter = stats["system"].eq(system)
|
| 177 |
subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
|
| 178 |
|
| 179 |
# Compute the means and update the plot
|
| 180 |
grouped = subset.groupby(lang_col)[metric]
|
| 181 |
+
if metric == "metricx_both":
|
| 182 |
+
bests = grouped.min()
|
| 183 |
+
best_sys = grouped.idxmin()
|
| 184 |
+
else:
|
| 185 |
+
bests = grouped.max()
|
| 186 |
+
best_sys = grouped.idxmax()
|
| 187 |
if system == BEST:
|
| 188 |
+
means = bests
|
|
|
|
|
|
|
|
|
|
| 189 |
else:
|
| 190 |
means = grouped.mean()
|
| 191 |
+
report = (
|
| 192 |
+
pd.DataFrame(
|
| 193 |
+
{
|
| 194 |
+
metric: means,
|
| 195 |
+
"best_system": subset.loc[best_sys]["system"].values,
|
| 196 |
+
}
|
| 197 |
+
)
|
| 198 |
+
.sort_values(metric, ascending=(metric == "metricx_both"))
|
| 199 |
+
.reset_index()
|
| 200 |
)
|
| 201 |
+
report["lang_name"] = [lang2name.get(lang, "") for lang in report[lang_col]]
|
| 202 |
+
tooltip_columns = ["lang_name", "best_system"]
|
| 203 |
+
|
| 204 |
return gr.update(
|
| 205 |
+
value=report,
|
| 206 |
+
x=lang_col,
|
| 207 |
+
y=metric,
|
| 208 |
+
x_label_angle=-90,
|
| 209 |
height=500,
|
| 210 |
+
sort="y",
|
| 211 |
+
tooltip=tooltip_columns,
|
| 212 |
)
|
| 213 |
+
|
| 214 |
default_bar = get_hist(*[x.value for x in bar_controls])
|
| 215 |
gr_barplot = gr.BarPlot(**default_bar)
|
| 216 |
|
| 217 |
for inp in bar_controls:
|
| 218 |
inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)
|
| 219 |
+
|
| 220 |
gr.Markdown(METRICS_EXPLANATION)
|
| 221 |
gr.Markdown(SYSTEMS_EXPLANATION)
|
| 222 |
gr.Markdown(LANGS_EXPLANATION)
|