David Dale committed on
Commit
baeda9f
·
1 Parent(s): b9fc216

add language names and best systems in the leaderboard; code linting

Browse files
Files changed (2) hide show
  1. app.py +20 -12
  2. leaderboard.py +91 -37
app.py CHANGED
@@ -4,13 +4,15 @@
4
  # This source code is licensed under the license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
- import gradio as gr
8
- import os
9
  import csv
10
- import pandas as pd
11
  from collections import defaultdict
12
 
 
 
 
13
  from leaderboard import leaderboard_tab
 
14
  # from data_samples import data_browse_tab
15
 
16
  DLA = """
@@ -68,9 +70,11 @@ Meta may make changes to this Agreement at any time with notice to You and the o
68
  No failure to exercise and no delay in exercising any right, remedy or power hereunder will operate as a waiver thereof, nor will any single or partial exercise of any right, remedy or power hereunder will operate as a waiver thereof, or the exercise of any other right, remedy or power provided herein or by law or in equity.
69
  """
70
 
 
71
  def guidelines_tab():
72
  with gr.Tab("Translation guidelines"):
73
- gr.Markdown("""
 
74
  # Contributor Guidelines
75
 
76
  # 0\\. Objective
@@ -195,16 +199,19 @@ Considering that some languages frequently resort to code-mixing, it is allowed
195
  It may be the case that some words or expressions of the source language have more than one translation (e.g., in English, both "Bombay" and "Mumbai" refer to the same place). When making your choice, please ensure that:
196
  * The translation is culturally informed; i.e., please refrain from using a negatively connotated or dispreferred translation;
197
  * Recurring items are consistently translated throughout the dataset; i.e., please do not alternate between translation options if not necessary.
198
- """)
 
199
 
200
 
201
  def dla_tab():
202
  with gr.Tab("Dataset License"):
203
  gr.Markdown(DLA)
204
 
 
205
  def intro_tab():
206
  with gr.Tab("Intro"):
207
- gr.Markdown("""
 
208
  ## Let’s make machine translation available for any written language!
209
 
210
  Please take part in shaping the future - your help will be greatly appreciated.
@@ -237,7 +244,8 @@ If you want to contribute dataset translations for a new language or validate ex
237
  * \\[Omnilingual MT Team et al., 2025\\] Omnilingual MT Team, BOUQuET 💐 : dataset, Benchmark and Open initiative for Universal Quality Evaluation in Translation, ArXiv, 2025
238
 
239
 
240
- """)
 
241
 
242
 
243
  with gr.Blocks(
@@ -248,16 +256,16 @@ with gr.Blocks(
248
  font-size: 0.8em;
249
  }
250
  """,
251
- theme=gr.themes.Glass(
252
- font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]
253
- ),
254
  ) as demo:
255
  with gr.Blocks(
256
  elem_id="root",
257
  ):
258
- gr.Markdown("""
 
259
  # Welcome to BOUQuET 💐 , Benchmark and Open-initiative for Universal Quality Evaluation in Translation.
260
- """)
 
261
  intro_tab()
262
  leaderboard_tab()
263
  # data_browse_tab()
 
4
  # This source code is licensed under the license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
 
 
7
  import csv
8
+ import os
9
  from collections import defaultdict
10
 
11
+ import gradio as gr
12
+ import pandas as pd
13
+
14
  from leaderboard import leaderboard_tab
15
+
16
  # from data_samples import data_browse_tab
17
 
18
  DLA = """
 
70
  No failure to exercise and no delay in exercising any right, remedy or power hereunder will operate as a waiver thereof, nor will any single or partial exercise of any right, remedy or power hereunder will operate as a waiver thereof, or the exercise of any other right, remedy or power provided herein or by law or in equity.
71
  """
72
 
73
+
74
  def guidelines_tab():
75
  with gr.Tab("Translation guidelines"):
76
+ gr.Markdown(
77
+ """
78
  # Contributor Guidelines
79
 
80
  # 0\\. Objective
 
199
  It may be the case that some words or expressions of the source language have more than one translation (e.g., in English, both "Bombay" and "Mumbai" refer to the same place). When making your choice, please ensure that:
200
  * The translation is culturally informed; i.e., please refrain from using a negatively connotated or dispreferred translation;
201
  * Recurring items are consistently translated throughout the dataset; i.e., please do not alternate between translation options if not necessary.
202
+ """
203
+ )
204
 
205
 
206
  def dla_tab():
207
  with gr.Tab("Dataset License"):
208
  gr.Markdown(DLA)
209
 
210
+
211
  def intro_tab():
212
  with gr.Tab("Intro"):
213
+ gr.Markdown(
214
+ """
215
  ## Let’s make machine translation available for any written language!
216
 
217
  Please take part in shaping the future - your help will be greatly appreciated.
 
244
  * \\[Omnilingual MT Team et al., 2025\\] Omnilingual MT Team, BOUQuET 💐 : dataset, Benchmark and Open initiative for Universal Quality Evaluation in Translation, ArXiv, 2025
245
 
246
 
247
+ """
248
+ )
249
 
250
 
251
  with gr.Blocks(
 
256
  font-size: 0.8em;
257
  }
258
  """,
259
+ theme=gr.themes.Glass(font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]),
 
 
260
  ) as demo:
261
  with gr.Blocks(
262
  elem_id="root",
263
  ):
264
+ gr.Markdown(
265
+ """
266
  # Welcome to BOUQuET 💐 , Benchmark and Open-initiative for Universal Quality Evaluation in Translation.
267
+ """
268
+ )
269
  intro_tab()
270
  leaderboard_tab()
271
  # data_browse_tab()
leaderboard.py CHANGED
@@ -1,12 +1,12 @@
1
-
2
- import pandas as pd
3
- import gradio as gr
4
  import csv
5
  from collections import defaultdict
6
 
 
 
 
7
 
8
  def strip_colname(x):
9
- if x.startswith('score_'):
10
  return x[6:]
11
  return x
12
 
@@ -46,8 +46,8 @@ Descriptions of the implementation of the systems will come out later.
46
  def leaderboard_tab():
47
  stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
48
  stats.columns = [strip_colname(c) for c in stats.columns]
49
-
50
- metrics = ['metricx_both', 'xcomet_both', 'CHRFpp', 'glotlid_ref']
51
  systems = sorted(set(stats["system"]))
52
  levels = ["sentence_level", "paragraph_level"]
53
  ALL = "ALL"
@@ -65,8 +65,29 @@ def leaderboard_tab():
65
  lang_tgt2src[tgt_lang].add(src_lang)
66
  langs_src.add(src_lang)
67
  langs_tgt.add(tgt_lang)
68
-
69
  langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  with gr.Tab("Leaderboard"):
72
  gr.Markdown("# BOUQuET translation leaderboard")
@@ -75,8 +96,12 @@ def leaderboard_tab():
75
  gr.Markdown("## Systems ranking")
76
  # Inputs
77
  gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
78
- gr_src_lang = gr.Dropdown([ALL] + sorted(langs_src), value=ALL, label="Source lang")
79
- gr_tgt_lang = gr.Dropdown([ALL] + sorted(langs_tgt), value=ALL, label="Target lang")
 
 
 
 
80
 
81
  # Interactivity
82
  inputs = [gr_level, gr_src_lang, gr_tgt_lang]
@@ -87,39 +112,52 @@ def leaderboard_tab():
87
  filtered = filtered[filtered["src_lang"].eq(src_lang)]
88
  if tgt_lang != ALL:
89
  filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
90
- means = filtered.groupby(['system'])[metrics].mean().reset_index().sort_values('metricx_both')
 
 
 
 
 
91
  means.columns = [strip_colname(c) for c in means.columns]
92
  styler = means.style.background_gradient().format(precision=4)
93
  return styler
94
 
95
  df_all = get_lb(*[inp.value for inp in inputs])
96
  gr_df = gr.Dataframe(df_all)
97
-
98
  for inp in inputs:
99
  inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)
100
-
101
  # Interdependency of the controls
102
  def src2tgt(src_lang, tgt_lang):
103
  if src_lang == ALL:
104
- choices = [ALL] + sorted(langs_tgt)
105
  else:
106
- choices = [ALL] + sorted(lang_src2tgt[src_lang])
107
-
108
  return gr.update(choices=choices, value=tgt_lang)
109
-
110
  def tgt2src(src_lang, tgt_lang):
111
  if tgt_lang == ALL:
112
- choices = [ALL] + sorted(langs_src)
113
  else:
114
- choices = [ALL] + sorted(lang_tgt2src[tgt_lang])
115
  return gr.update(choices=choices, value=src_lang)
116
-
117
- gr_src_lang.input(fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang)
118
- gr_tgt_lang.input(fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang)
 
 
 
 
119
 
120
  gr.Markdown("## Languages difficulty")
121
- gr_system = gr.Dropdown([MEAN, BEST] + systems, value=MEAN, label="Translation system")
122
- gr_direction = gr.Dropdown([XX2EN, EN2XX], value=XX2EN, label="Translation direction")
 
 
 
 
123
  gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
124
  gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
125
  bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]
@@ -127,42 +165,58 @@ def leaderboard_tab():
127
  def get_hist(system, direction, metric, level):
128
  # decide on the data to process
129
  if direction == EN2XX:
130
- direction_filter = stats['src_lang'].eq('eng_Latn')
131
  lang_col = "tgt_lang"
132
  else:
133
- direction_filter = stats['tgt_lang'].eq('eng_Latn')
134
  lang_col = "src_lang"
135
  if system in (MEAN, BEST):
136
  system_filter = stats["system"].astype(bool)
137
  else:
138
- system_filter = stats['system'].eq(system)
139
  subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
140
 
141
  # Compute the means and update the plot
142
  grouped = subset.groupby(lang_col)[metric]
 
 
 
 
 
 
143
  if system == BEST:
144
- if metric == "metricx_both":
145
- means = grouped.min()
146
- else:
147
- means = grouped.max()
148
  else:
149
  means = grouped.mean()
150
- means = means.sort_values(
151
- ascending=(metric=="metricx_both")
 
 
 
 
 
 
 
152
  )
153
- means = means.to_frame().reset_index()
 
 
154
  return gr.update(
155
- value=means, x=lang_col, y=metric, x_label_angle=-90,
 
 
 
156
  height=500,
157
- sort="y",
 
158
  )
159
-
160
  default_bar = get_hist(*[x.value for x in bar_controls])
161
  gr_barplot = gr.BarPlot(**default_bar)
162
 
163
  for inp in bar_controls:
164
  inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)
165
-
166
  gr.Markdown(METRICS_EXPLANATION)
167
  gr.Markdown(SYSTEMS_EXPLANATION)
168
  gr.Markdown(LANGS_EXPLANATION)
 
 
 
 
1
  import csv
2
  from collections import defaultdict
3
 
4
+ import gradio as gr
5
+ import pandas as pd
6
+
7
 
8
  def strip_colname(x):
9
+ if x.startswith("score_"):
10
  return x[6:]
11
  return x
12
 
 
46
  def leaderboard_tab():
47
  stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
48
  stats.columns = [strip_colname(c) for c in stats.columns]
49
+
50
+ metrics = ["metricx_both", "xcomet_both", "CHRFpp", "glotlid_ref"]
51
  systems = sorted(set(stats["system"]))
52
  levels = ["sentence_level", "paragraph_level"]
53
  ALL = "ALL"
 
65
  lang_tgt2src[tgt_lang].add(src_lang)
66
  langs_src.add(src_lang)
67
  langs_tgt.add(tgt_lang)
68
+
69
  langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
70
+ lang2name = {}
71
+ for i, row in langs_df.iterrows():
72
+ code = row["ISO 639-3"] + "_" + row["ISO 15924"]
73
+ if isinstance(row["Glottocode"], str) and len(row["Glottocode"]) > 0:
74
+ code = code + "_" + row["Glottocode"]
75
+ lang2name[code] = row["Language"]
76
+
77
+ if isinstance(row["Secondary ISO 639-3"], str) and len(
78
+ row["Secondary ISO 639-3"]
79
+ ):
80
+ code = row["Secondary ISO 639-3"] + code[3:]
81
+ lang2name[code] = row["Language"]
82
+ for lang in langs_src.union(langs_tgt):
83
+ if lang not in lang2name:
84
+ print(f"Name not found for {lang}")
85
+
86
+ def named_langs(langs_list):
87
+ return [
88
+ (f"{lang} — {lang2name[lang]}", lang) if lang in lang2name else lang
89
+ for lang in langs_list
90
+ ]
91
 
92
  with gr.Tab("Leaderboard"):
93
  gr.Markdown("# BOUQuET translation leaderboard")
 
96
  gr.Markdown("## Systems ranking")
97
  # Inputs
98
  gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
99
+ gr_src_lang = gr.Dropdown(
100
+ [ALL] + named_langs(sorted(langs_src)), value=ALL, label="Source lang"
101
+ )
102
+ gr_tgt_lang = gr.Dropdown(
103
+ [ALL] + named_langs(sorted(langs_tgt)), value=ALL, label="Target lang"
104
+ )
105
 
106
  # Interactivity
107
  inputs = [gr_level, gr_src_lang, gr_tgt_lang]
 
112
  filtered = filtered[filtered["src_lang"].eq(src_lang)]
113
  if tgt_lang != ALL:
114
  filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
115
+ means = (
116
+ filtered.groupby(["system"])[metrics]
117
+ .mean()
118
+ .reset_index()
119
+ .sort_values("metricx_both")
120
+ )
121
  means.columns = [strip_colname(c) for c in means.columns]
122
  styler = means.style.background_gradient().format(precision=4)
123
  return styler
124
 
125
  df_all = get_lb(*[inp.value for inp in inputs])
126
  gr_df = gr.Dataframe(df_all)
127
+
128
  for inp in inputs:
129
  inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)
130
+
131
  # Interdependency of the controls
132
  def src2tgt(src_lang, tgt_lang):
133
  if src_lang == ALL:
134
+ choices = [ALL] + named_langs(sorted(langs_tgt))
135
  else:
136
+ choices = [ALL] + named_langs(sorted(lang_src2tgt[src_lang]))
137
+
138
  return gr.update(choices=choices, value=tgt_lang)
139
+
140
  def tgt2src(src_lang, tgt_lang):
141
  if tgt_lang == ALL:
142
+ choices = [ALL] + named_langs(sorted(langs_src))
143
  else:
144
+ choices = [ALL] + named_langs(sorted(lang_tgt2src[tgt_lang]))
145
  return gr.update(choices=choices, value=src_lang)
146
+
147
+ gr_src_lang.input(
148
+ fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang
149
+ )
150
+ gr_tgt_lang.input(
151
+ fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang
152
+ )
153
 
154
  gr.Markdown("## Languages difficulty")
155
+ gr_system = gr.Dropdown(
156
+ [MEAN, BEST] + systems, value=MEAN, label="Translation system"
157
+ )
158
+ gr_direction = gr.Dropdown(
159
+ [XX2EN, EN2XX], value=XX2EN, label="Translation direction"
160
+ )
161
  gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
162
  gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
163
  bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]
 
165
  def get_hist(system, direction, metric, level):
166
  # decide on the data to process
167
  if direction == EN2XX:
168
+ direction_filter = stats["src_lang"].eq("eng_Latn")
169
  lang_col = "tgt_lang"
170
  else:
171
+ direction_filter = stats["tgt_lang"].eq("eng_Latn")
172
  lang_col = "src_lang"
173
  if system in (MEAN, BEST):
174
  system_filter = stats["system"].astype(bool)
175
  else:
176
+ system_filter = stats["system"].eq(system)
177
  subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
178
 
179
  # Compute the means and update the plot
180
  grouped = subset.groupby(lang_col)[metric]
181
+ if metric == "metricx_both":
182
+ bests = grouped.min()
183
+ best_sys = grouped.idxmin()
184
+ else:
185
+ bests = grouped.max()
186
+ best_sys = grouped.idxmax()
187
  if system == BEST:
188
+ means = bests
 
 
 
189
  else:
190
  means = grouped.mean()
191
+ report = (
192
+ pd.DataFrame(
193
+ {
194
+ metric: means,
195
+ "best_system": subset.loc[best_sys]["system"].values,
196
+ }
197
+ )
198
+ .sort_values(metric, ascending=(metric == "metricx_both"))
199
+ .reset_index()
200
  )
201
+ report["lang_name"] = [lang2name.get(lang, "") for lang in report[lang_col]]
202
+ tooltip_columns = ["lang_name", "best_system"]
203
+
204
  return gr.update(
205
+ value=report,
206
+ x=lang_col,
207
+ y=metric,
208
+ x_label_angle=-90,
209
  height=500,
210
+ sort="y",
211
+ tooltip=tooltip_columns,
212
  )
213
+
214
  default_bar = get_hist(*[x.value for x in bar_controls])
215
  gr_barplot = gr.BarPlot(**default_bar)
216
 
217
  for inp in bar_controls:
218
  inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)
219
+
220
  gr.Markdown(METRICS_EXPLANATION)
221
  gr.Markdown(SYSTEMS_EXPLANATION)
222
  gr.Markdown(LANGS_EXPLANATION)