Spaces:

facebook
/

bouquet

Running

App Files Files Community

David Dale committed 26 days ago

Commit

baeda9f

1 Parent(s): b9fc216

add language names and best systems in the leaderboard; code linting

Browse files

Files changed (2) hide show

app.py +20 -12
leaderboard.py +91 -37

app.py CHANGED Viewed

@@ -4,13 +4,15 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-import gradio as gr
-import os
 import csv
-import pandas as pd
 from collections import defaultdict
 from leaderboard import leaderboard_tab
 # from data_samples import data_browse_tab
 DLA = """
@@ -68,9 +70,11 @@ Meta may make changes to this Agreement at any time with notice to You and the o
 No failure to exercise and no delay in exercising any right, remedy or power hereunder will operate as a waiver thereof, nor will any single or partial exercise of any right, remedy or power hereunder will operate as a waiver thereof, or the exercise of any other right, remedy or power provided herein or by law or in equity.
 """
 def guidelines_tab():
     with gr.Tab("Translation guidelines"):
-        gr.Markdown("""
 # Contributor Guidelines
 # 0\\. Objective
@@ -195,16 +199,19 @@ Considering that some languages frequently resort to code-mixing, it is allowed
 It may be the case that some words or expressions of the source language have more than one translation (e.g., in English, both "Bombay" and "Mumbai" refer to the same place). When making your choice, please ensure that:
 * The translation is culturally informed; i.e., please refrain from using a negatively connotated or dispreferred translation;
 * Recurring items are consistently translated throughout the dataset; i.e., please do not alternate between translation options if not necessary.
-""")
 def dla_tab():
     with gr.Tab("Dataset License"):
         gr.Markdown(DLA)
 def intro_tab():
     with gr.Tab("Intro"):
-        gr.Markdown("""
 ## Let’s make machine translation available for any written language!
 Please take part in shaping the future - your help will be greatly appreciated.
@@ -237,7 +244,8 @@ If you want to contribute dataset translations for a new language or validate ex
 * \\[Omnilingual MT Team et al., 2025\\]  Omnilingual MT Team, BOUQuET 💐 : dataset, Benchmark and Open initiative for Universal Quality Evaluation in Translation, ArXiv, 2025
-        """)
 with gr.Blocks(
@@ -248,16 +256,16 @@ with gr.Blocks(
         font-size: 0.8em;
     }
     """,
-    theme=gr.themes.Glass(
-        font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]
-    ),
 ) as demo:
     with gr.Blocks(
         elem_id="root",
     ):
-        gr.Markdown("""
     # Welcome to BOUQuET 💐 , Benchmark and Open-initiative for Universal Quality Evaluation in Translation.
-    """)
         intro_tab()
         leaderboard_tab()
         # data_browse_tab()

 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import csv
+import os
 from collections import defaultdict
+import gradio as gr
+import pandas as pd
 from leaderboard import leaderboard_tab
 # from data_samples import data_browse_tab
 DLA = """
 No failure to exercise and no delay in exercising any right, remedy or power hereunder will operate as a waiver thereof, nor will any single or partial exercise of any right, remedy or power hereunder will operate as a waiver thereof, or the exercise of any other right, remedy or power provided herein or by law or in equity.
 """
 def guidelines_tab():
     with gr.Tab("Translation guidelines"):
+        gr.Markdown(
+            """
 # Contributor Guidelines
 # 0\\. Objective
 It may be the case that some words or expressions of the source language have more than one translation (e.g., in English, both "Bombay" and "Mumbai" refer to the same place). When making your choice, please ensure that:
 * The translation is culturally informed; i.e., please refrain from using a negatively connotated or dispreferred translation;
 * Recurring items are consistently translated throughout the dataset; i.e., please do not alternate between translation options if not necessary.
+"""
+        )
 def dla_tab():
     with gr.Tab("Dataset License"):
         gr.Markdown(DLA)
 def intro_tab():
     with gr.Tab("Intro"):
+        gr.Markdown(
+            """
 ## Let’s make machine translation available for any written language!
 Please take part in shaping the future - your help will be greatly appreciated.
 * \\[Omnilingual MT Team et al., 2025\\]  Omnilingual MT Team, BOUQuET 💐 : dataset, Benchmark and Open initiative for Universal Quality Evaluation in Translation, ArXiv, 2025
+        """
+        )
 with gr.Blocks(
         font-size: 0.8em;
     }
     """,
+    theme=gr.themes.Glass(font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]),
 ) as demo:
     with gr.Blocks(
         elem_id="root",
     ):
+        gr.Markdown(
+            """
     # Welcome to BOUQuET 💐 , Benchmark and Open-initiative for Universal Quality Evaluation in Translation.
+    """
+        )
         intro_tab()
         leaderboard_tab()
         # data_browse_tab()

leaderboard.py CHANGED Viewed

@@ -1,12 +1,12 @@
-import pandas as pd
-import gradio as gr
 import csv
 from collections import defaultdict
 def strip_colname(x):
-    if x.startswith('score_'):
         return x[6:]
     return x
@@ -46,8 +46,8 @@ Descriptions of the implementation of the systems will come out later.
 def leaderboard_tab():
     stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
     stats.columns = [strip_colname(c) for c in stats.columns]
-    metrics = ['metricx_both', 'xcomet_both', 'CHRFpp', 'glotlid_ref']
     systems = sorted(set(stats["system"]))
     levels = ["sentence_level", "paragraph_level"]
     ALL = "ALL"
@@ -65,8 +65,29 @@ def leaderboard_tab():
         lang_tgt2src[tgt_lang].add(src_lang)
         langs_src.add(src_lang)
         langs_tgt.add(tgt_lang)
     langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
     with gr.Tab("Leaderboard"):
         gr.Markdown("# BOUQuET translation leaderboard")
@@ -75,8 +96,12 @@ def leaderboard_tab():
         gr.Markdown("## Systems ranking")
         # Inputs
         gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
-        gr_src_lang = gr.Dropdown([ALL] + sorted(langs_src), value=ALL, label="Source lang")
-        gr_tgt_lang = gr.Dropdown([ALL] + sorted(langs_tgt), value=ALL, label="Target lang")
         # Interactivity
         inputs = [gr_level, gr_src_lang, gr_tgt_lang]
@@ -87,39 +112,52 @@ def leaderboard_tab():
                 filtered = filtered[filtered["src_lang"].eq(src_lang)]
             if tgt_lang != ALL:
                 filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
-            means = filtered.groupby(['system'])[metrics].mean().reset_index().sort_values('metricx_both')
             means.columns = [strip_colname(c) for c in means.columns]
             styler = means.style.background_gradient().format(precision=4)
             return styler
         df_all = get_lb(*[inp.value for inp in inputs])
         gr_df = gr.Dataframe(df_all)
         for inp in inputs:
             inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)
         # Interdependency of the controls
         def src2tgt(src_lang, tgt_lang):
             if src_lang == ALL:
-                choices = [ALL] + sorted(langs_tgt)
             else:
-                choices = [ALL] + sorted(lang_src2tgt[src_lang])
             return gr.update(choices=choices, value=tgt_lang)
         def tgt2src(src_lang, tgt_lang):
             if tgt_lang == ALL:
-                choices = [ALL] + sorted(langs_src)
             else:
-                choices = [ALL] + sorted(lang_tgt2src[tgt_lang])
             return gr.update(choices=choices, value=src_lang)
-        gr_src_lang.input(fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang)
-        gr_tgt_lang.input(fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang)
         gr.Markdown("## Languages difficulty")
-        gr_system = gr.Dropdown([MEAN, BEST] + systems, value=MEAN, label="Translation system")
-        gr_direction = gr.Dropdown([XX2EN, EN2XX], value=XX2EN, label="Translation direction")
         gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
         gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
         bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]
@@ -127,42 +165,58 @@ def leaderboard_tab():
         def get_hist(system, direction, metric, level):
             # decide on the data to process
             if direction == EN2XX:
-                direction_filter = stats['src_lang'].eq('eng_Latn')
                 lang_col = "tgt_lang"
             else:
-                direction_filter = stats['tgt_lang'].eq('eng_Latn')
                 lang_col = "src_lang"
             if system in (MEAN, BEST):
                 system_filter = stats["system"].astype(bool)
             else:
-                system_filter = stats['system'].eq(system)
             subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
             # Compute the means and update the plot
             grouped = subset.groupby(lang_col)[metric]
             if system == BEST:
-                if metric == "metricx_both":
-                    means = grouped.min()
-                else:
-                    means = grouped.max()
             else:
                 means = grouped.mean()
-            means = means.sort_values(
-                ascending=(metric=="metricx_both")
             )
-            means = means.to_frame().reset_index()
             return gr.update(
-                value=means, x=lang_col, y=metric, x_label_angle=-90,
                 height=500,
-                sort="y",
             )
         default_bar = get_hist(*[x.value for x in bar_controls])
         gr_barplot = gr.BarPlot(**default_bar)
         for inp in bar_controls:
             inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)
         gr.Markdown(METRICS_EXPLANATION)
         gr.Markdown(SYSTEMS_EXPLANATION)
         gr.Markdown(LANGS_EXPLANATION)

 import csv
 from collections import defaultdict
+import gradio as gr
+import pandas as pd
 def strip_colname(x):
+    if x.startswith("score_"):
         return x[6:]
     return x
 def leaderboard_tab():
     stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
     stats.columns = [strip_colname(c) for c in stats.columns]
+    metrics = ["metricx_both", "xcomet_both", "CHRFpp", "glotlid_ref"]
     systems = sorted(set(stats["system"]))
     levels = ["sentence_level", "paragraph_level"]
     ALL = "ALL"
         lang_tgt2src[tgt_lang].add(src_lang)
         langs_src.add(src_lang)
         langs_tgt.add(tgt_lang)
     langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
+    lang2name = {}
+    for i, row in langs_df.iterrows():
+        code = row["ISO 639-3"] + "_" + row["ISO 15924"]
+        if isinstance(row["Glottocode"], str) and len(row["Glottocode"]) > 0:
+            code = code + "_" + row["Glottocode"]
+        lang2name[code] = row["Language"]
+        if isinstance(row["Secondary ISO 639-3"], str) and len(
+            row["Secondary ISO 639-3"]
+        ):
+            code = row["Secondary ISO 639-3"] + code[3:]
+            lang2name[code] = row["Language"]
+    for lang in langs_src.union(langs_tgt):
+        if lang not in lang2name:
+            print(f"Name not found for {lang}")
+    def named_langs(langs_list):
+        return [
+            (f"{lang} — {lang2name[lang]}", lang) if lang in lang2name else lang
+            for lang in langs_list
+        ]
     with gr.Tab("Leaderboard"):
         gr.Markdown("# BOUQuET translation leaderboard")
         gr.Markdown("## Systems ranking")
         # Inputs
         gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
+        gr_src_lang = gr.Dropdown(
+            [ALL] + named_langs(sorted(langs_src)), value=ALL, label="Source lang"
+        )
+        gr_tgt_lang = gr.Dropdown(
+            [ALL] + named_langs(sorted(langs_tgt)), value=ALL, label="Target lang"
+        )
         # Interactivity
         inputs = [gr_level, gr_src_lang, gr_tgt_lang]
                 filtered = filtered[filtered["src_lang"].eq(src_lang)]
             if tgt_lang != ALL:
                 filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
+            means = (
+                filtered.groupby(["system"])[metrics]
+                .mean()
+                .reset_index()
+                .sort_values("metricx_both")
+            )
             means.columns = [strip_colname(c) for c in means.columns]
             styler = means.style.background_gradient().format(precision=4)
             return styler
         df_all = get_lb(*[inp.value for inp in inputs])
         gr_df = gr.Dataframe(df_all)
         for inp in inputs:
             inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)
         # Interdependency of the controls
         def src2tgt(src_lang, tgt_lang):
             if src_lang == ALL:
+                choices = [ALL] + named_langs(sorted(langs_tgt))
             else:
+                choices = [ALL] + named_langs(sorted(lang_src2tgt[src_lang]))
             return gr.update(choices=choices, value=tgt_lang)
         def tgt2src(src_lang, tgt_lang):
             if tgt_lang == ALL:
+                choices = [ALL] + named_langs(sorted(langs_src))
             else:
+                choices = [ALL] + named_langs(sorted(lang_tgt2src[tgt_lang]))
             return gr.update(choices=choices, value=src_lang)
+        gr_src_lang.input(
+            fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang
+        )
+        gr_tgt_lang.input(
+            fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang
+        )
         gr.Markdown("## Languages difficulty")
+        gr_system = gr.Dropdown(
+            [MEAN, BEST] + systems, value=MEAN, label="Translation system"
+        )
+        gr_direction = gr.Dropdown(
+            [XX2EN, EN2XX], value=XX2EN, label="Translation direction"
+        )
         gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
         gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
         bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]
         def get_hist(system, direction, metric, level):
             # decide on the data to process
             if direction == EN2XX:
+                direction_filter = stats["src_lang"].eq("eng_Latn")
                 lang_col = "tgt_lang"
             else:
+                direction_filter = stats["tgt_lang"].eq("eng_Latn")
                 lang_col = "src_lang"
             if system in (MEAN, BEST):
                 system_filter = stats["system"].astype(bool)
             else:
+                system_filter = stats["system"].eq(system)
             subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
             # Compute the means and update the plot
             grouped = subset.groupby(lang_col)[metric]
+            if metric == "metricx_both":
+                bests = grouped.min()
+                best_sys = grouped.idxmin()
+            else:
+                bests = grouped.max()
+                best_sys = grouped.idxmax()
             if system == BEST:
+                means = bests
             else:
                 means = grouped.mean()
+            report = (
+                pd.DataFrame(
+                    {
+                        metric: means,
+                        "best_system": subset.loc[best_sys]["system"].values,
+                    }
+                )
+                .sort_values(metric, ascending=(metric == "metricx_both"))
+                .reset_index()
             )
+            report["lang_name"] = [lang2name.get(lang, "") for lang in report[lang_col]]
+            tooltip_columns = ["lang_name", "best_system"]
             return gr.update(
+                value=report,
+                x=lang_col,
+                y=metric,
+                x_label_angle=-90,
                 height=500,
+                sort="y",
+                tooltip=tooltip_columns,
             )
         default_bar = get_hist(*[x.value for x in bar_controls])
         gr_barplot = gr.BarPlot(**default_bar)
         for inp in bar_controls:
             inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)
         gr.Markdown(METRICS_EXPLANATION)
         gr.Markdown(SYSTEMS_EXPLANATION)
         gr.Markdown(LANGS_EXPLANATION)