Spaces:
Running
Running
File size: 6,312 Bytes
1bca40f 1d0b778 1bca40f 1d0b778 f7ac4d6 99bd427 1bca40f f7ac4d6 99bd427 f7ac4d6 1bca40f 99bd427 1bca40f 1d0b778 1bca40f f7ac4d6 99bd427 f7ac4d6 1d0b778 99bd427 f7ac4d6 99bd427 f7ac4d6 99bd427 f7ac4d6 99bd427 f7ac4d6 99bd427 1d0b778 f7ac4d6 1d0b778 f7ac4d6 99bd427 f7ac4d6 1bca40f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import pandas as pd
import gradio as gr
import csv
from collections import defaultdict
def strip_colname(x: str, prefix: str = "score_") -> str:
    """Return column name ``x`` without its leading ``prefix``.

    Args:
        x: Column name to normalise.
        prefix: Leading marker to drop. Defaults to ``"score_"``.

    Returns:
        ``x`` with a single leading occurrence of ``prefix`` removed, or
        ``x`` unchanged when it does not start with ``prefix``.
    """
    if x.startswith(prefix):
        # Slice by the prefix length rather than a hard-coded offset so the
        # check and the cut can never drift apart.
        return x[len(prefix):]
    return x
# Markdown blurb rendered right under the leaderboard title.
INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.
A smarter leaderboard and the code for reproducing the evaluations will be published soon!
"""

# Markdown section pointing readers to the language descriptions.
LANGS_EXPLANATION = """## Languages
For the description of languages, please refer to https://huggingface.co/datasets/facebook/bouquet#languages.
"""

# Markdown section describing each metric column shown in the leaderboard.
# TODO: link the exact xCOMET checkpoint used for scoring; the original text
# had an empty Markdown link (`[]()`) here, which rendered as nothing.
METRICS_EXPLANATION = """## Metrics
1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. **Attention: lower is better!**
2. `xcomet_both`: xCOMET score based on both source and reference.
3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
"""

# Markdown placeholder section for the evaluated translation systems.
SYSTEMS_EXPLANATION = """## Systems
Descriptions of the implementation of the systems will come out later.
"""
def leaderboard_tab():
    """Build the "Leaderboard" tab of the Gradio UI.

    Loads the benchmark statistics from ``data/benchmark_stats.tsv`` and lays
    out two interactive views inside a ``gr.Tab``:

    * "Systems ranking": per-system means of every metric, filterable by
      level, source language and target language.
    * "Languages difficulty": a bar plot of one metric per language, for
      translation into or out of English (``eng_Latn``).

    The function creates components as a side effect and returns ``None``.
    NOTE(review): presumably it has to be called inside an active
    ``gr.Blocks`` context -- confirm against the caller.
    """
    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    # Drop the "score_" prefix from any column that carries it, so the
    # metric columns can be addressed by the short names listed below.
    stats.columns = [strip_colname(c) for c in stats.columns]

    metrics = ['metricx_both', 'xcomet_both', 'CHRFpp', 'glotlid_ref']
    systems = sorted(set(stats["system"]))
    levels = ["sentence_level", "paragraph_level"]

    # Sentinel dropdown entries that do not correspond to real data values.
    ALL = "ALL"
    MEAN = "Average"
    BEST = "Best"
    XX2EN = "Everything-into-English"
    EN2XX = "English-into-Everything"

    # Index the translation directions present in the data, in both
    # orientations, so each language dropdown can be narrowed by the other.
    lang_src2tgt = defaultdict(set)
    lang_tgt2src = defaultdict(set)
    langs_src = set()
    langs_tgt = set()
    for src_lang, tgt_lang in stats[["src_lang", "tgt_lang"]].drop_duplicates().values:
        lang_src2tgt[src_lang].add(tgt_lang)
        lang_tgt2src[tgt_lang].add(src_lang)
        langs_src.add(src_lang)
        langs_tgt.add(tgt_lang)

    with gr.Tab("Leaderboard"):
        gr.Markdown("# BOUQuET translation leaderboard")
        gr.Markdown(INTRO)

        gr.Markdown("## Systems ranking")

        # Inputs
        gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
        gr_src_lang = gr.Dropdown([ALL] + sorted(langs_src), value=ALL, label="Source lang")
        gr_tgt_lang = gr.Dropdown([ALL] + sorted(langs_tgt), value=ALL, label="Target lang")

        # Interactivity
        inputs = [gr_level, gr_src_lang, gr_tgt_lang]

        def get_lb(level, src_lang, tgt_lang):
            """Return a styled table of per-system metric means for the chosen filters."""
            filtered = stats[stats["level"].eq(level)]
            if src_lang != ALL:
                filtered = filtered[filtered["src_lang"].eq(src_lang)]
            if tgt_lang != ALL:
                filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
            # Ascending sort on metricx_both puts the best system first,
            # because for this metric lower is better.
            means = (
                filtered.groupby(['system'])[metrics]
                .mean()
                .reset_index()
                .sort_values('metricx_both')
            )
            return means.style.background_gradient().format(precision=4)

        # Render the table once with the dropdown defaults, then refresh it
        # whenever any of the three filters changes.
        df_all = get_lb(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(df_all)
        for inp in inputs:
            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)

        # Interdependency of the controls
        def src2tgt(src_lang, tgt_lang):
            """Limit the target choices to languages paired with ``src_lang`` in the data."""
            if src_lang == ALL:
                choices = [ALL] + sorted(langs_tgt)
            else:
                choices = [ALL] + sorted(lang_src2tgt[src_lang])
            # Keep the current target selection; only the choice list changes.
            return gr.update(choices=choices, value=tgt_lang)

        def tgt2src(src_lang, tgt_lang):
            """Limit the source choices to languages paired with ``tgt_lang`` in the data."""
            if tgt_lang == ALL:
                choices = [ALL] + sorted(langs_src)
            else:
                choices = [ALL] + sorted(lang_tgt2src[tgt_lang])
            # Keep the current source selection; only the choice list changes.
            return gr.update(choices=choices, value=src_lang)

        gr_src_lang.input(fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang)
        gr_tgt_lang.input(fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang)

        gr.Markdown("## Languages difficulty")

        gr_system = gr.Dropdown([MEAN, BEST] + systems, value=MEAN, label="Translation system")
        gr_direction = gr.Dropdown([XX2EN, EN2XX], value=XX2EN, label="Translation direction")
        gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
        gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
        bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]

        def hist_kwargs(system, direction, metric, level):
            """Return plain ``gr.BarPlot`` keyword arguments for the selected view."""
            # Decide on the data to process: fix English on one side and
            # group by the language on the other side.
            if direction == EN2XX:
                row_mask = stats['src_lang'].eq('eng_Latn')
                lang_col = "tgt_lang"
            else:
                row_mask = stats['tgt_lang'].eq('eng_Latn')
                lang_col = "src_lang"
            row_mask = row_mask & stats["level"].eq(level)
            # MEAN and BEST aggregate over every system, so only a concrete
            # system name restricts the rows.
            if system not in (MEAN, BEST):
                row_mask = row_mask & stats['system'].eq(system)

            # Compute the per-language aggregate.
            grouped = stats[row_mask].groupby(lang_col)[metric]
            lower_is_better = metric == "metricx_both"
            if system == BEST:
                means = grouped.min() if lower_is_better else grouped.max()
            else:
                means = grouped.mean()
            # Order the languages from best to worst score.
            means = means.sort_values(ascending=lower_is_better)
            return dict(
                value=means.to_frame().reset_index(),
                x=lang_col,
                y=metric,
                x_label_angle=-90,
                height=500,
                sort="y",
            )

        def get_hist(system, direction, metric, level):
            """Event handler: return a bar plot update for the selected view."""
            return gr.update(**hist_kwargs(system, direction, metric, level))

        # Construct the plot from plain keyword arguments; a gr.update()
        # payload is meant for event outputs, not for component constructors.
        gr_barplot = gr.BarPlot(**hist_kwargs(*[x.value for x in bar_controls]))
        for inp in bar_controls:
            inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)

        gr.Markdown(LANGS_EXPLANATION)
        gr.Markdown(METRICS_EXPLANATION)
        gr.Markdown(SYSTEMS_EXPLANATION)
|