"""
Gradio dashboard to explore Lighteval tasks.

Loads tasks from the lighteval Registry and displays them in a searchable,
filterable interface.
"""

import html
import re
from collections import Counter
from dataclasses import dataclass, field

import gradio as gr

from lighteval.tasks.registry import Registry


# NOTE(review): the two base URLs below are assumptions (the link targets are
# not recoverable from this file) -- confirm the repository/branch and the
# dataset host before deploying.
_GITHUB_BLOB_BASE_URL = "https://github.com/huggingface/lighteval/blob/main/"
_HF_DATASETS_BASE_URL = "https://huggingface.co/datasets/"

# How many dataset links / task-name chips a card shows before collapsing.
_MAX_VISIBLE_DATASETS = 6
_MAX_VISIBLE_TASK_NAMES = 3

# Loaded once at import time; every request is served from this snapshot.
registry = Registry(custom_tasks=None, load_multilingual=True)
modules_data = registry.get_tasks_dump()


@dataclass
class TaskDoc:
    """Metadata for one task module, rendered as a single card."""

    module: str  # dotted module path as reported by the registry dump
    abstract: str
    languages: list[str]  # lower-cased by index_tasks
    tags: list[str]  # lower-cased by index_tasks
    paper: str | None
    dataset: str | None  # comma-separated dataset identifiers
    name: str | None = None
    task_names: list[str] = field(default_factory=list)
    starred: bool = False


def _module_to_github_path(module: str) -> str:
    """Convert module path to GitHub source URL path.

    A leading ``lighteval.`` prefix is dropped, the remaining dots become
    slashes, and the result is rooted at ``src/lighteval/``.
    """
    relative = module.removeprefix("lighteval.").replace(".", "/")
    return f"src/lighteval/{relative}.py"


def _as_text(value: object) -> str:
    """Return *value* as a stripped string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def _as_list(value: object) -> list[str]:
    """Coerce a docstring field to a list of non-empty strings.

    The fields are expected to be arrays, but a bare string must not be
    iterated character by character, so it is split on commas instead.
    ``None`` yields an empty list and any other scalar becomes one item.
    """
    if value is None:
        return []
    if isinstance(value, str):
        items = value.split(",")
    elif isinstance(value, (list, tuple, set, frozenset)):
        items = value
    else:
        items = [value]
    return [text for item in items if (text := _as_text(item))]


def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Load tasks from registry and build index.

    Returns:
        A tuple ``(docs, languages, tags)``: one ``TaskDoc`` per entry of
        ``modules_data``, the languages ordered by descending usage count
        (ties broken alphabetically), and the tags in alphabetical order.
    """
    docs: list[TaskDoc] = []
    language_counts: Counter = Counter()
    tag_set: set[str] = set()

    for entry in modules_data:
        # "or {}" also covers an entry whose docstring is present but None.
        docstring = entry.get("docstring") or {}

        languages = [lang.lower() for lang in _as_list(docstring.get("languages"))]
        tags = [tag.lower() for tag in _as_list(docstring.get("tags"))]
        datasets = _as_list(docstring.get("dataset"))
        task_names = [
            task_name
            for task in entry.get("tasks") or []
            if (task_name := task.get("name"))
        ]

        language_counts.update(languages)
        tag_set.update(tags)

        docs.append(
            TaskDoc(
                module=entry.get("module") or "",
                abstract=_as_text(docstring.get("abstract")),
                languages=languages,
                tags=tags,
                paper=_as_text(docstring.get("paper")) or None,
                dataset=", ".join(datasets) or None,
                name=_as_text(docstring.get("name")) or None,
                task_names=task_names,
                starred=bool(docstring.get("starred", False)),
            )
        )

    languages_sorted = [
        lang
        for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
    ]
    return docs, languages_sorted, sorted(tag_set)


ALL_TASKS, ALL_LANGS, ALL_TAGS = index_tasks()
TOP_LANGS = ALL_LANGS[:8]


def normalize_name_for_matching(name: str) -> str:
    """Normalize name for comparison: lowercase, remove underscores/spaces/colons."""
    return re.sub(r"[_\s:]+", "", name.lower())


def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Filter tasks by languages, tags, and search query.

    A task is kept when it has at least one of the selected languages (if
    any are selected), at least one of the selected tags (if any), and the
    lower-cased search string occurs in its module path, name, abstract,
    tags, dataset or task names. Starred tasks are listed first, then the
    rest, each group ordered by display name (or module path) ignoring case.
    """
    selected_langs = {lang.lower() for lang in languages or []}
    selected_tags = {tag.lower() for tag in tags or []}
    needle = (search or "").strip().lower()

    matches: list[TaskDoc] = []
    for td in ALL_TASKS:
        if selected_langs and selected_langs.isdisjoint(td.languages):
            continue
        if selected_tags and selected_tags.isdisjoint(td.tags):
            continue
        if needle:
            # Name and task names are part of the haystack because they are
            # what the cards display; searching visible text must match.
            haystack = " ".join(
                [
                    td.module,
                    td.name or "",
                    td.abstract,
                    ", ".join(td.tags),
                    td.dataset or "",
                    " ".join(td.task_names),
                ]
            ).lower()
            if needle not in haystack:
                continue
        matches.append(td)

    matches.sort(key=lambda td: (not td.starred, (td.name or td.module).lower()))
    return matches


def truncate_text(text: str, max_length: int = 250) -> str:
    """Truncate text to max_length, breaking at word boundary if possible.

    The cut moves back to the last space only when that space lies beyond
    70% of ``max_length``; ``"..."`` is appended to any truncated text.
    """
    if len(text) <= max_length:
        return text
    truncated = text[:max_length]
    last_space = truncated.rfind(" ")
    if last_space > max_length * 0.7:
        truncated = truncated[:last_space]
    return truncated + "..."


def group_task_names_by_prefix(task_names: list[str]) -> list[str]:
    """Group task names by prefix (part before the first colon).

    If several tasks share a prefix, the prefix is shown once, at the
    position of its first occurrence. A prefix used by a single task keeps
    the full task name, and names without a colon are shown as-is.
    """
    prefix_counts = Counter(
        name.split(":", 1)[0] for name in task_names if ":" in name
    )

    grouped: list[str] = []
    emitted_prefixes: set[str] = set()
    for name in task_names:
        if ":" not in name:
            grouped.append(name)
            continue
        prefix = name.split(":", 1)[0]
        if prefix in emitted_prefixes:
            continue
        emitted_prefixes.add(prefix)
        grouped.append(prefix if prefix_counts[prefix] > 1 else name)
    return grouped


def _display_name(task: TaskDoc) -> str:
    """Return the card title: the task name, else one derived from the module."""
    if task.name:
        raw = task.name
    else:
        parts = task.module.split(".")
        # A module called "main" is named after its parent package instead.
        raw = parts[-2] if parts[-1] == "main" and len(parts) >= 2 else parts[-1]
    return raw.replace("_", " ").title()


def _chips_html(values: list[str], wrapper_class: str, chip_class: str) -> str:
    """Render *values* as a row of chips, or ``""`` when there are none."""
    if not values:
        return ""
    chips = " ".join(
        f'<span class="{chip_class}">{html.escape(value)}</span>' for value in values
    )
    return f'<div class="chips {wrapper_class}">{chips}</div>'


def _dataset_links_html(dataset: str | None) -> str:
    """Render up to ``_MAX_VISIBLE_DATASETS`` dataset links plus a "+N more" badge."""
    if not dataset:
        return ""
    names = [part.strip() for part in dataset.split(",") if part.strip()]
    pieces = []
    for name in names[:_MAX_VISIBLE_DATASETS]:
        safe_name = html.escape(name)
        pieces.append(
            f'<a class="dataset" href="{_HF_DATASETS_BASE_URL}{safe_name}" '
            f'target="_blank" rel="noopener">{safe_name}</a>'
        )
    hidden_count = len(names) - _MAX_VISIBLE_DATASETS
    if hidden_count > 0:
        pieces.append(f'<span class="dataset-more">+{hidden_count} more</span>')
    return " ".join(pieces)


def _task_names_html(task_names: list[str]) -> str:
    """Render the "Run using lighteval" section; extra names go in a <details>."""
    if not task_names:
        return ""
    chips = [
        f'<span class="task-name">{html.escape(name)}</span>'
        for name in group_task_names_by_prefix(task_names)
    ]
    visible_html = " ".join(chips[:_MAX_VISIBLE_TASK_NAMES])
    hidden = chips[_MAX_VISIBLE_TASK_NAMES:]

    details_html = ""
    if hidden:
        hidden_html = " ".join(hidden)
        details_html = (
            '<details class="task-names-details">'
            f'<summary class="task-names-summary">Show {len(hidden)} more</summary>'
            f'<div class="task-names-list task-names-remaining">{hidden_html}</div>'
            "</details>"
        )
    return (
        '<div class="task-names">'
        '<div class="task-names-label">Run using lighteval:</div>'
        f'<div class="task-names-list">{visible_html}</div>'
        f"{details_html}"
        "</div>"
    )


def _links_html(task: TaskDoc) -> str:
    """Render the source link and, when a paper is set, the paper link."""
    source_url = _GITHUB_BLOB_BASE_URL + _module_to_github_path(task.module)
    links = [
        f'<a href="{html.escape(source_url)}" target="_blank" rel="noopener">source</a>'
    ]
    if task.paper:
        links.append(
            f'<a href="{html.escape(task.paper)}" target="_blank" rel="noopener">paper</a>'
        )
    return '<span class="sep">|</span>'.join(links)


def _render_card(task: TaskDoc) -> str:
    """Render one task as an ``<article class="card">`` element."""
    title = html.escape(_display_name(task))
    star_icon = "⭐ " if task.starred else ""
    tags_html = _chips_html(task.tags, "chips-tags", "chip")
    langs_html = _chips_html(task.languages, "chips-langs", "chip chip-lang")
    # Truncate before escaping so the cut can never split an HTML entity.
    abstract_html = html.escape(truncate_text(task.abstract or "-")).replace(
        "\n", "<br/>"
    )
    return f"""
<article class="card" tabindex="0">
  <div class="title"><span class="title-text">{star_icon}{title}</span> <span class="dataset-inline">{_dataset_links_html(task.dataset)}</span></div>
  {tags_html} {langs_html}
  <div class="abstract">{abstract_html}</div>
  {_task_names_html(task.task_names)}
  <div class="links">{_links_html(task)}</div>
</article>
"""


def render_cards(tasks: list[TaskDoc]) -> str:
    """Render task cards as HTML.

    All registry-provided text is HTML-escaped, and the class names match
    the selectors defined in ``custom_css``.
    """
    cards_html = "\n".join(_render_card(task) for task in tasks)
    return f'<div class="cards-grid">\n{cards_html}\n</div>'


def _filter_and_render(
    languages: list[str], tags: list[str], search: str
) -> tuple[str, str]:
    """Apply the filters and return ``(counter markdown, cards HTML)``."""
    tasks = filter_tasks(languages, tags, search)
    count = len(tasks)
    total = len(ALL_TASKS)
    if count != total:
        counter_text = f"**Showing {count} of {total} tasks**"
    else:
        counter_text = f"**{total} tasks**"
    return counter_text, render_cards(tasks)


def on_filter(languages: list[str], tags: list[str], search: str):
    """Recompute the counter and the cards after any filter input changes."""
    return _filter_and_render(languages, tags, search)


def on_toggle_language_choices(
    show_all: bool, selected_langs: list[str], tags: list[str], search: str
):
    """Switch the language choices between the top languages and all of them.

    Selected languages that are not among the new choices are dropped, and
    the results are refreshed with the remaining selection.
    """
    choices = ALL_LANGS if show_all else TOP_LANGS
    kept = [lang for lang in (selected_langs or []) if lang in choices]
    counter_text, cards_html = _filter_and_render(kept, tags, search)
    return gr.update(choices=choices, value=kept), counter_text, cards_html


def on_toggle_tags_visibility(
    show: bool, selected_tags: list[str], languages: list[str], search: str
):
    """Toggle tag filter visibility while preserving selections."""
    tags_value: list[str] = selected_tags or []
    counter_text, cards_html = _filter_and_render(languages, tags_value, search)
    return gr.update(visible=show, value=tags_value), counter_text, cards_html


# Custom CSS for the app
custom_css = """
/* layout */
.cards-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
  gap: 20px;
  margin-top: 10px;
}

/* card base */
.card {
  border-radius: 16px;
  padding: 18px;
  transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
  outline: none;
  position: relative;
  overflow: hidden;
  border: 2px solid transparent;
}
.card::before {
  content: '';
  position: absolute;
  top: 0;
  left: -100%;
  width: 100%;
  height: 100%;
  background: linear-gradient(
    90deg,
    transparent,
    rgba(255, 255, 255, 0.1),
    transparent
  );
  transition: left 0.5s;
}
.card:hover::before {
  left: 100%;
}
.card:hover, .card:focus {
  transform: translateY(-6px) scale(1.02);
  box-shadow: 0 20px 40px rgba(0, 0, 0, 0.12), 0 8px 16px rgba(0, 0, 0, 0.08);
}
.title {
  display: flex;
  align-items: center;
  gap: 8px;
  flex-wrap: wrap;
  position: relative;
  z-index: 1;
}
.title-text {
  font-weight: 700;
  font-size: 17px;
  font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial;
  letter-spacing: -0.01em;
}
.dataset-inline {
  font-size: 12px;
  display: flex;
  flex-wrap: wrap;
  gap: 6px;
  align-items: center;
  margin-left: 8px;
}
.chips {
  margin: 8px 0 6px 0;
  display: flex;
  gap: 4px;
  flex-wrap: wrap;
}
.chips-tags {
  margin: 8px 0 4px 0;
}
.chips-langs {
  margin: 4px 0 6px 0;
}
.chip {
  display: inline-block;
  padding: 4px 10px;
  border-radius: 12px;
  font-size: 11px;
  font-weight: 500;
  background: linear-gradient(135deg, #e6f2ff 0%, #d6e9ff 100%);
  color: #1e3a8a;
  transition: all 0.2s ease;
  border: 1px solid rgba(30, 58, 138, 0.1);
}
.chip:hover {
  transform: translateY(-1px);
  box-shadow: 0 2px 8px rgba(30, 58, 138, 0.2);
}
.chip-lang {
  background: linear-gradient(135deg, #e8f5e9 0%, #d4edda 100%);
  color: #166534;
  border-color: rgba(22, 101, 52, 0.1);
}
.chip-lang:hover {
  box-shadow: 0 2px 8px rgba(22, 101, 52, 0.2);
}
.abstract {
  color: #475569;
  font-size: 13.5px;
  line-height: 1.6;
  margin-top: 8px;
  min-height: 48px;
}
.task-names {
  margin-top: 10px;
  padding-top: 8px;
  border-top: 1px solid rgba(148, 163, 184, 0.15);
}
.task-names-label {
  font-size: 11px;
  font-weight: 600;
  color: #64748b;
  margin-bottom: 6px;
  text-transform: uppercase;
  letter-spacing: 0.5px;
}
.task-names-list {
  display: flex;
  flex-wrap: wrap;
  gap: 6px;
}
.task-names-remaining {
  margin-top: 8px;
  padding-top: 8px;
  border-top: 1px solid rgba(148, 163, 184, 0.15);
}
.task-names-details {
  margin-top: 8px;
}
.task-names-summary {
  font-size: 11px;
  font-weight: 600;
  color: #64748b;
  cursor: pointer;
  user-select: none;
  padding: 4px 8px;
  border-radius: 4px;
  display: inline-block;
  transition: all 0.2s ease;
  background: rgba(148, 163, 184, 0.1);
}
.task-names-summary:hover {
  background: rgba(148, 163, 184, 0.2);
  color: #475569;
}
.task-names-summary::-webkit-details-marker {
  display: none;
}
.task-names-details[open] .task-names-summary {
  margin-bottom: 8px;
}
.task-name {
  display: inline-block;
  padding: 3px 8px;
  border-radius: 6px;
  font-size: 11px;
  font-weight: 500;
  background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
  color: #92400e;
  border: 1px solid rgba(146, 64, 14, 0.2);
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  transition: all 0.2s ease;
}
.task-name:hover {
  transform: translateY(-1px);
  box-shadow: 0 2px 6px rgba(146, 64, 14, 0.2);
  background: linear-gradient(135deg, #fde68a 0%, #fcd34d 100%);
}
.links {
  margin-top: 12px;
  font-size: 12px;
  padding-top: 8px;
  border-top: 1px solid rgba(148, 163, 184, 0.2);
}
.links a {
  text-decoration: none;
  font-weight: 600;
  transition: all 0.2s ease;
  position: relative;
}
.links a::after {
  content: '';
  position: absolute;
  width: 0;
  height: 2px;
  bottom: -2px;
  left: 0;
  background: currentColor;
  transition: width 0.3s ease;
}
.links a:hover::after {
  width: 100%;
}
.links a:hover {
  transform: translateX(2px);
}
.sep {
  color: #94a3b8;
  margin: 0 8px;
}
.dataset {
  display: inline-block;
  font-size: 12px;
  color: #0ea5e9;
  background: linear-gradient(135deg, #ecfeff 0%, #e0f7fa 100%);
  padding: 4px 10px;
  border-radius: 8px;
  text-decoration: none;
  transition: all 0.2s ease;
  border: 1px solid rgba(14, 165, 233, 0.2);
  font-weight: 500;
  white-space: nowrap;
}
.dataset:hover {
  transform: translateY(-1px);
  box-shadow: 0 4px 12px rgba(14, 165, 233, 0.3);
  background: linear-gradient(135deg, #e0f7fa 0%, #d1f2eb 100%);
}
.dataset-more {
  display: inline-block;
  font-size: 12px;
  color: #64748b;
  background: linear-gradient(135deg, #f1f5f9 0%, #e2e8f0 100%);
  padding: 4px 10px;
  border-radius: 8px;
  font-weight: 500;
  white-space: nowrap;
}

/* Light mode */
:root {
  --bg-start: #f8fafc;
  --bg-end: #f1f5f9;
  --card-bg: #ffffff;
  --card-border: rgba(226, 232, 240, 0.8);
  --title-color: #1e3a8a;
  --text-color: #0f172a;
  --muted: #475569;
  --link: #2563eb;
}

/* Dark mode overrides */
@media (prefers-color-scheme: dark) {
  :root {
    --bg-start: #0b1220;
    --bg-end: #0f172a;
    --card-bg: #071022;
    --card-border: rgba(15, 42, 68, 0.8);
    --title-color: #93c5fd;
    --text-color: #e6eef8;
    --muted: #cbd5e1;
    --link: #6ea8ff;
  }
  .dataset-more {
    color: #94a3b8;
    background: linear-gradient(135deg, rgba(148, 163, 184, 0.15) 0%, rgba(148, 163, 184, 0.1) 100%);
  }
  .chips-tags .chip {
    background: linear-gradient(135deg, rgba(29, 78, 216, 0.35) 0%, rgba(29, 78, 216, 0.25) 100%);
    color: #e6eef8;
    border: 1px solid rgba(148, 163, 184, 0.15);
  }
  .chips-langs .chip {
    background: linear-gradient(135deg, rgba(22, 101, 52, 0.35) 0%, rgba(22, 101, 52, 0.25) 100%);
    color: #e6eef8;
    border: 1px solid rgba(148, 163, 184, 0.15);
  }
  .links {
    border-top-color: rgba(148, 163, 184, 0.3);
  }
  .task-names {
    border-top-color: rgba(148, 163, 184, 0.25);
  }
  .task-names-label {
    color: #94a3b8;
  }
  .task-name {
    background: linear-gradient(135deg, rgba(146, 64, 14, 0.3) 0%, rgba(146, 64, 14, 0.2) 100%);
    color: #fbbf24;
    border-color: rgba(146, 64, 14, 0.3);
  }
  .task-name:hover {
    background: linear-gradient(135deg, rgba(146, 64, 14, 0.4) 0%, rgba(146, 64, 14, 0.3) 100%);
    box-shadow: 0 2px 6px rgba(251, 191, 36, 0.3);
  }
  .task-names-summary {
    background: rgba(148, 163, 184, 0.15);
    color: #94a3b8;
  }
  .task-names-summary:hover {
    background: rgba(148, 163, 184, 0.25);
    color: #cbd5e1;
  }
  .task-names-remaining {
    border-top-color: rgba(148, 163, 184, 0.25);
  }
}

/* apply */
body {
  background: linear-gradient(135deg, var(--bg-start) 0%, var(--bg-end) 100%);
  background-attachment: fixed;
  color: var(--text-color);
  min-height: 100vh;
}
.card {
  background: var(--card-bg);
  border: 2px solid var(--card-border);
  color: var(--text-color);
  backdrop-filter: blur(10px);
}
.title-text {
  color: var(--title-color);
}
.abstract {
  color: var(--muted);
}
.links a {
  color: var(--link);
}

/* small screens adjustments */
@media (max-width: 520px) {
  .cards-grid {
    gap: 12px;
    grid-template-columns: 1fr;
  }
  .title-text {
    font-size: 16px;
  }
  .card {
    padding: 14px;
  }
}
"""

# Unfiltered result set shown when the page first loads.
initial_tasks = filter_tasks([], [], "")

with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
<h2>Lighteval Tasks Explorer</h2>
<p>Browse tasks by language, tags and search the task descriptions.</p>
"""
            )
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")

    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            search_tb = gr.Textbox(
                label="Search",
                placeholder="Search in module path, tags, abstract…",
                value="",
            )
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
            gr.Markdown("Tip: use the filters and search together. Results update live.")
            gr.Image(
                value="measuring_model_size.png",
                label="",
                show_label=False,
                container=False,
                show_download_button=False,
            )
        with gr.Column(scale=5):
            # The initial cards are passed to the constructor; a separate
            # "loading" placeholder would be overwritten before launch anyway.
            cards = gr.HTML(value=render_cards(initial_tasks))

    show_all_langs.change(
        on_toggle_language_choices,
        inputs=[show_all_langs, lang_dd, tag_dd, search_tb],
        outputs=[lang_dd, task_counter, cards],
    )
    show_tags_filters.change(
        on_toggle_tags_visibility,
        inputs=[show_tags_filters, tag_dd, lang_dd, search_tb],
        outputs=[tag_dd, task_counter, cards],
    )
    # Every filter input refreshes the counter and the cards the same way.
    for filter_input in (search_tb, lang_dd, tag_dd):
        filter_input.change(
            on_filter,
            inputs=[lang_dd, tag_dd, search_tb],
            outputs=[task_counter, cards],
        )


if __name__ == "__main__":
    demo.launch()