every-leaderboards / index.html
Linker1907's picture
Remove authentication requirement - dataset is public
40e90a6
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Official Benchmarks Leaderboard 2026 — 12 Hugging Face Benchmarks</title>
<meta name="description" content="Unified leaderboard for 12 official Hugging Face benchmarks. Compare AI models across GSM8K, MMLU-Pro, GPQA, HLE, and more.">
<meta name="keywords" content="AI benchmark, HuggingFace benchmarks, GSM8K, MMLU-Pro, GPQA, HLE, SWE-bench, leaderboard, AI evaluation">
<meta name="author" content="Benchmarks Team">
<meta name="robots" content="index, follow">
<link href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:ital,wght@0,200;0,300;0,400;0,600;0,700;1,200;1,300;1,400;1,600;1,700&family=IBM+Plex+Mono:wght@400;600;700&display=swap" rel="stylesheet">
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.1/chart.umd.min.js"></script>
<script src="https://unpkg.com/es-module-shims@1.7.0/dist/es-module-shims.js"></script>
<script type="importmap">
{
"imports": {
"@huggingface/hub": "https://cdn.jsdelivr.net/npm/@huggingface/hub@0.21.0/+esm"
}
}
</script>
<style>
*{margin:0;padding:0;box-sizing:border-box;}
/* Design tokens for the default (light) theme; body.dark overrides a subset of them. */
:root{
/* Background and surface colors */
--bg:#f9fafb;--bg2:#f3f4f6;--surface:#ffffff;--surface-alt:#f9fafb;
/* Border colors (resting / hover) */
--border:#e5e7eb;--border-hover:#d1d5db;
/* Elevation shadows, small to large */
--shadow-sm:0 1px 3px rgba(15,23,42,.04),0 1px 2px rgba(15,23,42,.06);
--shadow:0 4px 16px rgba(15,23,42,.06),0 1px 3px rgba(15,23,42,.08);
--shadow-lg:0 12px 40px rgba(15,23,42,.08),0 4px 12px rgba(15,23,42,.06);
/* Text colors: primary, secondary, muted */
--text:#111827;--text-sec:#6b7280;--text-muted:#9ca3af;
/* Accent color, its darker variant, and a translucent accent background */
--ac:#6366f1;--ac2:#4f46e5;--ac-bg:rgba(99,102,241,.06);
/* Named palette colors */
--teal:#0d9488;--amber:#d97706;--green:#16a34a;--rose:#e11d48;--purple:#7c3aed;
/* Corner radii */
--radius:16px;--radius-sm:10px;--radius-xs:6px;
/* Font stacks (families are loaded from Google Fonts in <head>) */
--font:'Source Sans Pro',sans-serif;--font-mono:'IBM Plex Mono',monospace;
/* Shared transition duration + easing, used as `transition:var(--tr)` */
--tr:0.22s cubic-bezier(0.4,0,0.2,1);
}
html{scroll-behavior:smooth;}
body{font-family:var(--font);background:var(--bg);color:var(--text);min-height:100vh;-webkit-font-smoothing:antialiased;font-size:13px;}
::-webkit-scrollbar{width:5px;height:4px;}
::-webkit-scrollbar-track{background:transparent;}
::-webkit-scrollbar-thumb{background:rgba(99,102,241,.2);border-radius:10px;}
::-webkit-scrollbar-thumb:hover{background:rgba(99,102,241,.4);}
::selection{background:rgba(99,102,241,.12);}
body::before{content:"";position:fixed;inset:0;z-index:0;pointer-events:none;
background:radial-gradient(ellipse 70% 45% at 15% 8%,rgba(99,102,241,.05),transparent 55%),
radial-gradient(ellipse 55% 35% at 85% 92%,rgba(13,148,136,.04),transparent 50%);}
.wrap{position:relative;z-index:1;max-width:1600px;margin:0 auto;padding:22px 12px 70px;}
/* HEADER */
header{text-align:center;margin-bottom:20px;animation:fadeIn .6s ease-out;}
@keyframes fadeIn{from{opacity:0;transform:translateY(-10px)}to{opacity:1;transform:translateY(0)}}
.badge-row{display:flex;align-items:center;justify-content:center;gap:8px;margin-bottom:10px;flex-wrap:wrap;}
.badge{display:inline-flex;align-items:center;gap:6px;background:var(--surface);border:1px solid var(--border);border-radius:100px;padding:4px 14px;font-family:var(--font-mono);font-size:9px;font-weight:600;letter-spacing:2px;text-transform:uppercase;color:var(--ac);box-shadow:var(--shadow-sm);}
.pulse{width:5px;height:5px;border-radius:50%;background:var(--ac);animation:p 2s infinite;}
@keyframes p{0%,100%{opacity:1;transform:scale(1)}50%{opacity:.4;transform:scale(.8)}}
h1{font-size:clamp(20px,3vw,36px);font-weight:800;line-height:1.1;letter-spacing:-1.5px;margin-bottom:8px;
background:linear-gradient(135deg,#1e1b4b 15%,#6366f1 50%,#0d9488 85%);background-size:200%;
-webkit-background-clip:text;-webkit-text-fill-color:transparent;animation:shimmer 6s ease-in-out infinite;}
@keyframes shimmer{0%,100%{background-position:0%}50%{background-position:100%}}
.sub{color:var(--text-muted);font-size:11px;line-height:1.8;max-width:800px;margin:0 auto 12px;}
.sub b{color:var(--text-sec);font-weight:600;-webkit-text-fill-color:var(--text-sec);}
/* STATS */
.stats{display:flex;flex-wrap:wrap;gap:7px;justify-content:center;margin-bottom:16px;}
.st{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius-sm);padding:10px 16px;text-align:center;min-width:90px;box-shadow:var(--shadow-sm);transition:var(--tr);}
.st:hover{box-shadow:var(--shadow);border-color:var(--border-hover);}
.stn{font-family:var(--font-mono);font-size:18px;font-weight:700;color:var(--ac);}
.stl{font-size:9px;color:var(--text-muted);margin-top:2px;text-transform:uppercase;letter-spacing:.5px;}
/* TOOLBAR */
.toolbar{display:flex;flex-wrap:wrap;gap:8px;margin-bottom:12px;align-items:center;background:var(--surface);padding:12px;border-radius:var(--radius-sm);border:1px solid var(--border);box-shadow:var(--shadow-sm);}
.search-wrap{position:relative;flex:1;min-width:200px;max-width:300px;}
.search-wrap input{width:100%;padding:8px 10px 8px 32px;border:1px solid var(--border);border-radius:20px;background:var(--surface-alt);font-family:var(--font-mono);font-size:11px;color:var(--text);outline:none;transition:var(--tr);}
.search-wrap input:focus{border-color:var(--ac);box-shadow:0 0 0 2px rgba(99,102,241,.1);background:var(--surface);}
.search-wrap::before{content:"🔍";position:absolute;left:10px;top:50%;transform:translateY(-50%);font-size:14px;pointer-events:none;}
.flbl{font-size:9px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:1px;font-weight:600;}
.fb{background:var(--surface-alt);border:1px solid var(--border);color:var(--text-sec);padding:6px 14px;border-radius:20px;font-size:10px;font-weight:600;cursor:pointer;transition:var(--tr);box-shadow:var(--shadow-sm);font-family:var(--font);}
.fb:hover{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);}
.fb.on{background:linear-gradient(135deg,#6366f1,#4f46e5);border-color:transparent;color:#fff;box-shadow:0 3px 12px rgba(99,102,241,.25);}
/* SIZE FILTER */
.size-filter-wrap{display:flex;flex-direction:column;gap:4px;padding:8px 16px;border-left:1px solid var(--border);border-right:1px solid var(--border);min-width:220px;}
.size-filter-label{font-size:9px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:0.5px;font-weight:600;}
.range-slider-container{position:relative;height:32px;display:flex;align-items:center;}
.range-slider-track{position:absolute;width:100%;height:4px;background:var(--surface-alt);border-radius:2px;border:1px solid var(--border);}
.range-slider-fill{position:absolute;height:4px;background:linear-gradient(90deg,#6366f1,#4f46e5);border-radius:2px;transition:all 0.1s ease;}
.range-slider-input{position:absolute;width:100%;height:32px;-webkit-appearance:none;appearance:none;background:transparent;pointer-events:none;margin:0;}
.range-slider-input::-webkit-slider-thumb{-webkit-appearance:none;appearance:none;width:12px;height:12px;border-radius:50%;background:white;border:2px solid #6366f1;cursor:pointer;pointer-events:all;box-shadow:0 2px 4px rgba(0,0,0,0.1);transition:all 0.2s ease;}
.range-slider-input::-webkit-slider-thumb:hover{transform:scale(1.2);box-shadow:0 2px 8px rgba(99,102,241,0.3);}
.range-slider-input::-webkit-slider-thumb:active{transform:scale(1.1);box-shadow:0 2px 12px rgba(99,102,241,0.4);}
.range-slider-input::-moz-range-thumb{width:12px;height:12px;border-radius:50%;background:white;border:2px solid #6366f1;cursor:pointer;pointer-events:all;box-shadow:0 2px 4px rgba(0,0,0,0.1);transition:all 0.2s ease;}
.range-slider-input::-moz-range-thumb:hover{transform:scale(1.2);box-shadow:0 2px 8px rgba(99,102,241,0.3);}
.range-slider-input::-moz-range-thumb:active{transform:scale(1.1);box-shadow:0 2px 12px rgba(99,102,241,0.4);}
.range-values{display:flex;justify-content:center;align-items:center;gap:4px;font-size:10px;font-family:var(--font-mono);color:var(--text-sec);font-weight:600;}
.range-values span:first-child,.range-values span:last-child{color:var(--ac);font-weight:700;}
/* BENCHMARK FILTER BAR */
.benchmark-filter-bar{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius-sm);padding:12px;margin-bottom:12px;box-shadow:var(--shadow-sm);}
.benchmark-filter-wrapper{display:flex;flex-wrap:nowrap;gap:10px;align-items:center;justify-content:space-between;}
.benchmark-category{display:flex;align-items:center;gap:6px;padding:6px 12px 6px 8px;background:var(--bg2);border-radius:12px;border:1px solid var(--border);flex:1;min-width:0;}
.category-icon{font-size:14px;line-height:1;flex-shrink:0;}
.category-label{font-size:9px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:0.5px;font-weight:600;white-space:nowrap;flex-shrink:0;}
.benchmark-pills{display:flex;gap:4px;flex-wrap:wrap;flex:1;min-width:0;}
.benchmark-pill{background:var(--surface);border:1px solid var(--border);color:var(--text-sec);padding:4px 10px;border-radius:16px;font-size:9px;font-weight:600;cursor:pointer;transition:all 0.2s cubic-bezier(0.4,0,0.2,1);font-family:var(--font);white-space:nowrap;box-shadow:var(--shadow-sm);opacity:0.6;}
.benchmark-pill:hover{border-color:var(--border-hover);transform:translateY(-1px);box-shadow:0 2px 8px rgba(15,23,42,.08);}
.benchmark-pill.active{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);opacity:1;box-shadow:0 2px 8px rgba(99,102,241,.12);}
.benchmark-pill.active:hover{background:rgba(99,102,241,.12);border-color:var(--ac);}
.filter-actions{display:flex;gap:6px;flex-shrink:0;padding-left:12px;border-left:2px solid var(--border);}
.filter-action-btn{background:var(--surface-alt);border:1px solid var(--border);color:var(--text-sec);padding:4px 10px;border-radius:16px;font-size:9px;font-weight:600;cursor:pointer;transition:var(--tr);font-family:var(--font);white-space:nowrap;box-shadow:var(--shadow-sm);}
.filter-action-btn:hover{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);}
/* TABLE */
.tw{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);overflow-x:auto;box-shadow:var(--shadow);margin-bottom:20px;}
table{width:100%;border-collapse:collapse;font-size:11px;}
thead{background:var(--surface-alt);position:sticky;top:0;z-index:100;box-shadow:0 2px 4px rgba(0,0,0,0.1);}
thead tr{border-bottom:2px solid var(--border);}
th{padding:12px 8px;text-align:center;font-size:11px;font-family:var(--font-mono);text-transform:uppercase;letter-spacing:.5px;color:var(--text-muted);white-space:nowrap;cursor:pointer;user-select:none;vertical-align:bottom;line-height:1.6;font-weight:700;transition:var(--tr);}
th.c-model{text-align:left;padding-left:14px;min-width:180px;position:sticky;left:0;background:var(--surface-alt);z-index:101;}
th:hover{color:var(--ac);background:rgba(99,102,241,.08);transform:translateY(-1px);}
th.sorted{color:var(--ac);font-weight:800;}
.sa{opacity:.6;font-size:7px;margin-left:3px;}
th a{color:inherit;text-decoration:none;}
th a:hover{color:var(--ac);text-decoration:underline;}
tbody tr{border-bottom:1px solid var(--border);transition:background var(--tr);}
tbody tr:last-child{border-bottom:none;}
tbody tr:hover{background:rgba(99,102,241,.025);}
/* Highlighted rows. --hover is not defined anywhere in this stylesheet, so without a
   fallback the hover declaration is invalid at computed-value time and resolves to no
   background at all. The fallback reuses the tint of the regular row hover rule. */
tbody tr.hl{background:transparent;}
tbody tr.hl:hover{background:var(--hover,rgba(99,102,241,.025));}
tbody tr.hidden{display:none;}
td{padding:10px 6px;text-align:center;vertical-align:middle;}
td.c-model{text-align:left;padding-left:14px;position:sticky;left:0;background:var(--surface);z-index:9;border-right:1px solid var(--border);}
tbody tr:hover td.c-model{background:rgba(99,102,241,.025);}
tbody tr.hl td.c-model{background:var(--surface);}
/* MODEL CELL */
.mc{display:flex;flex-direction:column;gap:2px;}
.mn{font-weight:700;font-size:12px;color:var(--text);display:flex;align-items:center;gap:5px;flex-wrap:wrap;}
.mn a{color:var(--text);text-decoration:none;transition:var(--tr);position:relative;}
.mn a:hover{color:var(--ac);text-decoration:none;}
.mn a::after{content:'';position:absolute;bottom:-2px;left:0;width:0;height:1px;background:var(--ac);transition:width 0.3s ease;}
.mn a:hover::after{width:100%;}
.ms{display:flex;gap:4px;align-items:center;margin-top:2px;}
.mp{font-size:8px;color:var(--text-muted);font-family:var(--font-mono);}
.badge-type{font-size:7px;padding:2px 6px;border-radius:4px;font-family:var(--font-mono);font-weight:700;text-transform:uppercase;}
.badge-open{background:rgba(22,163,74,.1);color:#16a34a;border:1px solid rgba(22,163,74,.2);}
.badge-closed{background:rgba(100,116,139,.1);color:#64748b;border:1px solid rgba(100,116,139,.2);}
/* PROVIDER LOGO */
.provider-logo{width:20px;height:20px;border-radius:50%;object-fit:cover;border:1px solid var(--border);box-shadow:var(--shadow-sm);transition:var(--tr);}
.provider-logo:hover{transform:scale(1.1);box-shadow:var(--shadow);}
.provider-logo-fallback{width:20px;height:20px;border-radius:50%;background:var(--ac-bg);border:1px solid var(--border);display:inline-flex;align-items:center;justify-content:center;font-size:9px;font-weight:700;color:var(--ac);font-family:var(--font-mono);}
.provider-logo-inline{width:16px;height:16px;border-radius:50%;object-fit:cover;border:1px solid var(--border);box-shadow:var(--shadow-sm);margin-right:6px;vertical-align:middle;display:inline-block;}
.provider-logo-fallback-inline{width:16px;height:16px;border-radius:50%;background:var(--ac-bg);border:1px solid var(--border);display:inline-flex;align-items:center;justify-content:center;font-size:8px;font-weight:700;color:var(--ac);font-family:var(--font-mono);margin-right:6px;vertical-align:middle;}
/* SCORE CELL */
.sc{display:flex;flex-direction:column;align-items:center;gap:2px;}
.sn{font-family:var(--font-mono);font-size:11px;font-weight:700;}
.sb{width:40px;height:3px;background:var(--border);border-radius:2px;overflow:hidden;margin-top:2px;}
.sf{height:100%;border-radius:2px;transition:width .8s cubic-bezier(0.4,0,0.2,1);}
.na{color:var(--text-muted);font-size:9px;font-family:var(--font-mono);}
.conf-badge{font-size:6px;padding:1px 4px;border-radius:3px;font-family:var(--font-mono);font-weight:700;margin-top:2px;}
.conf-official{background:rgba(22,163,74,.1);color:#16a34a;border:1px solid rgba(22,163,74,.2);}
.conf-verified{background:rgba(59,130,246,.1);color:#3b82f6;border:1px solid rgba(59,130,246,.2);}
.conf-community{background:rgba(217,119,6,.1);color:#d97706;border:1px solid rgba(217,119,6,.2);}
/* COLOR GRADES */
.grade-s{color:#6366f1;font-weight:700;}
.grade-a{color:#0d9488;font-weight:700;}
.grade-b{color:#d97706;font-weight:700;}
.grade-c{color:#e11d48;font-weight:600;}
.bar-s{background:linear-gradient(90deg,#6366f1,#818cf8);}
.bar-a{background:linear-gradient(90deg,#0d9488,#14b8a6);}
.bar-b{background:linear-gradient(90deg,#d97706,#f59e0b);}
.bar-c{background:linear-gradient(90deg,#e11d48,#f43f5e);}
/* BENCHMARK COLORS */
.bm-math{color:#d97706;}
.bm-knowledge{color:#6366f1;}
.bm-coding{color:#0d9488;}
.bm-vision{color:#16a34a;}
.bm-embedding{color:#7c3aed;}
.bm-language{color:#e11d48;}
.bm-agent{color:#0d9488;}
/* DARK MODE */
body.dark{--bg:#0f172a;--bg2:#1e293b;--surface:#1e293b;--surface-alt:#334155;
--border:#334155;--border-hover:#475569;--text:#f1f5f9;--text-sec:#cbd5e1;--text-muted:#94a3b8;
--shadow-sm:0 1px 3px rgba(0,0,0,.3);--shadow:0 4px 16px rgba(0,0,0,.3);--shadow-lg:0 12px 40px rgba(0,0,0,.4);
--ac:#818cf8;--ac2:#6366f1;--ac-bg:rgba(129,140,248,.1);}
body.dark::before{background:radial-gradient(ellipse 70% 45% at 15% 8%,rgba(129,140,248,.08),transparent 55%),radial-gradient(ellipse 55% 35% at 85% 92%,rgba(13,148,136,.06),transparent 50%);}
body.dark th.c-model,body.dark td.c-model{background:var(--surface)!important;}
body.dark thead{background:var(--surface-alt);}
body.dark tbody tr:hover td.c-model{background:var(--surface-alt)!important;}
/* MOBILE */
@media(max-width:768px){
.wrap{padding:12px 8px 50px;}
h1{font-size:20px!important;}
.toolbar{flex-direction:column;gap:6px;}
.search-wrap{max-width:100%;min-width:100%;}
.benchmark-filter-wrapper{flex-wrap:wrap;gap:8px;}
.benchmark-category{flex:1 1 100%;margin-bottom:4px;}
.filter-actions{flex:1 1 100%;justify-content:center;margin-top:4px;border-left:none;border-top:2px solid var(--border);padding-left:0;padding-top:8px;}
table{font-size:9px;}
th,td{padding:6px 3px;}
th.c-model,td.c-model{min-width:130px!important;}
.mn{font-size:10px!important;}
}
</style>
</head>
<body>
<div class="wrap">
<header>
<div class="badge-row">
<button id="darkBtn" onclick="toggleDark()" style="background:linear-gradient(135deg,#1e293b,#334155);border:1px solid #475569;border-radius:20px;padding:4px 14px;font-size:10px;font-family:var(--font-mono);color:#e2e8f0;cursor:pointer;font-weight:700;transition:all .2s;box-shadow:0 2px 6px rgba(0,0,0,.2)">🌙 Dark</button>
</div>
<h1>Community Benchmarks Leaderboard</h1>
<p class="sub">
<b>Unified leaderboard for the official Hugging Face benchmarks.</b> A place to find and compare results coming from the community, model cards, and papers.
</p>
<div class="stats">
<div class="st"><div class="stn" id="statModels">6</div><div class="stl">Models</div></div>
<div class="st"><div class="stn">12</div><div class="stl">Benchmarks</div></div>
<div class="st"><div class="stn" id="statScores">0</div><div class="stl">Total Scores</div></div>
</div>
</header>
<div class="toolbar">
<div class="search-wrap">
<!-- The placeholder is not an accessible name, so aria-label supplies one for assistive technology. -->
<input type="text" id="searchBox" placeholder="Search models..." aria-label="Search models" oninput="filterModels()">
</div>
<div class="size-filter-wrap">
<div class="size-filter-label">Model Size</div>
<!-- Dual-thumb size slider: two overlapping range inputs share one track. Each input carries its own accessible name because the visible "Model Size" text is not associated with either control. -->
<div class="range-slider-container">
<div class="range-slider-track">
<div class="range-slider-fill" id="sliderFill"></div>
</div>
<input type="range" id="minSize" min="0" max="1100" value="0" step="1" class="range-slider-input" aria-label="Minimum model size in billions of parameters">
<input type="range" id="maxSize" min="0" max="1100" value="1100" step="1" class="range-slider-input" aria-label="Maximum model size in billions of parameters">
</div>
<div class="range-values">
<span id="minSizeLabel">0B</span>
<span></span>
<span id="maxSizeLabel">1100B+</span>
</div>
</div>
<div style="flex: 1"></div>
<!-- Sign-in button, hidden by default. -->
<button id="oauthSignin" class="fb" type="button" style="display: none; background: linear-gradient(135deg, #6366f1, #4f46e5); color: white; border: none;">
🔐 Sign in with HF
</button>
<!-- Signed-in user widget, hidden by default. The inline style previously declared `display` twice (none, then flex); the later declaration won, so the empty avatar and the "Sign out" button were always rendered. Code that reveals this element must set display:flex itself. -->
<div id="oauthUser" style="display: none; font-size: 10px; color: var(--text-sec); font-family: var(--font-mono); align-items: center; gap: 8px;">
<img id="oauthAvatar" src="" alt="" style="width: 24px; height: 24px; border-radius: 50%; border: 1px solid var(--border);">
<span id="oauthUsername"></span>
<button id="oauthSignout" class="fb" type="button" style="padding: 4px 10px;">Sign out</button>
</div>
</div>
<div class="benchmark-filter-bar">
<div class="benchmark-filter-wrapper">
<div class="benchmark-category">
<span class="category-icon">📐</span>
<span class="category-label">Math</span>
<div class="benchmark-pills">
<button class="benchmark-pill active" data-benchmark="gsm8k" onclick="toggleBenchmark('gsm8k')">GSM8K</button>
<button class="benchmark-pill active" data-benchmark="aime2026" onclick="toggleBenchmark('aime2026')">AIME 2026</button>
<button class="benchmark-pill active" data-benchmark="hmmt2026" onclick="toggleBenchmark('hmmt2026')">HMMT</button>
</div>
</div>
<div class="benchmark-category">
<span class="category-icon">🧠</span>
<span class="category-label">Knowledge</span>
<div class="benchmark-pills">
<button class="benchmark-pill active" data-benchmark="mmluPro" onclick="toggleBenchmark('mmluPro')">MMLU-Pro</button>
<button class="benchmark-pill active" data-benchmark="gpqa" onclick="toggleBenchmark('gpqa')">GPQA◆</button>
<button class="benchmark-pill active" data-benchmark="hle" onclick="toggleBenchmark('hle')">HLE</button>
</div>
</div>
<div class="benchmark-category">
<span class="category-icon">💻</span>
<span class="category-label">Coding</span>
<div class="benchmark-pills">
<button class="benchmark-pill active" data-benchmark="sweVerified" onclick="toggleBenchmark('sweVerified')">SWE-V</button>
<button class="benchmark-pill active" data-benchmark="swePro" onclick="toggleBenchmark('swePro')">SWE-Pro</button>
</div>
</div>
<div class="benchmark-category">
<span class="category-icon">🤖</span>
<span class="category-label">Agent</span>
<div class="benchmark-pills">
<button class="benchmark-pill active" data-benchmark="terminalBench" onclick="toggleBenchmark('terminalBench')">TB 2.0</button>
</div>
</div>
<div class="benchmark-category">
<span class="category-icon">💬</span>
<span class="category-label">Language</span>
<div class="benchmark-pills">
<button class="benchmark-pill active" data-benchmark="evasionBench" onclick="toggleBenchmark('evasionBench')">EvasionB</button>
</div>
</div>
<div class="benchmark-category">
<span class="category-icon">👁️</span>
<span class="category-label">Vision</span>
<div class="benchmark-pills">
<button class="benchmark-pill" data-benchmark="olmOcr" onclick="toggleBenchmark('olmOcr')">olmOCR</button>
</div>
</div>
<div class="benchmark-category">
<span class="category-icon">🔍</span>
<span class="category-label">Embedding</span>
<div class="benchmark-pills">
<button class="benchmark-pill" data-benchmark="arguana" onclick="toggleBenchmark('arguana')">ArguAna</button>
</div>
</div>
<div class="filter-actions">
<button class="filter-action-btn" onclick="selectAllBenchmarks()">Select All</button>
<button class="filter-action-btn" onclick="deselectAllBenchmarks()">Clear All</button>
</div>
</div>
</div>
<div class="tw">
<table id="leaderboardTable">
<thead>
<!-- Header row. Clicking a cell calls sortTable(columnIndex); the info link inside each cell stops propagation so that following it does not also trigger a sort. Links open in a new tab with rel="noopener noreferrer" and carry an aria-label because their only visible content is an emoji. -->
<tr>
<th scope="col" class="c-model" onclick="sortTable(0)">Model<span class="sa"></span></th>
<th scope="col" onclick="sortTable(1)" class="bm-math" title="Grade School Math 8K - Click to sort">GSM8K <a href="https://huggingface.co/datasets/openai/gsm8k" target="_blank" rel="noopener noreferrer" aria-label="About GSM8K (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(2)" class="bm-knowledge" title="Massive Multi-task Language Understanding Pro - Click to sort">MMLU-Pro <a href="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro" target="_blank" rel="noopener noreferrer" aria-label="About MMLU-Pro (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(3)" class="bm-knowledge" title="PhD-level expert questions - Click to sort">GPQA◆ <a href="https://huggingface.co/datasets/Idavidrein/gpqa" target="_blank" rel="noopener noreferrer" aria-label="About GPQA Diamond (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(4)" class="bm-knowledge" title="Humanity's Last Exam - Click to sort">HLE <a href="https://lastexam.ai" target="_blank" rel="noopener noreferrer" aria-label="About HLE (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(5)" class="bm-vision" title="OCR Evaluation Benchmark - Click to sort">olmOCR <a href="https://huggingface.co/datasets/allenai/olmOCR-bench" target="_blank" rel="noopener noreferrer" aria-label="About olmOCR (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(6)" class="bm-coding" title="SWE-bench Verified - Click to sort">SWE-V <a href="https://www.swebench.com" target="_blank" rel="noopener noreferrer" aria-label="About SWE-bench Verified (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(7)" class="bm-embedding" title="MTEB Text Retrieval - Click to sort">ArguAna <a href="https://huggingface.co/datasets/mteb/arguana" target="_blank" rel="noopener noreferrer" aria-label="About ArguAna (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(8)" class="bm-coding" title="SWE-bench Pro - Click to sort">SWE-Pro <a href="https://scale.com/leaderboard/swe_bench_pro_public" target="_blank" rel="noopener noreferrer" aria-label="About SWE-bench Pro (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(9)" class="bm-math" title="AIME 2026 - Click to sort">AIME'26 <a href="https://matharena.ai/?comp=aime--aime_2026" target="_blank" rel="noopener noreferrer" aria-label="About AIME 2026 (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(10)" class="bm-agent" title="Terminal-Bench 2.0 - Click to sort">TB 2.0 <a href="https://www.tbench.ai/leaderboard/terminal-bench/2.0" target="_blank" rel="noopener noreferrer" aria-label="About Terminal-Bench 2.0 (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(11)" class="bm-language" title="EvasionBench - Click to sort">EvasionB <a href="https://huggingface.co/datasets/FutureMa/EvasionBench" target="_blank" rel="noopener noreferrer" aria-label="About EvasionBench (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
<th scope="col" onclick="sortTable(12)" class="bm-math" title="HMMT February 2026 - Click to sort">HMMT Feb'26 <a href="https://matharena.ai/?comp=hmmt--hmmt_feb_2026" target="_blank" rel="noopener noreferrer" aria-label="About HMMT February 2026 (opens in a new tab)" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa"></span></th>
</tr>
</thead>
<tbody id="tableBody">
</tbody>
</table>
</div>
</div>
<script>
// Load leaderboard data from HuggingFace parquet dataset
// LEADERBOARD_DATA stays null until loadLeaderboardData() assigns the assembled
// { metadata, benchmarks, models } object to it.
let LEADERBOARD_DATA = null;
// Dataset repository identifier interpolated into PARQUET_URL below.
const DATASET_REPO = 'OpenEvals/leaderboard-data';
// URL that loadLeaderboardData() fetches; the path ends in default/train/0.parquet.
const PARQUET_URL = `https://huggingface.co/api/datasets/${DATASET_REPO}/parquet/default/train/0.parquet`;
// Transform flat parquet row to nested JSON structure expected by the UI
/**
 * Convert one flat parquet row into the nested model object used by the UI.
 *
 * Scalar columns are copied into top-level fields and `metadata`; every
 * `<benchmarkKey>_score` column whose value is neither null nor undefined
 * becomes an entry in `benchmarks`. A score of 0 is kept.
 *
 * The key list includes `arguana`, which has a table column, a filter pill and
 * a BENCHMARK_CATEGORIES entry. Rows without an `arguana_score` column are
 * unaffected because absent scores are skipped.
 *
 * @param {Object} row Flat row object keyed by parquet column name.
 * @returns {Object} Model with id, name, provider, type, metadata, benchmarks,
 *   aggregateScore, coverageCount and coveragePercent.
 */
function transformParquetToModel(row) {
  // Same order as the table columns so iteration over `benchmarks` follows the header.
  const benchmarkKeys = ['gsm8k', 'mmluPro', 'gpqa', 'hle', 'olmOcr', 'sweVerified', 'arguana',
    'swePro', 'aime2026', 'terminalBench', 'evasionBench', 'hmmt2026'];

  // Computed once so every benchmark entry of this model carries the same date string.
  const today = new Date().toISOString().split('T')[0];

  const benchmarks = {};
  for (const key of benchmarkKeys) {
    const score = row[`${key}_score`];
    // Explicit null/undefined test: a legitimate score of 0 must not be dropped.
    if (score !== null && score !== undefined) {
      benchmarks[key] = {
        score: score,
        confidence: 'official',
        source: 'API',
        date: today
      };
    }
  }

  return {
    id: row.model_id,
    name: row.model_name,
    provider: row.provider,
    type: row.model_type || 'open',
    metadata: {
      license: row.license || 'Unknown',
      parametersInBillions: row.parameters_billions,
      contextWindow: row.context_window || 0,
      modality: row.modality || 'text',
      architecture: row.architecture || 'Transformer'
    },
    benchmarks: benchmarks,
    aggregateScore: row.aggregate_score,
    coverageCount: row.coverage_count,
    coveragePercent: row.coverage_percent
  };
}
// Fetch data from public parquet dataset
/**
 * Fetch the public parquet file, decode it, and populate LEADERBOARD_DATA.
 *
 * Steps: show a loading row, fetch PARQUET_URL, decode the bytes with
 * parquet-wasm, convert the resulting Arrow IPC stream to plain row objects
 * with apache-arrow, map each row through transformParquetToModel, assign
 * LEADERBOARD_DATA, then call init().
 *
 * Any failure in those steps (including an exception thrown by init()) is
 * logged and replaced by an error row with a Retry button; nothing is rethrown.
 *
 * @returns {Promise<void>}
 */
async function loadLeaderboardData() {
  try {
    // Show loading state
    const tableBody = document.querySelector('#leaderboardTable tbody');
    if (tableBody) {
      tableBody.innerHTML = '<tr><td colspan="13" style="text-align:center;padding:40px;color:var(--text-muted);">Loading leaderboard data...</td></tr>';
    }
    console.log('Fetching parquet data from:', PARQUET_URL);

    // Fetch parquet file (no auth needed for public dataset)
    const response = await fetch(PARQUET_URL);
    if (!response.ok) {
      throw new Error(`Failed to load data: ${response.status} ${response.statusText}`);
    }
    const arrayBuffer = await response.arrayBuffer();

    // parquet-wasm: the module's default export is awaited before readParquet is called.
    const parquetWasm = await import('https://cdn.jsdelivr.net/npm/parquet-wasm@0.6.0/+esm');
    await parquetWasm.default();
    const parquetFile = parquetWasm.readParquet(new Uint8Array(arrayBuffer));
    const table = parquetFile.intoIPCStream();

    // Parse Apache Arrow IPC to JSON
    const arrowTable = await import('https://cdn.jsdelivr.net/npm/apache-arrow@14.0.0/+esm').then(m => m.tableFromIPC(table));
    const rows = arrowTable.toArray().map(row => row.toJSON());
    console.log(`Loaded ${rows.length} models from parquet`);

    // Transform flat parquet rows to nested JSON structure
    const models = rows.map(transformParquetToModel);

    LEADERBOARD_DATA = {
      metadata: {
        version: '1.0.0',
        lastUpdated: new Date().toISOString(),
        title: 'Official Benchmarks Leaderboard 2026',
        description: 'Unified leaderboard for 11 official Hugging Face benchmarks',
        totalModels: models.length,
        totalBenchmarks: 11
      },
      benchmarks: {
        // Benchmark definitions (kept for compatibility)
        gsm8k: { id: 'gsm8k', name: 'GSM8K', shortName: 'GSM8K', category: 'math' },
        mmluPro: { id: 'mmluPro', name: 'MMLU-Pro', shortName: 'MMLU-Pro', category: 'knowledge' },
        gpqa: { id: 'gpqa', name: 'GPQA Diamond', shortName: 'GPQA', category: 'knowledge' },
        hle: { id: 'hle', name: 'HLE', shortName: 'HLE', category: 'knowledge' },
        olmOcr: { id: 'olmOcr', name: 'olmOCR', shortName: 'olmOCR', category: 'vision' },
        sweVerified: { id: 'sweVerified', name: 'SWE-bench Verified', shortName: 'SWE-V', category: 'coding' },
        swePro: { id: 'swePro', name: 'SWE-bench Pro', shortName: 'SWE-Pro', category: 'coding' },
        aime2026: { id: 'aime2026', name: 'AIME 2026', shortName: 'AIME', category: 'math' },
        terminalBench: { id: 'terminalBench', name: 'Terminal-Bench 2.0', shortName: 'TB 2.0', category: 'agent' },
        evasionBench: { id: 'evasionBench', name: 'EvasionBench', shortName: 'EvasionB', category: 'language' },
        hmmt2026: { id: 'hmmt2026', name: 'HMMT Feb 2026', shortName: 'HMMT', category: 'math' }
      },
      models: models
    };
    console.log('Leaderboard data loaded successfully');

    // Initialize the page after data is loaded
    init();
  } catch (error) {
    console.error('Error loading leaderboard data:', error);
    const tableBody = document.querySelector('#leaderboardTable tbody');
    if (tableBody) {
      // Static markup only; the message itself is inserted below via textContent.
      tableBody.innerHTML = `
<tr>
<td colspan="13" style="text-align:center;padding:40px;">
<div style="color:var(--rose);font-weight:600;margin-bottom:10px;">⚠️ Failed to load leaderboard data</div>
<div data-role="load-error-detail" style="color:var(--text-muted);font-size:11px;"></div>
<div style="margin-top:15px;">
<button onclick="location.reload()" style="background:var(--ac);color:white;border:none;padding:8px 16px;border-radius:8px;cursor:pointer;font-weight:600;">Retry</button>
</div>
</td>
</tr>
`;
      // The message can contain server-supplied text (response.statusText) or text from
      // third-party library errors, so it is written as text rather than parsed as HTML.
      // Non-Error throwables are stringified instead of rendering "undefined".
      const detail = tableBody.querySelector('[data-role="load-error-detail"]');
      if (detail) {
        detail.textContent = (error && error.message) ? error.message : String(error);
      }
    }
  }
}
// Static lookup: benchmark key -> category name. One line per category;
// key order is unchanged because selectAllBenchmarks() takes Object.keys() of this map.
const BENCHMARK_CATEGORIES = {
  gsm8k: 'math', aime2026: 'math', hmmt2026: 'math',
  mmluPro: 'knowledge', gpqa: 'knowledge', hle: 'knowledge',
  sweVerified: 'coding', swePro: 'coding',
  olmOcr: 'vision',
  arguana: 'embedding',
  terminalBench: 'agent',
  evasionBench: 'language'
};
// Per-category presentation: a solid text color plus a three-stop left-to-right gradient.
// 'aggregate' is an extra entry alongside the benchmark categories.
const CATEGORY_GRADIENTS = {
  math:      { textColor: '#7c3aed', gradient: 'linear-gradient(90deg, #e9d5ff, #a855f7, #7c3aed)' },
  knowledge: { textColor: '#2563eb', gradient: 'linear-gradient(90deg, #dbeafe, #60a5fa, #2563eb)' },
  coding:    { textColor: '#059669', gradient: 'linear-gradient(90deg, #d1fae5, #34d399, #059669)' },
  agent:     { textColor: '#0d9488', gradient: 'linear-gradient(90deg, #ccfbf1, #5eead4, #0d9488)' },
  language:  { textColor: '#ea580c', gradient: 'linear-gradient(90deg, #fed7aa, #fb923c, #ea580c)' },
  vision:    { textColor: '#db2777', gradient: 'linear-gradient(90deg, #fce7f3, #f472b6, #db2777)' },
  embedding: { textColor: '#d97706', gradient: 'linear-gradient(90deg, #fef3c7, #fbbf24, #d97706)' },
  aggregate: { textColor: '#6366f1', gradient: 'linear-gradient(90deg, #e0e7ff, #818cf8, #6366f1)' }
};
// Benchmark filter functions
// Keys of the benchmarks currently selected. The array is the source of truth for
// selection; pill CSS classes are only a reflection of it.
let selectedBenchmarks = ['mmluPro', 'gpqa', 'hle', 'sweVerified', 'swePro', 'aime2026', 'terminalBench', 'evasionBench', 'hmmt2026'];

/**
 * Flip the selection state of one benchmark and refresh the table.
 *
 * Membership in `selectedBenchmarks` decides the direction of the flip. The
 * matching pill, if one exists, has its `active` class set to the new state;
 * a key with no pill in the DOM still updates the selection and the table
 * instead of throwing on a null element.
 *
 * @param {string} benchmarkKey Value of the pill's data-benchmark attribute.
 */
function toggleBenchmark(benchmarkKey) {
  const pill = document.querySelector(`.benchmark-pill[data-benchmark="${benchmarkKey}"]`);
  const wasSelected = selectedBenchmarks.includes(benchmarkKey);

  if (wasSelected) {
    // Deselect
    selectedBenchmarks = selectedBenchmarks.filter(b => b !== benchmarkKey);
  } else {
    // Select
    selectedBenchmarks.push(benchmarkKey);
  }

  // Same null guard as initBenchmarkFilter applies to its pill lookup.
  if (pill) {
    pill.classList.toggle('active', !wasSelected);
  }

  // Update table
  filterByBenchmarks(selectedBenchmarks);
}
/**
 * Select every benchmark listed in BENCHMARK_CATEGORIES, mark all pills
 * active, and refresh the table.
 */
function selectAllBenchmarks() {
  selectedBenchmarks = Object.keys(BENCHMARK_CATEGORIES);

  const pills = document.querySelectorAll('.benchmark-pill');
  for (const pill of pills) {
    pill.classList.add('active');
  }

  filterByBenchmarks(selectedBenchmarks);
}
/**
 * Clear the benchmark selection, mark all pills inactive, and refresh the
 * table (which then shows its "no benchmarks selected" message).
 */
function deselectAllBenchmarks() {
  selectedBenchmarks = [];

  for (const pill of document.querySelectorAll('.benchmark-pill')) {
    pill.classList.remove('active');
  }

  filterByBenchmarks(selectedBenchmarks);
}
/**
 * Mark the pills of the initially selected benchmarks as active, then
 * populate the table for that selection.
 */
function initBenchmarkFilter() {
  for (const key of selectedBenchmarks) {
    // Keys without a pill in the DOM are simply skipped.
    document
      .querySelector(`.benchmark-pill[data-benchmark="${key}"]`)
      ?.classList.add('active');
  }

  filterByBenchmarks(selectedBenchmarks);
}
// ===== SIZE FILTER FUNCTIONS =====
/**
 * Read a model's size from `metadata.parametersInBillions`.
 *
 * @param {object} model - Model record; `metadata` may be missing.
 * @returns {*} The stored value, or null when it is absent/null/undefined
 *   (rows without a size are never filtered out by the size slider).
 */
function getModelSize(model) {
  return model.metadata?.parametersInBillions ?? null;
}
/**
 * Wire up the dual-thumb model-size slider (#minSize / #maxSize): keeps the
 * thumbs at least 1 apart, updates both labels and the fill bar, and
 * re-applies the row filters on every drag step.
 */
function initSizeFilter() {
  // Top of the size scale; at this value the max label reads "1100B+".
  const SIZE_CEILING = 1100;

  const minSlider = document.getElementById('minSize');
  const maxSlider = document.getElementById('maxSize');
  const minLabel = document.getElementById('minSizeLabel');
  const maxLabel = document.getElementById('maxSizeLabel');
  const sliderFill = document.getElementById('sliderFill');

  function updateSlider() {
    let minVal = parseInt(minSlider.value, 10);
    let maxVal = parseInt(maxSlider.value, 10);

    // Prevent thumbs from crossing. After each write the value is read back
    // from the control: the browser clamps range inputs to their own
    // min/max, so the local variable must follow what the slider actually
    // holds (otherwise dragging max to the floor produced a "-1B" label).
    if (minVal > maxVal - 1) {
      minSlider.value = maxVal - 1;
      minVal = parseInt(minSlider.value, 10);
    }
    if (maxVal < minVal + 1) {
      maxSlider.value = minVal + 1;
      maxVal = parseInt(maxSlider.value, 10);
    }

    // Update labels
    minLabel.textContent = minVal + 'B';
    maxLabel.textContent = maxVal === SIZE_CEILING ? '1100B+' : maxVal + 'B';

    // Update fill bar (percentages of the full scale)
    const minPercent = (minVal / SIZE_CEILING) * 100;
    const maxPercent = (maxVal / SIZE_CEILING) * 100;
    sliderFill.style.left = minPercent + '%';
    sliderFill.style.width = (maxPercent - minPercent) + '%';

    // Apply all filters
    applyAllFilters();
  }

  // Real-time filtering on input (drag)
  minSlider.addEventListener('input', updateSlider);
  maxSlider.addEventListener('input', updateSlider);

  // Initial update
  updateSlider();
}
/**
 * Apply the search box and the size slider to every table row (AND logic),
 * showing or hiding rows through `style.display`.
 *
 * - Rows with no usable size (`data-size` empty, non-numeric, or <= 0)
 *   always pass the size filter.
 * - When the max slider sits at 1100 it is labelled "1100B+", so no upper
 *   bound is applied in that position.
 * - Rows without a `data-name` attribute (status/message rows) are left
 *   untouched instead of being hidden by a search term.
 */
function applyAllFilters() {
  // Slider position that means "no upper limit" (shown as "1100B+").
  const OPEN_ENDED_MAX = 1100;

  const searchTerm = document.getElementById('searchBox').value.toLowerCase();
  const minSize = parseInt(document.getElementById('minSize').value, 10);
  const maxSize = parseInt(document.getElementById('maxSize').value, 10);
  const hasUpperBound = maxSize !== OPEN_ENDED_MAX;

  document.querySelectorAll('#tableBody tr').forEach(row => {
    // Not a model row (e.g. the "no benchmarks selected" message).
    if (row.dataset.name === undefined) return;

    const modelName = row.dataset.name;
    const modelSize = parseFloat(row.dataset.size);

    // Search filter
    const matchesSearch = !searchTerm || modelName.includes(searchTerm);

    // Size filter (always show if no size data)
    let matchesSize = true;
    if (!isNaN(modelSize) && modelSize > 0) {
      matchesSize = modelSize >= minSize && (!hasUpperBound || modelSize <= maxSize);
    }

    row.style.display = matchesSearch && matchesSize ? '' : 'none';
  });
}
/**
 * Rebuild the table for the given benchmark selection and show only the
 * matching benchmark columns.
 *
 * @param {string[]} selectedBenchmarks - Benchmark keys to display. An empty
 *   array replaces the table body with a "no benchmarks selected" message.
 */
function filterByBenchmarks(selectedBenchmarks) {
  if (selectedBenchmarks.length === 0) {
    // Show empty state or all hidden
    document.getElementById('tableBody').innerHTML = '<tr><td colspan="16" style="text-align:center;padding:40px;color:var(--text-muted);">No benchmarks selected. Please select at least one benchmark.</td></tr>';
    return;
  }

  // Populate table with selected benchmarks
  populateTableWithBenchmarks(selectedBenchmarks);

  // Benchmark key -> header index (column 0 is the model column).
  const headers = document.querySelectorAll('thead th');
  const benchmarkMapping = {
    'gsm8k': 1,
    'mmluPro': 2,
    'gpqa': 3,
    'hle': 4,
    'olmOcr': 5,
    'sweVerified': 6,
    'arguana': 7,
    'swePro': 8,
    'aime2026': 9,
    'terminalBench': 10,
    'evasionBench': 11,
    'hmmt2026': 12
  };

  // Single pass: each benchmark column is either shown or hidden.
  Object.entries(benchmarkMapping).forEach(([benchKey, colIndex]) => {
    const display = selectedBenchmarks.includes(benchKey) ? '' : 'none';
    const th = headers[colIndex];
    if (th) th.style.display = display;
    document.querySelectorAll(`tbody td:nth-child(${colIndex + 1})`).forEach(td => {
      td.style.display = display;
    });
  });

  // The rows were just recreated, so an active search term / size range
  // would otherwise be silently dropped. Only re-apply when the filter
  // controls are present.
  const controlsPresent = ['searchBox', 'minSize', 'maxSize']
    .every(id => document.getElementById(id));
  if (controlsPresent) {
    applyAllFilters();
  }
}
/**
 * Create a fixed-position clone of the leaderboard's <thead> that becomes
 * visible once the real header has scrolled above the viewport.
 *
 * The clone is taken once, so on every reposition it is re-synchronised
 * with the live header: column widths, column visibility (columns are
 * shown/hidden through inline `display`), and the sort indicator
 * (`.sa` text and header colour).
 */
function initStickyHeader() {
  const table = document.getElementById('leaderboardTable');
  const thead = table.querySelector('thead');
  const tableWrapper = document.querySelector('.tw');

  // Clone the header for sticky version
  const stickyHeader = thead.cloneNode(true);
  stickyHeader.id = 'stickyHeader';
  stickyHeader.style.cssText = 'position: fixed; top: 0; z-index: 1000; visibility: hidden; background: var(--surface-alt); box-shadow: 0 2px 8px rgba(0,0,0,0.15); display: table; table-layout: fixed;';

  // Match the geometry and per-column state of the live header
  const updateHeaderPosition = () => {
    const tableRect = tableWrapper.getBoundingClientRect();
    const originalCells = thead.querySelectorAll('th');
    const stickyCells = stickyHeader.querySelectorAll('th');

    // Position sticky header to match table wrapper
    stickyHeader.style.left = tableRect.left + 'px';
    stickyHeader.style.width = tableRect.width + 'px';

    originalCells.forEach((cell, i) => {
      const twin = stickyCells[i];
      if (!twin) return;

      // Mirror visibility first: a hidden column reports offsetWidth 0.
      twin.style.display = cell.style.display;
      twin.style.width = cell.offsetWidth + 'px';

      // Mirror the sort indicator state.
      twin.style.color = cell.style.color;
      const arrow = cell.querySelector('.sa');
      const twinArrow = twin.querySelector('.sa');
      if (arrow && twinArrow) {
        twinArrow.textContent = arrow.textContent;
      }
    });
  };

  document.body.appendChild(stickyHeader);

  // Show/hide sticky header based on scroll
  const handleScroll = () => {
    const tableRect = table.getBoundingClientRect();
    const theadRect = thead.getBoundingClientRect();
    if (theadRect.top < 0 && tableRect.bottom > 50) {
      stickyHeader.style.visibility = 'visible';
      updateHeaderPosition();
    } else {
      stickyHeader.style.visibility = 'hidden';
    }
  };

  // The handler never calls preventDefault, so it can be passive.
  window.addEventListener('scroll', handleScroll, { passive: true });
  window.addEventListener('resize', updateHeaderPosition);
}
/**
 * Page start-up: benchmark filter, size filter and sticky header (in that
 * order), then the saved dark-mode preference, then a deferred default sort.
 */
function init() {
  // Defaults to 9 benchmarks selected (excludes gsm8k, vision, embedding).
  initBenchmarkFilter();
  initSizeFilter();
  initStickyHeader();

  // Restore dark mode if it was stored as the string 'true'.
  const darkPreferred = localStorage.getItem('dark') === 'true';
  if (darkPreferred) {
    document.body.classList.add('dark');
    document.getElementById('darkBtn').textContent = '☀️ Light';
  }

  // Default sort: Terminal Bench 2.0 (column 10), 100 ms after init.
  setTimeout(function applyDefaultSort() {
    sortTable(10);
  }, 100);
}
/**
 * Populate the table with every category except vision and embedding, then
 * hide the columns belonging to those two categories.
 */
function populateTableWithDefaultHidden() {
  const hiddenCategories = ['vision', 'embedding'];
  const shownCategories = ['math', 'knowledge', 'coding', 'agent', 'language'];

  populateTableWithCategories(shownCategories);
  hideColumnsByCategory(hiddenCategories);
}
/**
 * Rebuild the table body with every model that has at least one score in a
 * benchmark belonging to one of `categories`, ordered by the model's
 * original aggregate score (descending), then refresh the header stats.
 *
 * All 12 benchmark cells are always rendered; hiding columns is the
 * caller's job.
 *
 * @param {string[]} categories - Category names (values of BENCHMARK_CATEGORIES).
 */
function populateTableWithCategories(categories) {
  // Cell order must match the table header (column 0 is the model column).
  const COLUMN_ORDER = [
    'gsm8k', 'mmluPro', 'gpqa', 'hle', 'olmOcr', 'sweVerified',
    'arguana', 'swePro', 'aime2026', 'terminalBench', 'evasionBench', 'hmmt2026'
  ];

  // Names, providers and URLs come from the loaded dataset and are inserted
  // through innerHTML, so they are escaped for text and attribute contexts.
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = value => String(value ?? '').replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);

  const tbody = document.getElementById('tableBody');

  // filter() returns a new array, so sorting does not reorder LEADERBOARD_DATA.
  const modelsToShow = LEADERBOARD_DATA.models
    .filter(model => Object.keys(model.benchmarks).some(
      benchKey => categories.includes(BENCHMARK_CATEGORIES[benchKey]) && model.benchmarks[benchKey]
    ))
    .sort((a, b) => b.aggregateScore - a.aggregateScore);

  tbody.innerHTML = '';

  modelsToShow.forEach(model => {
    const name = String(model.name ?? '');
    const provider = String(model.provider ?? '');
    // metadata may be absent (getModelSize tolerates that too).
    const parameters = (model.metadata && model.metadata.parameters) || 'Unknown';

    const row = document.createElement('tr');
    if (model.type === 'open' && model.aggregateScore > 80) {
      row.classList.add('hl');
    }
    row.dataset.type = model.type;
    row.dataset.name = name.toLowerCase();
    row.dataset.provider = provider.toLowerCase();
    row.dataset.size = getModelSize(model) || '';

    const logoMarkup = model.providerLogoUrl
      ? `<img src="${escapeHtml(model.providerLogoUrl)}" alt="${escapeHtml(provider)}" class="provider-logo-inline" title="${escapeHtml(provider)}" onerror="this.style.display='none';">`
      : `<span class="provider-logo-fallback-inline" title="${escapeHtml(provider)}">${escapeHtml(provider.substring(0, 2).toUpperCase())}</span>`;

    const scoreCells = COLUMN_ORDER
      .map(benchKey => renderScore(model.benchmarks[benchKey], benchKey))
      .join('\n');

    row.innerHTML = `
      <td class="c-model">
        <div class="mc">
          <div class="mn">
            ${logoMarkup}
            <a href="https://huggingface.co/${escapeHtml(name)}" target="_blank" rel="noopener noreferrer">${escapeHtml(name)}</a>
          </div>
          <div class="ms">
            <span class="mp">${escapeHtml(provider)}</span>
            <span class="mp">${escapeHtml(parameters)}</span>
          </div>
        </div>
      </td>
      ${scoreCells}
    `;
    tbody.appendChild(row);
  });

  // Update stats based on visible categories
  updateStatsForCategories(categories);
}
/**
 * Refresh the #statModels / #statScores counters for the given categories:
 * the number of models with at least one score in those categories, and the
 * total number of such scores.
 *
 * @param {string[]} categories - Category names to count.
 */
function updateStatsForCategories(categories) {
  const countsAsVisible = (model, benchKey) =>
    categories.includes(BENCHMARK_CATEGORIES[benchKey]) && model.benchmarks[benchKey];

  let modelCount = 0;
  let scoreCount = 0;
  for (const model of LEADERBOARD_DATA.models) {
    const visible = Object.keys(model.benchmarks)
      .filter(benchKey => countsAsVisible(model, benchKey))
      .length;
    if (visible > 0) {
      modelCount += 1;
      scoreCount += visible;
    }
  }

  document.getElementById('statModels').textContent = modelCount;
  document.getElementById('statScores').textContent = scoreCount;
}
/**
 * Rebuild the table body with every model that has a score in at least one
 * of `selectedBenchmarks`, ordered by the model's original aggregate score
 * (descending), then refresh the header stats.
 *
 * All 12 benchmark cells are always rendered; hiding columns is the
 * caller's job.
 *
 * @param {string[]} selectedBenchmarks - Benchmark keys currently selected.
 */
function populateTableWithBenchmarks(selectedBenchmarks) {
  // Cell order must match the table header (column 0 is the model column).
  const COLUMN_ORDER = [
    'gsm8k', 'mmluPro', 'gpqa', 'hle', 'olmOcr', 'sweVerified',
    'arguana', 'swePro', 'aime2026', 'terminalBench', 'evasionBench', 'hmmt2026'
  ];

  // Names, providers and URLs come from the loaded dataset and are inserted
  // through innerHTML, so they are escaped for text and attribute contexts.
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = value => String(value ?? '').replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);

  const tbody = document.getElementById('tableBody');

  // filter() returns a new array, so sorting does not reorder LEADERBOARD_DATA.
  const modelsToShow = LEADERBOARD_DATA.models
    .filter(model => selectedBenchmarks.some(benchKey => model.benchmarks[benchKey]))
    .sort((a, b) => b.aggregateScore - a.aggregateScore);

  tbody.innerHTML = '';

  modelsToShow.forEach(model => {
    const name = String(model.name ?? '');
    const provider = String(model.provider ?? '');
    // metadata may be absent (getModelSize tolerates that too).
    const parameters = (model.metadata && model.metadata.parameters) || 'Unknown';

    const row = document.createElement('tr');
    if (model.type === 'open' && model.aggregateScore > 80) {
      row.classList.add('hl');
    }
    row.dataset.type = model.type;
    row.dataset.name = name.toLowerCase();
    row.dataset.provider = provider.toLowerCase();
    row.dataset.size = getModelSize(model) || '';

    const logoMarkup = model.providerLogoUrl
      ? `<img src="${escapeHtml(model.providerLogoUrl)}" alt="${escapeHtml(provider)}" class="provider-logo-inline" title="${escapeHtml(provider)}" onerror="this.style.display='none';">`
      : `<span class="provider-logo-fallback-inline" title="${escapeHtml(provider)}">${escapeHtml(provider.substring(0, 2).toUpperCase())}</span>`;

    const scoreCells = COLUMN_ORDER
      .map(benchKey => renderScore(model.benchmarks[benchKey], benchKey))
      .join('\n');

    row.innerHTML = `
      <td class="c-model">
        <div class="mc">
          <div class="mn">
            ${logoMarkup}
            <a href="https://huggingface.co/${escapeHtml(name)}" target="_blank" rel="noopener noreferrer">${escapeHtml(name)}</a>
          </div>
          <div class="ms">
            <span class="mp">${escapeHtml(provider)}</span>
            <span class="mp">${escapeHtml(parameters)}</span>
          </div>
        </div>
      </td>
      ${scoreCells}
    `;
    tbody.appendChild(row);
  });

  // Update stats
  updateStatsForBenchmarks(selectedBenchmarks);
}
/**
 * Refresh the #statModels / #statScores counters for the given benchmarks:
 * the number of models with at least one score among them, and the total
 * number of such scores.
 *
 * @param {string[]} selectedBenchmarks - Benchmark keys to count.
 */
function updateStatsForBenchmarks(selectedBenchmarks) {
  let modelCount = 0;
  let scoreCount = 0;

  for (const model of LEADERBOARD_DATA.models) {
    const scored = selectedBenchmarks.filter(benchKey => model.benchmarks[benchKey]).length;
    if (scored > 0) {
      modelCount += 1;
      scoreCount += scored;
    }
  }

  document.getElementById('statModels').textContent = modelCount;
  document.getElementById('statScores').textContent = scoreCount;
}
/**
 * Hide every benchmark column whose header carries a `bm-<category>` class
 * for one of the given categories.
 *
 * @param {string[]} categories - Category names whose columns are hidden.
 */
function hideColumnsByCategory(categories) {
  // The table has a single leading non-benchmark column (Model); benchmark
  // columns start at index 1. The previous `index < 4` guard dated from a
  // layout with Provider/Aggregate/Coverage columns and wrongly protected
  // the first three benchmark columns.
  const FIRST_BENCHMARK_COLUMN = 1;

  const headers = document.querySelectorAll('thead th');
  headers.forEach((th, index) => {
    if (index < FIRST_BENCHMARK_COLUMN) return;

    const bmClass = th.className.split(' ').find(cls => cls.startsWith('bm-'));
    if (!bmClass) return; // not a benchmark header

    const benchCategory = bmClass.replace('bm-', '');
    if (categories.includes(benchCategory)) {
      th.style.display = 'none';
      document.querySelectorAll(`tbody td:nth-child(${index + 1})`).forEach(td => {
        td.style.display = 'none';
      });
    }
  });
}
/**
 * Flip the `dark` class on <body>, persist the new state under the `dark`
 * localStorage key, and relabel the toggle button.
 */
function toggleDark() {
  // classList.toggle returns true when the class is present afterwards.
  const isDark = document.body.classList.toggle('dark');
  localStorage.setItem('dark', isDark);

  const button = document.getElementById('darkBtn');
  if (isDark) {
    button.textContent = '☀️ Light';
  } else {
    button.textContent = '🌙 Dark';
  }
}
/**
 * Look up the colour scheme for a score cell.
 *
 * @param {*} score - Accepted for call-site compatibility; not used here.
 * @param {?string} benchmarkKey - Benchmark key; when it maps to a category
 *   that category's colours are used, otherwise the 'aggregate' colours.
 * @returns {{textColor: string, gradient: string}}
 */
function getGrade(score, benchmarkKey = null) {
  const category = (benchmarkKey && BENCHMARK_CATEGORIES[benchmarkKey]) || 'aggregate';
  const palette = CATEGORY_GRADIENTS[category] || CATEGORY_GRADIENTS['aggregate'];
  const { textColor, gradient } = palette;
  return { textColor, gradient };
}
/**
 * Return the badge markup for a confidence level.
 *
 * @param {string} confidence - 'official', 'verified' or 'community'.
 * @returns {string} Badge HTML, or '' when the level has no (truthy) entry.
 */
function getConfidenceBadge(confidence) {
  const badgeFor = (cssSuffix, label) =>
    `<span class="conf-badge conf-${cssSuffix}">${label}</span>`;

  const markupByLevel = {
    official: badgeFor('official', '✓✓ Official'),
    verified: badgeFor('verified', '✓ Verified'),
    community: badgeFor('community', '~ Community'),
  };

  const markup = markupByLevel[confidence];
  return markup ? markup : '';
}
/**
 * Render one benchmark cell (<td>) for a model row.
 *
 * @param {?object} benchmarkData - `{score, source?, date?}`, or a falsy
 *   value when the model has no entry for this benchmark.
 * @param {string} benchmarkKey - Benchmark key, used to pick the text colour.
 * @returns {string} Cell markup. A missing entry, or a score that is not a
 *   finite number (null, undefined, non-numeric text), renders the "—"
 *   placeholder instead of throwing on `toFixed`.
 */
function renderScore(benchmarkData, benchmarkKey) {
  const NOT_AVAILABLE = '<td><div class="sc"><span class="na">—</span></div></td>';
  if (!benchmarkData) {
    return NOT_AVAILABLE;
  }

  // Accept numeric strings such as "85.2"; everything else non-numeric
  // (including null, which parseFloat turns into NaN) is "not available".
  const rawScore = benchmarkData.score;
  const score = typeof rawScore === 'number' ? rawScore : parseFloat(rawScore);
  if (!Number.isFinite(score)) {
    return NOT_AVAILABLE;
  }

  // source/date come from the dataset and land in an attribute value.
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
  const tooltip = `${benchmarkData.source || ''} (${benchmarkData.date || 'unknown date'})`;

  const grade = getGrade(score, benchmarkKey);
  return `
    <td>
      <div class="sc" title="${escapeHtml(tooltip)}">
        <div class="sn" style="color: ${grade.textColor}; font-weight: 700;">${score.toFixed(1)}</div>
      </div>
    </td>
  `;
}
/**
 * Show only rows whose `data-type` equals `type` ('all' shows every row),
 * using the `hidden` class, and highlight the toolbar button that was
 * clicked.
 *
 * @param {string} type - Model type to keep, or 'all'.
 */
function filterType(type) {
  currentFilter = type;

  // Update button states
  document.querySelectorAll('.toolbar .fb').forEach(btn => {
    btn.classList.remove('on');
  });

  // The handler is not passed an event object, so the (deprecated) global
  // window.event is the only way to find the clicked button. It is absent
  // when this function is called from code, which previously threw.
  const trigger = window.event && window.event.target;
  if (trigger && trigger.classList) {
    // The click may have landed on a child of the button.
    const button = (trigger.closest && trigger.closest('.fb')) || trigger;
    button.classList.add('on');
  }

  // Filter rows
  document.querySelectorAll('#tableBody tr').forEach(row => {
    const hide = type !== 'all' && row.dataset.type !== type;
    row.classList.toggle('hidden', hide);
  });
}
/**
 * Named entry point kept for existing callers; all filtering (search text
 * and size range) lives in applyAllFilters.
 */
function filterModels() {
  applyAllFilters();
}
/**
 * Restrict the table to one benchmark category (or every category for
 * 'all'): rebuilds the rows, highlights the clicked toolbar button, and
 * shows only the columns whose header has a matching `bm-<category>` class.
 *
 * @param {string} category - Category name, or 'all'.
 */
function filterCategory(category) {
  // The table has a single leading non-benchmark column (Model); benchmark
  // columns start at index 1. The previous `index < 4` guard wrongly kept
  // the first three benchmark columns visible for every category.
  const FIRST_BENCHMARK_COLUMN = 1;

  currentCategory = category;

  // Update button states
  document.querySelectorAll('.toolbar button[onclick^="filterCategory"]').forEach(btn => {
    btn.classList.remove('on');
  });

  // No event object is passed in, so fall back to the (deprecated) global
  // window.event; it is absent when this is called from code.
  const trigger = window.event && window.event.target;
  if (trigger && trigger.classList) {
    const button = (trigger.closest && trigger.closest('button')) || trigger;
    button.classList.add('on');
  }

  // Determine which categories to show
  const categoriesToShow = category === 'all'
    ? [...new Set(Object.values(BENCHMARK_CATEGORIES))]
    : [category];

  // Repopulate table for the chosen categories
  populateTableWithCategories(categoriesToShow);

  // One pass over the headers: each benchmark column is shown or hidden.
  document.querySelectorAll('thead th').forEach((th, index) => {
    if (index < FIRST_BENCHMARK_COLUMN) return;

    const bmClass = th.className.split(' ').find(cls => cls.startsWith('bm-'));
    // Headers without a bm- class are always shown.
    const visible = !bmClass || categoriesToShow.includes(bmClass.replace('bm-', ''));
    const display = visible ? '' : 'none';

    th.style.display = display;
    document.querySelectorAll(`tbody td:nth-child(${index + 1})`).forEach(td => {
      td.style.display = display;
    });
  });
}
// Sort state shared between sortTable and updateSortIndicators.
let currentSortColumn = null;
let currentSortDirection = 'desc';

/**
 * Sort the leaderboard rows by the given column and update the header
 * indicators. Sorting the same column again flips the direction; a new
 * column always starts descending.
 *
 * @param {number} colIndex - 0 sorts by model name (`data-name`); any other
 *   index sorts numerically by that cell's text. Cells that do not parse as
 *   a number go last in both directions.
 */
function sortTable(colIndex) {
  const body = document.getElementById('leaderboardTable').querySelector('tbody');
  const rowList = Array.from(body.querySelectorAll('tr'));

  const repeatClick = currentSortColumn === colIndex;
  currentSortDirection = repeatClick && currentSortDirection === 'desc' ? 'asc' : 'desc';
  currentSortColumn = colIndex;

  const descending = currentSortDirection === 'desc';

  const compareNames = (rowA, rowB) => {
    const nameA = rowA.dataset.name || '';
    const nameB = rowB.dataset.name || '';
    return descending ? nameB.localeCompare(nameA) : nameA.localeCompare(nameB);
  };

  // A missing cell counts as -1; an unparsable cell ("—") yields NaN.
  const scoreOf = row => {
    const cell = row.cells[colIndex];
    return cell ? parseFloat(cell.textContent.trim()) : -1;
  };

  const compareScores = (rowA, rowB) => {
    const scoreA = scoreOf(rowA);
    const scoreB = scoreOf(rowB);
    const missingA = isNaN(scoreA);
    const missingB = isNaN(scoreB);
    if (missingA && missingB) return 0;
    if (missingA) return 1;
    if (missingB) return -1;
    return descending ? scoreB - scoreA : scoreA - scoreB;
  };

  rowList.sort(colIndex === 0 ? compareNames : compareScores);

  // Re-appending an attached row moves it, giving the sorted order.
  for (const row of rowList) {
    body.appendChild(row);
  }

  updateSortIndicators(colIndex);
}
/**
 * Update the sort arrows (`.sa`) and header colour so that only the sorted
 * column is marked.
 *
 * Indices are counted per <thead>, not across the whole document: the
 * sticky header is a second <thead> (a clone appended to <body>), and with
 * a single flat index its cells would start at 13 and never match
 * `colIndex`.
 *
 * @param {number} colIndex - Index of the column currently sorted.
 */
function updateSortIndicators(colIndex) {
  const activeArrow = currentSortDirection === 'desc' ? '↓' : '↑';

  document.querySelectorAll('thead').forEach(thead => {
    thead.querySelectorAll('th').forEach((th, index) => {
      const sortArrow = th.querySelector('.sa');
      if (!sortArrow) return; // header is not sortable

      const isActive = index === colIndex;
      sortArrow.textContent = isActive ? activeArrow : '↕';
      th.style.color = isActive ? 'var(--ac)' : '';
    });
  });
}
// Bootstrap: once the DOM is parsed, start loading the leaderboard data.
// No authentication is needed for this; the dataset is public.
window.addEventListener('DOMContentLoaded', function onDomReady() {
  loadLeaderboardData();
});
</script>
<script type="module">
import { oauthLoginUrl, oauthHandleRedirectIfPresent } from "@huggingface/hub";
// Optional Hugging Face sign-in. The leaderboard data is public, so any
// failure here must leave the page usable with the sign-in button shown.
console.log("Initializing OAuth...");

// 1. Restore a cached session, discarding anything unparsable.
let oauthResult = null;
try {
  oauthResult = JSON.parse(localStorage.getItem("oauth") || "null");
} catch {
  oauthResult = null;
}

// 2. Drop a cached session whose token has expired.
// NOTE(review): assumes the stored result carries `accessTokenExpiresAt`
// (serialised as a date string); when the field is absent this is a no-op.
if (oauthResult?.accessTokenExpiresAt
    && new Date(oauthResult.accessTokenExpiresAt).getTime() <= Date.now()) {
  localStorage.removeItem("oauth");
  oauthResult = null;
}

// 3. Otherwise complete a pending OAuth redirect, if there is one.
if (!oauthResult) {
  try {
    oauthResult = await oauthHandleRedirectIfPresent();
  } catch (err) {
    // Treat a failed redirect as "not logged in" rather than aborting the
    // module with the sign-in button still hidden.
    console.error("OAuth redirect handling failed:", err);
    oauthResult = null;
  }
}

if (oauthResult) {
  // User is logged in
  localStorage.setItem("oauth", JSON.stringify(oauthResult));

  // Show user info
  document.getElementById("oauthUser").style.display = "flex";
  document.getElementById("oauthAvatar").src = oauthResult.userInfo?.avatarUrl || "";
  document.getElementById("oauthUsername").textContent = oauthResult.userInfo?.name || "User";

  // Setup signout: forget the session and reload without query parameters.
  // A single navigation replaces the former assign-then-reload pair.
  document.getElementById("oauthSignout").onclick = function() {
    localStorage.removeItem("oauth");
    window.location.replace(window.location.origin + window.location.pathname);
  };

  // Store token globally for API calls (optional - dataset is public)
  window.HF_TOKEN = oauthResult.accessToken;

  // Log the user name only; the full result contains the access token.
  console.log("User logged in:", oauthResult.userInfo?.name);
} else {
  // User is not logged in
  const signinButton = document.getElementById("oauthSignin");
  signinButton.style.display = "inline-block";
  signinButton.onclick = async function() {
    const scopes = window.huggingface?.variables?.OAUTH_SCOPES || "openid profile email read-repos gated-repos";
    window.location.href = (await oauthLoginUrl({scopes: scopes})) + "&prompt=consent";
  };
}
</script>
</body>
</html>