Spaces:
Running
Running
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Official Benchmarks Leaderboard 2026 — 12 Hugging Face Benchmarks</title> | |
| <meta name="description" content="Unified leaderboard for 12 official Hugging Face benchmarks. Compare AI models across GSM8K, MMLU-Pro, GPQA, HLE, and more."> | |
| <meta name="keywords" content="AI benchmark, HuggingFace benchmarks, GSM8K, MMLU-Pro, GPQA, HLE, SWE-bench, leaderboard, AI evaluation"> | |
| <meta name="author" content="Benchmarks Team"> | |
| <meta name="robots" content="index, follow"> | |
| <link href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:ital,wght@0,200;0,300;0,400;0,600;0,700;1,200;1,300;1,400;1,600;1,700&family=IBM+Plex+Mono:wght@400;600;700&display=swap" rel="stylesheet"> | |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.1/chart.umd.min.js"></script> | |
| <script src="https://unpkg.com/es-module-shims@1.7.0/dist/es-module-shims.js"></script> | |
| <script type="importmap"> | |
| { | |
| "imports": { | |
| "@huggingface/hub": "https://cdn.jsdelivr.net/npm/@huggingface/hub@0.21.0/+esm" | |
| } | |
| } | |
| </script> | |
| <style> | |
| *{margin:0;padding:0;box-sizing:border-box;} | |
| :root{ | |
| --bg:#f9fafb;--bg2:#f3f4f6;--surface:#ffffff;--surface-alt:#f9fafb; | |
| --border:#e5e7eb;--border-hover:#d1d5db; | |
| --shadow-sm:0 1px 3px rgba(15,23,42,.04),0 1px 2px rgba(15,23,42,.06); | |
| --shadow:0 4px 16px rgba(15,23,42,.06),0 1px 3px rgba(15,23,42,.08); | |
| --shadow-lg:0 12px 40px rgba(15,23,42,.08),0 4px 12px rgba(15,23,42,.06); | |
| --text:#111827;--text-sec:#6b7280;--text-muted:#9ca3af; | |
| --ac:#6366f1;--ac2:#4f46e5;--ac-bg:rgba(99,102,241,.06); | |
| --teal:#0d9488;--amber:#d97706;--green:#16a34a;--rose:#e11d48;--purple:#7c3aed; | |
| --radius:16px;--radius-sm:10px;--radius-xs:6px; | |
| --font:'Source Sans Pro',sans-serif;--font-mono:'IBM Plex Mono',monospace; | |
| --tr:0.22s cubic-bezier(0.4,0,0.2,1); | |
| } | |
| html{scroll-behavior:smooth;} | |
| body{font-family:var(--font);background:var(--bg);color:var(--text);min-height:100vh;-webkit-font-smoothing:antialiased;font-size:13px;} | |
| ::-webkit-scrollbar{width:5px;height:4px;} | |
| ::-webkit-scrollbar-track{background:transparent;} | |
| ::-webkit-scrollbar-thumb{background:rgba(99,102,241,.2);border-radius:10px;} | |
| ::-webkit-scrollbar-thumb:hover{background:rgba(99,102,241,.4);} | |
| ::selection{background:rgba(99,102,241,.12);} | |
| body::before{content:"";position:fixed;inset:0;z-index:0;pointer-events:none; | |
| background:radial-gradient(ellipse 70% 45% at 15% 8%,rgba(99,102,241,.05),transparent 55%), | |
| radial-gradient(ellipse 55% 35% at 85% 92%,rgba(13,148,136,.04),transparent 50%);} | |
| .wrap{position:relative;z-index:1;max-width:1600px;margin:0 auto;padding:22px 12px 70px;} | |
| /* HEADER */ | |
| header{text-align:center;margin-bottom:20px;animation:fadeIn .6s ease-out;} | |
| @keyframes fadeIn{from{opacity:0;transform:translateY(-10px)}to{opacity:1;transform:translateY(0)}} | |
| .badge-row{display:flex;align-items:center;justify-content:center;gap:8px;margin-bottom:10px;flex-wrap:wrap;} | |
| .badge{display:inline-flex;align-items:center;gap:6px;background:var(--surface);border:1px solid var(--border);border-radius:100px;padding:4px 14px;font-family:var(--font-mono);font-size:9px;font-weight:600;letter-spacing:2px;text-transform:uppercase;color:var(--ac);box-shadow:var(--shadow-sm);} | |
| .pulse{width:5px;height:5px;border-radius:50%;background:var(--ac);animation:p 2s infinite;} | |
| @keyframes p{0%,100%{opacity:1;transform:scale(1)}50%{opacity:.4;transform:scale(.8)}} | |
| h1{font-size:clamp(20px,3vw,36px);font-weight:800;line-height:1.1;letter-spacing:-1.5px;margin-bottom:8px; | |
| background:linear-gradient(135deg,#1e1b4b 15%,#6366f1 50%,#0d9488 85%);background-size:200%; | |
| -webkit-background-clip:text;-webkit-text-fill-color:transparent;animation:shimmer 6s ease-in-out infinite;} | |
| @keyframes shimmer{0%,100%{background-position:0%}50%{background-position:100%}} | |
| .sub{color:var(--text-muted);font-size:11px;line-height:1.8;max-width:800px;margin:0 auto 12px;} | |
| .sub b{color:var(--text-sec);font-weight:600;-webkit-text-fill-color:var(--text-sec);} | |
| /* STATS */ | |
| .stats{display:flex;flex-wrap:wrap;gap:7px;justify-content:center;margin-bottom:16px;} | |
| .st{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius-sm);padding:10px 16px;text-align:center;min-width:90px;box-shadow:var(--shadow-sm);transition:var(--tr);} | |
| .st:hover{box-shadow:var(--shadow);border-color:var(--border-hover);} | |
| .stn{font-family:var(--font-mono);font-size:18px;font-weight:700;color:var(--ac);} | |
| .stl{font-size:9px;color:var(--text-muted);margin-top:2px;text-transform:uppercase;letter-spacing:.5px;} | |
| /* TOOLBAR */ | |
| .toolbar{display:flex;flex-wrap:wrap;gap:8px;margin-bottom:12px;align-items:center;background:var(--surface);padding:12px;border-radius:var(--radius-sm);border:1px solid var(--border);box-shadow:var(--shadow-sm);} | |
| .search-wrap{position:relative;flex:1;min-width:200px;max-width:300px;} | |
| .search-wrap input{width:100%;padding:8px 10px 8px 32px;border:1px solid var(--border);border-radius:20px;background:var(--surface-alt);font-family:var(--font-mono);font-size:11px;color:var(--text);outline:none;transition:var(--tr);} | |
| .search-wrap input:focus{border-color:var(--ac);box-shadow:0 0 0 2px rgba(99,102,241,.1);background:var(--surface);} | |
| .search-wrap::before{content:"🔍";position:absolute;left:10px;top:50%;transform:translateY(-50%);font-size:14px;pointer-events:none;} | |
| .flbl{font-size:9px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:1px;font-weight:600;} | |
| .fb{background:var(--surface-alt);border:1px solid var(--border);color:var(--text-sec);padding:6px 14px;border-radius:20px;font-size:10px;font-weight:600;cursor:pointer;transition:var(--tr);box-shadow:var(--shadow-sm);font-family:var(--font);} | |
| .fb:hover{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);} | |
| .fb.on{background:linear-gradient(135deg,#6366f1,#4f46e5);border-color:transparent;color:#fff;box-shadow:0 3px 12px rgba(99,102,241,.25);} | |
| /* SIZE FILTER */ | |
| .size-filter-wrap{display:flex;flex-direction:column;gap:4px;padding:8px 16px;border-left:1px solid var(--border);border-right:1px solid var(--border);min-width:220px;} | |
| .size-filter-label{font-size:9px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:0.5px;font-weight:600;} | |
| .range-slider-container{position:relative;height:32px;display:flex;align-items:center;} | |
| .range-slider-track{position:absolute;width:100%;height:4px;background:var(--surface-alt);border-radius:2px;border:1px solid var(--border);} | |
| .range-slider-fill{position:absolute;height:4px;background:linear-gradient(90deg,#6366f1,#4f46e5);border-radius:2px;transition:all 0.1s ease;} | |
| .range-slider-input{position:absolute;width:100%;height:32px;-webkit-appearance:none;appearance:none;background:transparent;pointer-events:none;margin:0;} | |
| .range-slider-input::-webkit-slider-thumb{-webkit-appearance:none;appearance:none;width:12px;height:12px;border-radius:50%;background:white;border:2px solid #6366f1;cursor:pointer;pointer-events:all;box-shadow:0 2px 4px rgba(0,0,0,0.1);transition:all 0.2s ease;} | |
| .range-slider-input::-webkit-slider-thumb:hover{transform:scale(1.2);box-shadow:0 2px 8px rgba(99,102,241,0.3);} | |
| .range-slider-input::-webkit-slider-thumb:active{transform:scale(1.1);box-shadow:0 2px 12px rgba(99,102,241,0.4);} | |
| .range-slider-input::-moz-range-thumb{width:12px;height:12px;border-radius:50%;background:white;border:2px solid #6366f1;cursor:pointer;pointer-events:all;box-shadow:0 2px 4px rgba(0,0,0,0.1);transition:all 0.2s ease;} | |
| .range-slider-input::-moz-range-thumb:hover{transform:scale(1.2);box-shadow:0 2px 8px rgba(99,102,241,0.3);} | |
| .range-slider-input::-moz-range-thumb:active{transform:scale(1.1);box-shadow:0 2px 12px rgba(99,102,241,0.4);} | |
| .range-values{display:flex;justify-content:center;align-items:center;gap:4px;font-size:10px;font-family:var(--font-mono);color:var(--text-sec);font-weight:600;} | |
| .range-values span:first-child,.range-values span:last-child{color:var(--ac);font-weight:700;} | |
| /* BENCHMARK FILTER BAR */ | |
| .benchmark-filter-bar{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius-sm);padding:12px;margin-bottom:12px;box-shadow:var(--shadow-sm);} | |
| .benchmark-filter-wrapper{display:flex;flex-wrap:nowrap;gap:10px;align-items:center;justify-content:space-between;} | |
| .benchmark-category{display:flex;align-items:center;gap:6px;padding:6px 12px 6px 8px;background:var(--bg2);border-radius:12px;border:1px solid var(--border);flex:1;min-width:0;} | |
| .category-icon{font-size:14px;line-height:1;flex-shrink:0;} | |
| .category-label{font-size:9px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:0.5px;font-weight:600;white-space:nowrap;flex-shrink:0;} | |
| .benchmark-pills{display:flex;gap:4px;flex-wrap:wrap;flex:1;min-width:0;} | |
| .benchmark-pill{background:var(--surface);border:1px solid var(--border);color:var(--text-sec);padding:4px 10px;border-radius:16px;font-size:9px;font-weight:600;cursor:pointer;transition:all 0.2s cubic-bezier(0.4,0,0.2,1);font-family:var(--font);white-space:nowrap;box-shadow:var(--shadow-sm);opacity:0.6;} | |
| .benchmark-pill:hover{border-color:var(--border-hover);transform:translateY(-1px);box-shadow:0 2px 8px rgba(15,23,42,.08);} | |
| .benchmark-pill.active{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);opacity:1;box-shadow:0 2px 8px rgba(99,102,241,.12);} | |
| .benchmark-pill.active:hover{background:rgba(99,102,241,.12);border-color:var(--ac);} | |
| .filter-actions{display:flex;gap:6px;flex-shrink:0;padding-left:12px;border-left:2px solid var(--border);} | |
| .filter-action-btn{background:var(--surface-alt);border:1px solid var(--border);color:var(--text-sec);padding:4px 10px;border-radius:16px;font-size:9px;font-weight:600;cursor:pointer;transition:var(--tr);font-family:var(--font);white-space:nowrap;box-shadow:var(--shadow-sm);} | |
| .filter-action-btn:hover{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);} | |
| /* TABLE */ | |
| .tw{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);overflow-x:auto;box-shadow:var(--shadow);margin-bottom:20px;} | |
| table{width:100%;border-collapse:collapse;font-size:11px;} | |
| thead{background:var(--surface-alt);position:sticky;top:0;z-index:100;box-shadow:0 2px 4px rgba(0,0,0,0.1);} | |
| thead tr{border-bottom:2px solid var(--border);} | |
| th{padding:12px 8px;text-align:center;font-size:11px;font-family:var(--font-mono);text-transform:uppercase;letter-spacing:.5px;color:var(--text-muted);white-space:nowrap;cursor:pointer;user-select:none;vertical-align:bottom;line-height:1.6;font-weight:700;transition:var(--tr);} | |
| th.c-model{text-align:left;padding-left:14px;min-width:180px;position:sticky;left:0;background:var(--surface-alt);z-index:101;} | |
| th:hover{color:var(--ac);background:rgba(99,102,241,.08);transform:translateY(-1px);} | |
| th.sorted{color:var(--ac);font-weight:800;} | |
| .sa{opacity:.6;font-size:7px;margin-left:3px;} | |
| th a{color:inherit;text-decoration:none;} | |
| th a:hover{color:var(--ac);text-decoration:underline;} | |
| tbody tr{border-bottom:1px solid var(--border);transition:background var(--tr);} | |
| tbody tr:last-child{border-bottom:none;} | |
| tbody tr:hover{background:rgba(99,102,241,.025);} | |
| tbody tr.hl{background:transparent;} | |
| tbody tr.hl:hover{background:var(--hover);} | |
| tbody tr.hidden{display:none;} | |
| td{padding:10px 6px;text-align:center;vertical-align:middle;} | |
| td.c-model{text-align:left;padding-left:14px;position:sticky;left:0;background:var(--surface);z-index:9;border-right:1px solid var(--border);} | |
| tbody tr:hover td.c-model{background:rgba(99,102,241,.025);} | |
| tbody tr.hl td.c-model{background:var(--surface);} | |
| /* MODEL CELL */ | |
| .mc{display:flex;flex-direction:column;gap:2px;} | |
| .mn{font-weight:700;font-size:12px;color:var(--text);display:flex;align-items:center;gap:5px;flex-wrap:wrap;} | |
| .mn a{color:var(--text);text-decoration:none;transition:var(--tr);position:relative;} | |
| .mn a:hover{color:var(--ac);text-decoration:none;} | |
| .mn a::after{content:'';position:absolute;bottom:-2px;left:0;width:0;height:1px;background:var(--ac);transition:width 0.3s ease;} | |
| .mn a:hover::after{width:100%;} | |
| .ms{display:flex;gap:4px;align-items:center;margin-top:2px;} | |
| .mp{font-size:8px;color:var(--text-muted);font-family:var(--font-mono);} | |
| .badge-type{font-size:7px;padding:2px 6px;border-radius:4px;font-family:var(--font-mono);font-weight:700;text-transform:uppercase;} | |
| .badge-open{background:rgba(22,163,74,.1);color:#16a34a;border:1px solid rgba(22,163,74,.2);} | |
| .badge-closed{background:rgba(100,116,139,.1);color:#64748b;border:1px solid rgba(100,116,139,.2);} | |
| /* PROVIDER LOGO */ | |
| .provider-logo{width:20px;height:20px;border-radius:50%;object-fit:cover;border:1px solid var(--border);box-shadow:var(--shadow-sm);transition:var(--tr);} | |
| .provider-logo:hover{transform:scale(1.1);box-shadow:var(--shadow);} | |
| .provider-logo-fallback{width:20px;height:20px;border-radius:50%;background:var(--ac-bg);border:1px solid var(--border);display:inline-flex;align-items:center;justify-content:center;font-size:9px;font-weight:700;color:var(--ac);font-family:var(--font-mono);} | |
| .provider-logo-inline{width:16px;height:16px;border-radius:50%;object-fit:cover;border:1px solid var(--border);box-shadow:var(--shadow-sm);margin-right:6px;vertical-align:middle;display:inline-block;} | |
| .provider-logo-fallback-inline{width:16px;height:16px;border-radius:50%;background:var(--ac-bg);border:1px solid var(--border);display:inline-flex;align-items:center;justify-content:center;font-size:8px;font-weight:700;color:var(--ac);font-family:var(--font-mono);margin-right:6px;vertical-align:middle;} | |
| /* SCORE CELL */ | |
| .sc{display:flex;flex-direction:column;align-items:center;gap:2px;} | |
| .sn{font-family:var(--font-mono);font-size:11px;font-weight:700;} | |
| .sb{width:40px;height:3px;background:var(--border);border-radius:2px;overflow:hidden;margin-top:2px;} | |
| .sf{height:100%;border-radius:2px;transition:width .8s cubic-bezier(0.4,0,0.2,1);} | |
| .na{color:var(--text-muted);font-size:9px;font-family:var(--font-mono);} | |
| .conf-badge{font-size:6px;padding:1px 4px;border-radius:3px;font-family:var(--font-mono);font-weight:700;margin-top:2px;} | |
| .conf-official{background:rgba(22,163,74,.1);color:#16a34a;border:1px solid rgba(22,163,74,.2);} | |
| .conf-verified{background:rgba(59,130,246,.1);color:#3b82f6;border:1px solid rgba(59,130,246,.2);} | |
| .conf-community{background:rgba(217,119,6,.1);color:#d97706;border:1px solid rgba(217,119,6,.2);} | |
| /* COLOR GRADES */ | |
| .grade-s{color:#6366f1;font-weight:700;} | |
| .grade-a{color:#0d9488;font-weight:700;} | |
| .grade-b{color:#d97706;font-weight:700;} | |
| .grade-c{color:#e11d48;font-weight:600;} | |
| .bar-s{background:linear-gradient(90deg,#6366f1,#818cf8);} | |
| .bar-a{background:linear-gradient(90deg,#0d9488,#14b8a6);} | |
| .bar-b{background:linear-gradient(90deg,#d97706,#f59e0b);} | |
| .bar-c{background:linear-gradient(90deg,#e11d48,#f43f5e);} | |
| /* BENCHMARK COLORS */ | |
| .bm-math{color:#d97706;} | |
| .bm-knowledge{color:#6366f1;} | |
| .bm-coding{color:#0d9488;} | |
| .bm-vision{color:#16a34a;} | |
| .bm-embedding{color:#7c3aed;} | |
| .bm-language{color:#e11d48;} | |
| .bm-agent{color:#0d9488;} | |
| /* DARK MODE */ | |
| body.dark{--bg:#0f172a;--bg2:#1e293b;--surface:#1e293b;--surface-alt:#334155; | |
| --border:#334155;--border-hover:#475569;--text:#f1f5f9;--text-sec:#cbd5e1;--text-muted:#94a3b8; | |
| --shadow-sm:0 1px 3px rgba(0,0,0,.3);--shadow:0 4px 16px rgba(0,0,0,.3);--shadow-lg:0 12px 40px rgba(0,0,0,.4); | |
| --ac:#818cf8;--ac2:#6366f1;--ac-bg:rgba(129,140,248,.1);} | |
| body.dark::before{background:radial-gradient(ellipse 70% 45% at 15% 8%,rgba(129,140,248,.08),transparent 55%),radial-gradient(ellipse 55% 35% at 85% 92%,rgba(13,148,136,.06),transparent 50%);} | |
| body.dark th.c-model,body.dark td.c-model{background:var(--surface)!important;} | |
| body.dark thead{background:var(--surface-alt);} | |
| body.dark tbody tr:hover td.c-model{background:var(--surface-alt)!important;} | |
| /* MOBILE */ | |
| @media(max-width:768px){ | |
| .wrap{padding:12px 8px 50px;} | |
| h1{font-size:20px!important;} | |
| .toolbar{flex-direction:column;gap:6px;} | |
| .search-wrap{max-width:100%;min-width:100%;} | |
| .benchmark-filter-wrapper{flex-wrap:wrap;gap:8px;} | |
| .benchmark-category{flex:1 1 100%;margin-bottom:4px;} | |
| .filter-actions{flex:1 1 100%;justify-content:center;margin-top:4px;border-left:none;border-top:2px solid var(--border);padding-left:0;padding-top:8px;} | |
| table{font-size:9px;} | |
| th,td{padding:6px 3px;} | |
| th.c-model,td.c-model{min-width:130px!important;} | |
| .mn{font-size:10px!important;} | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <div class="wrap"> | |
| <header> | |
| <div class="badge-row"> | |
| <button id="darkBtn" onclick="toggleDark()" style="background:linear-gradient(135deg,#1e293b,#334155);border:1px solid #475569;border-radius:20px;padding:4px 14px;font-size:10px;font-family:var(--font-mono);color:#e2e8f0;cursor:pointer;font-weight:700;transition:all .2s;box-shadow:0 2px 6px rgba(0,0,0,.2)">🌙 Dark</button> | |
| </div> | |
| <h1>Community Benchmarks Leaderboard</h1> | |
| <p class="sub"> | |
| <b>Unified leaderboard for the official Hugging Face benchmarks.</b> A place to find and compare results coming from the community, model cards, and papers. | |
| </p> | |
| <div class="stats"> | |
| <div class="st"><div class="stn" id="statModels">6</div><div class="stl">Models</div></div> | |
| <div class="st"><div class="stn">12</div><div class="stl">Benchmarks</div></div> | |
| <div class="st"><div class="stn" id="statScores">0</div><div class="stl">Total Scores</div></div> | |
| </div> | |
| </header> | |
| <div class="toolbar"> | |
<!-- Model search box. A placeholder is not an accessible name, so aria-label supplies one. -->
<div class="search-wrap">
  <input type="text" id="searchBox" placeholder="Search models..." aria-label="Search models" oninput="filterModels()">
</div>
| <div class="size-filter-wrap"> | |
| <div class="size-filter-label">Model Size</div> | |
<!-- Dual-thumb size slider: two range inputs positioned over one shared track.
     Each input carries its own aria-label so assistive technology can tell the
     lower bound from the upper bound. -->
<div class="range-slider-container">
  <div class="range-slider-track">
    <div class="range-slider-fill" id="sliderFill"></div>
  </div>
  <input type="range" id="minSize" min="0" max="1100" value="0" step="1" class="range-slider-input" aria-label="Minimum model size">
  <input type="range" id="maxSize" min="0" max="1100" value="1100" step="1" class="range-slider-input" aria-label="Maximum model size">
</div>
| <div class="range-values"> | |
| <span id="minSizeLabel">0B</span> | |
| <span> – </span> | |
| <span id="maxSizeLabel">1100B+</span> | |
| </div> | |
| </div> | |
| <div style="flex: 1"></div> | |
| <button id="oauthSignin" class="fb" style="display: none; background: linear-gradient(135deg, #6366f1, #4f46e5); color: white; border: none;"> | |
| 🔐 Sign in with HF | |
| </button> | |
<!-- Signed-in user chip (avatar, username, sign-out button). It must start hidden:
     the inline style previously declared `display` twice, and the trailing
     `display: flex` overrode `display: none`, so the chip rendered before sign-in.
     NOTE(review): assumes the OAuth script reveals this element by setting
     style.display (e.g. to "flex") - confirm against that script. -->
<div id="oauthUser" style="display: none; font-size: 10px; color: var(--text-sec); font-family: var(--font-mono); align-items: center; gap: 8px;">
  <!-- Decorative avatar: the username is rendered as text right next to it. -->
  <img id="oauthAvatar" src="" alt="" style="width: 24px; height: 24px; border-radius: 50%; border: 1px solid var(--border);">
  <span id="oauthUsername"></span>
  <button type="button" id="oauthSignout" class="fb" style="padding: 4px 10px;">Sign out</button>
</div>
| </div> | |
| <div class="benchmark-filter-bar"> | |
| <div class="benchmark-filter-wrapper"> | |
| <div class="benchmark-category"> | |
| <span class="category-icon">📐</span> | |
| <span class="category-label">Math</span> | |
| <div class="benchmark-pills"> | |
| <button class="benchmark-pill active" data-benchmark="gsm8k" onclick="toggleBenchmark('gsm8k')">GSM8K</button> | |
| <button class="benchmark-pill active" data-benchmark="aime2026" onclick="toggleBenchmark('aime2026')">AIME 2026</button> | |
| <button class="benchmark-pill active" data-benchmark="hmmt2026" onclick="toggleBenchmark('hmmt2026')">HMMT</button> | |
| </div> | |
| </div> | |
| <div class="benchmark-category"> | |
| <span class="category-icon">🧠</span> | |
| <span class="category-label">Knowledge</span> | |
| <div class="benchmark-pills"> | |
| <button class="benchmark-pill active" data-benchmark="mmluPro" onclick="toggleBenchmark('mmluPro')">MMLU-Pro</button> | |
| <button class="benchmark-pill active" data-benchmark="gpqa" onclick="toggleBenchmark('gpqa')">GPQA◆</button> | |
| <button class="benchmark-pill active" data-benchmark="hle" onclick="toggleBenchmark('hle')">HLE</button> | |
| </div> | |
| </div> | |
| <div class="benchmark-category"> | |
| <span class="category-icon">💻</span> | |
| <span class="category-label">Coding</span> | |
| <div class="benchmark-pills"> | |
| <button class="benchmark-pill active" data-benchmark="sweVerified" onclick="toggleBenchmark('sweVerified')">SWE-V</button> | |
| <button class="benchmark-pill active" data-benchmark="swePro" onclick="toggleBenchmark('swePro')">SWE-Pro</button> | |
| </div> | |
| </div> | |
| <div class="benchmark-category"> | |
| <span class="category-icon">🤖</span> | |
| <span class="category-label">Agent</span> | |
| <div class="benchmark-pills"> | |
| <button class="benchmark-pill active" data-benchmark="terminalBench" onclick="toggleBenchmark('terminalBench')">TB 2.0</button> | |
| </div> | |
| </div> | |
| <div class="benchmark-category"> | |
| <span class="category-icon">💬</span> | |
| <span class="category-label">Language</span> | |
| <div class="benchmark-pills"> | |
| <button class="benchmark-pill active" data-benchmark="evasionBench" onclick="toggleBenchmark('evasionBench')">EvasionB</button> | |
| </div> | |
| </div> | |
| <div class="benchmark-category"> | |
| <span class="category-icon">👁️</span> | |
| <span class="category-label">Vision</span> | |
| <div class="benchmark-pills"> | |
| <button class="benchmark-pill" data-benchmark="olmOcr" onclick="toggleBenchmark('olmOcr')">olmOCR</button> | |
| </div> | |
| </div> | |
| <div class="benchmark-category"> | |
| <span class="category-icon">🔍</span> | |
| <span class="category-label">Embedding</span> | |
| <div class="benchmark-pills"> | |
| <button class="benchmark-pill" data-benchmark="arguana" onclick="toggleBenchmark('arguana')">ArguAna</button> | |
| </div> | |
| </div> | |
| <div class="filter-actions"> | |
| <button class="filter-action-btn" onclick="selectAllBenchmarks()">Select All</button> | |
| <button class="filter-action-btn" onclick="deselectAllBenchmarks()">Clear All</button> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="tw"> | |
| <table id="leaderboardTable"> | |
| <thead> | |
| <tr> | |
| <th class="c-model" onclick="sortTable(0)">Model<span class="sa">↕</span></th> | |
| <th onclick="sortTable(1)" class="bm-math" title="Grade School Math 8K - Click to sort">GSM8K <a href="https://huggingface.co/datasets/openai/gsm8k" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(2)" class="bm-knowledge" title="Massive Multi-task Language Understanding Pro - Click to sort">MMLU-Pro <a href="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(3)" class="bm-knowledge" title="PhD-level expert questions - Click to sort">GPQA◆ <a href="https://huggingface.co/datasets/Idavidrein/gpqa" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(4)" class="bm-knowledge" title="Humanity's Last Exam - Click to sort">HLE <a href="https://lastexam.ai" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(5)" class="bm-vision" title="OCR Evaluation Benchmark - Click to sort">olmOCR <a href="https://huggingface.co/datasets/allenai/olmOCR-bench" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(6)" class="bm-coding" title="SWE-bench Verified - Click to sort">SWE-V <a href="https://www.swebench.com" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(7)" class="bm-embedding" title="MTEB Text Retrieval - Click to sort">ArguAna <a href="https://huggingface.co/datasets/mteb/arguana" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(8)" class="bm-coding" title="SWE-bench Pro - Click to sort">SWE-Pro <a href="https://scale.com/leaderboard/swe_bench_pro_public" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(9)" class="bm-math" title="AIME 2026 - Click to sort">AIME'26 <a href="https://matharena.ai/?comp=aime--aime_2026" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(10)" class="bm-agent" title="Terminal-Bench 2.0 - Click to sort">TB 2.0 <a href="https://www.tbench.ai/leaderboard/terminal-bench/2.0" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(11)" class="bm-language" title="EvasionBench - Click to sort">EvasionB <a href="https://huggingface.co/datasets/FutureMa/EvasionBench" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| <th onclick="sortTable(12)" class="bm-math" title="HMMT February 2026 - Click to sort">HMMT Feb'26 <a href="https://matharena.ai/?comp=hmmt--hmmt_feb_2026" target="_blank" onclick="event.stopPropagation()" style="font-size:9px;opacity:0.5;text-decoration:none;">ℹ️</a><span class="sa">↕</span></th> | |
| </tr> | |
| </thead> | |
| <tbody id="tableBody"> | |
| </tbody> | |
| </table> | |
| </div> | |
| </div> | |
| <script> | |
// Source of the leaderboard: a parquet file served by the Hugging Face datasets API.
const DATASET_REPO = 'OpenEvals/leaderboard-data';
const PARQUET_URL = 'https://huggingface.co/api/datasets/' + DATASET_REPO + '/parquet/default/train/0.parquet';
// Holds the leaderboard object once loadLeaderboardData() has assigned it; null until then.
let LEADERBOARD_DATA = null;
| // Transform flat parquet row to nested JSON structure expected by the UI | |
/**
 * Convert one flat parquet row into the nested model object used by the UI.
 *
 * Scalar columns are copied into the top-level and `metadata` fields (with the
 * fallbacks shown below), and every `<benchmark>_score` column that is neither
 * null nor undefined becomes an entry in `benchmarks`.
 *
 * @param {Object} row - Plain object keyed by parquet column name
 *   (model_id, model_name, provider, ..., gsm8k_score, ...).
 * @returns {Object} Model with id, name, provider, type, metadata, benchmarks,
 *   aggregateScore, coverageCount and coveragePercent.
 */
function transformParquetToModel(row) {
    // One key per benchmark column of the leaderboard table. 'arguana' is listed
    // because the table header, the filter pills and BENCHMARK_CATEGORIES all
    // expose an ArguAna column; a row without `arguana_score` is simply skipped.
    const benchmarkKeys = [
        'gsm8k', 'mmluPro', 'gpqa', 'hle', 'olmOcr', 'sweVerified', 'arguana',
        'swePro', 'aime2026', 'terminalBench', 'evasionBench', 'hmmt2026'
    ];

    // Every score is stamped with today's date (YYYY-MM-DD); compute it once.
    const today = new Date().toISOString().split('T')[0];

    const benchmarks = {};
    for (const key of benchmarkKeys) {
        const score = row[`${key}_score`];
        // Explicit null/undefined test so a legitimate score of 0 is kept.
        if (score === null || score === undefined) {
            continue;
        }
        benchmarks[key] = {
            score: score,
            confidence: 'official',
            source: 'API',
            date: today
        };
    }

    return {
        id: row.model_id,
        name: row.model_name,
        provider: row.provider,
        type: row.model_type || 'open',
        metadata: {
            license: row.license || 'Unknown',
            parametersInBillions: row.parameters_billions,
            contextWindow: row.context_window || 0,
            modality: row.modality || 'text',
            architecture: row.architecture || 'Transformer'
        },
        benchmarks: benchmarks,
        aggregateScore: row.aggregate_score,
        coverageCount: row.coverage_count,
        coveragePercent: row.coverage_percent
    };
}
| // Fetch data from public parquet dataset | |
/**
 * Download the leaderboard parquet file, decode it and start the page.
 *
 * Steps: show a loading row in #leaderboardTable, fetch PARQUET_URL, decode the
 * bytes with parquet-wasm, convert the resulting Arrow IPC stream to row objects
 * with apache-arrow, map the rows through transformParquetToModel, store the
 * result in LEADERBOARD_DATA and call init() (defined outside this function).
 *
 * Any failure is logged and replaced by an error row with a Retry button; the
 * returned promise therefore resolves rather than rejects on a load error.
 *
 * @returns {Promise<void>}
 */
async function loadLeaderboardData() {
    const tableBody = document.querySelector('#leaderboardTable tbody');
    try {
        // Show loading state
        if (tableBody) {
            tableBody.innerHTML = '<tr><td colspan="13" style="text-align:center;padding:40px;color:var(--text-muted);">Loading leaderboard data...</td></tr>';
        }
        console.log('Fetching parquet data from:', PARQUET_URL);

        // Fetch parquet file (no auth needed for public dataset)
        const response = await fetch(PARQUET_URL);
        if (!response.ok) {
            throw new Error(`Failed to load data: ${response.status} ${response.statusText}`);
        }
        const arrayBuffer = await response.arrayBuffer();

        // Decode parquet -> Arrow IPC stream with parquet-wasm (the default
        // export initialises the WebAssembly module and must be awaited first).
        const parquetWasm = await import('https://cdn.jsdelivr.net/npm/parquet-wasm@0.6.0/+esm');
        await parquetWasm.default();
        const parquetFile = parquetWasm.readParquet(new Uint8Array(arrayBuffer));
        const ipcStream = parquetFile.intoIPCStream();

        // Arrow IPC stream -> array of plain row objects
        const arrow = await import('https://cdn.jsdelivr.net/npm/apache-arrow@14.0.0/+esm');
        const arrowTable = arrow.tableFromIPC(ipcStream);
        const rows = arrowTable.toArray().map(row => row.toJSON());
        console.log(`Loaded ${rows.length} models from parquet`);

        // Transform flat parquet rows to nested JSON structure
        const models = rows.map(transformParquetToModel);

        // Benchmark definitions (kept for compatibility)
        const benchmarks = {
            gsm8k: { id: 'gsm8k', name: 'GSM8K', shortName: 'GSM8K', category: 'math' },
            mmluPro: { id: 'mmluPro', name: 'MMLU-Pro', shortName: 'MMLU-Pro', category: 'knowledge' },
            gpqa: { id: 'gpqa', name: 'GPQA Diamond', shortName: 'GPQA', category: 'knowledge' },
            hle: { id: 'hle', name: 'HLE', shortName: 'HLE', category: 'knowledge' },
            olmOcr: { id: 'olmOcr', name: 'olmOCR', shortName: 'olmOCR', category: 'vision' },
            sweVerified: { id: 'sweVerified', name: 'SWE-bench Verified', shortName: 'SWE-V', category: 'coding' },
            swePro: { id: 'swePro', name: 'SWE-bench Pro', shortName: 'SWE-Pro', category: 'coding' },
            aime2026: { id: 'aime2026', name: 'AIME 2026', shortName: 'AIME', category: 'math' },
            terminalBench: { id: 'terminalBench', name: 'Terminal-Bench 2.0', shortName: 'TB 2.0', category: 'agent' },
            evasionBench: { id: 'evasionBench', name: 'EvasionBench', shortName: 'EvasionB', category: 'language' },
            hmmt2026: { id: 'hmmt2026', name: 'HMMT Feb 2026', shortName: 'HMMT', category: 'math' }
        };

        LEADERBOARD_DATA = {
            metadata: {
                version: '1.0.0',
                lastUpdated: new Date().toISOString(),
                title: 'Official Benchmarks Leaderboard 2026',
                description: 'Unified leaderboard for 11 official Hugging Face benchmarks',
                totalModels: models.length,
                // Derived from the definitions above so the count cannot drift.
                totalBenchmarks: Object.keys(benchmarks).length
            },
            benchmarks: benchmarks,
            models: models
        };
        console.log('Leaderboard data loaded successfully');

        // Initialize the page after data is loaded
        init();
    } catch (error) {
        console.error('Error loading leaderboard data:', error);
        if (!tableBody) {
            return;
        }
        // Static markup only: the message is inserted afterwards via textContent
        // so that text originating from the server (e.g. the HTTP status text)
        // or from a thrown value can never be parsed as HTML.
        tableBody.innerHTML = `
            <tr>
                <td colspan="13" style="text-align:center;padding:40px;">
                    <div style="color:var(--rose);font-weight:600;margin-bottom:10px;">⚠️ Failed to load leaderboard data</div>
                    <div class="load-error-message" style="color:var(--text-muted);font-size:11px;"></div>
                    <div style="margin-top:15px;">
                        <button onclick="location.reload()" style="background:var(--ac);color:white;border:none;padding:8px 16px;border-radius:8px;cursor:pointer;font-weight:600;">Retry</button>
                    </div>
                </td>
            </tr>
        `;
        const messageBox = tableBody.querySelector('.load-error-message');
        if (messageBox) {
            // Fall back to String(error) for thrown values that are not Error objects.
            messageBox.textContent = (error && error.message) ? error.message : String(error);
        }
    }
}
// Maps each benchmark key to its category name. The category picks the
// colour scheme in CATEGORY_GRADIENTS and drives the category-based filters.
const BENCHMARK_CATEGORIES = {
  // math
  gsm8k: 'math',
  aime2026: 'math',
  hmmt2026: 'math',
  // knowledge
  mmluPro: 'knowledge',
  gpqa: 'knowledge',
  hle: 'knowledge',
  // coding
  sweVerified: 'coding',
  swePro: 'coding',
  // vision
  olmOcr: 'vision',
  // embedding
  arguana: 'embedding',
  // agent
  terminalBench: 'agent',
  // language
  evasionBench: 'language'
};
// Per-category colour scheme: `textColor` is the solid colour used for score
// text, `gradient` is a CSS linear-gradient in the same hue. The 'aggregate'
// entry doubles as the fallback for unknown categories (see getGrade).
const CATEGORY_GRADIENTS = {
  math:      { textColor: '#7c3aed', gradient: 'linear-gradient(90deg, #e9d5ff, #a855f7, #7c3aed)' },
  knowledge: { textColor: '#2563eb', gradient: 'linear-gradient(90deg, #dbeafe, #60a5fa, #2563eb)' },
  coding:    { textColor: '#059669', gradient: 'linear-gradient(90deg, #d1fae5, #34d399, #059669)' },
  agent:     { textColor: '#0d9488', gradient: 'linear-gradient(90deg, #ccfbf1, #5eead4, #0d9488)' },
  language:  { textColor: '#ea580c', gradient: 'linear-gradient(90deg, #fed7aa, #fb923c, #ea580c)' },
  vision:    { textColor: '#db2777', gradient: 'linear-gradient(90deg, #fce7f3, #f472b6, #db2777)' },
  embedding: { textColor: '#d97706', gradient: 'linear-gradient(90deg, #fef3c7, #fbbf24, #d97706)' },
  aggregate: { textColor: '#6366f1', gradient: 'linear-gradient(90deg, #e0e7ff, #818cf8, #6366f1)' }
};
// ===== BENCHMARK FILTER =====
// Benchmarks shown by default. This array is reassigned/mutated by the pill
// toggle and the select-all / deselect-all handlers below.
let selectedBenchmarks = [
  'mmluPro',
  'gpqa',
  'hle',
  'sweVerified',
  'swePro',
  'aime2026',
  'terminalBench',
  'evasionBench',
  'hmmt2026'
];
/**
 * Flip one benchmark in or out of the active selection, sync its pill's
 * `active` class and re-render the table.
 *
 * @param {string} benchmarkKey - Benchmark key as used in the pills'
 *   `data-benchmark` attribute.
 */
function toggleBenchmark(benchmarkKey) {
  const willSelect = !selectedBenchmarks.includes(benchmarkKey);
  if (willSelect) {
    selectedBenchmarks.push(benchmarkKey);
  } else {
    selectedBenchmarks = selectedBenchmarks.filter(key => key !== benchmarkKey);
  }
  // The pill may be absent from the DOM. Guard the lookup so a missing pill
  // cannot throw after the selection changed but before the table is redrawn.
  const pill = document.querySelector(`.benchmark-pill[data-benchmark="${benchmarkKey}"]`);
  if (pill) {
    pill.classList.toggle('active', willSelect);
  }
  // Update table
  filterByBenchmarks(selectedBenchmarks);
}
// Select every benchmark known to BENCHMARK_CATEGORIES, mark all pills as
// active and redraw the table.
function selectAllBenchmarks() {
  selectedBenchmarks = Object.keys(BENCHMARK_CATEGORIES);
  for (const pillEl of document.querySelectorAll('.benchmark-pill')) {
    pillEl.classList.add('active');
  }
  filterByBenchmarks(selectedBenchmarks);
}
// Clear the benchmark selection, mark all pills as inactive and redraw the
// table (which then shows its "no benchmarks selected" message).
function deselectAllBenchmarks() {
  selectedBenchmarks = [];
  for (const pillEl of document.querySelectorAll('.benchmark-pill')) {
    pillEl.classList.remove('active');
  }
  filterByBenchmarks(selectedBenchmarks);
}
// Mark the pills of the initially selected benchmarks as active, then do the
// first table render. Pills that cannot be found are skipped.
function initBenchmarkFilter() {
  for (const key of selectedBenchmarks) {
    const pillEl = document.querySelector(`.benchmark-pill[data-benchmark="${key}"]`);
    if (pillEl !== null) {
      pillEl.classList.add('active');
    }
  }
  filterByBenchmarks(selectedBenchmarks);
}
| // ===== SIZE FILTER FUNCTIONS ===== | |
/**
 * Read a model's size from `metadata.parametersInBillions`.
 *
 * @param {object} model - Model record; `metadata` may be missing.
 * @returns {*} The stored value, or null when metadata or the field is
 *   null/undefined (callers treat null as "size unknown, always show").
 */
function getModelSize(model) {
  return model.metadata?.parametersInBillions ?? null;
}
/**
 * Wire up the two-thumb model-size slider (#minSize / #maxSize): keeps the
 * thumbs at least 1B apart, refreshes the labels and the fill bar, and
 * re-applies the row filters on every drag step.
 */
function initSizeFilter() {
  const SIZE_MAX = 1100; // top of the scale; displayed as "1100B+"
  const MIN_GAP = 1;     // minimum distance between the two thumbs
  const minSlider = document.getElementById('minSize');
  const maxSlider = document.getElementById('maxSize');
  const minLabel = document.getElementById('minSizeLabel');
  const maxLabel = document.getElementById('maxSizeLabel');
  const sliderFill = document.getElementById('sliderFill');

  // `event` is the input event, or undefined for the initial call.
  function updateSlider(event) {
    let minVal = parseInt(minSlider.value, 10);
    let maxVal = parseInt(maxSlider.value, 10);

    // Prevent thumbs from crossing: hold back the thumb being dragged rather
    // than always moving the min thumb. Previously the max-side clamp was
    // unreachable, so dragging max down shoved min along with it.
    if (maxVal - minVal < MIN_GAP) {
      if (event && event.target === maxSlider) {
        maxVal = minVal + MIN_GAP;
      } else {
        minVal = maxVal - MIN_GAP;
      }
      // Keep the pair inside the scale (e.g. max at 0 used to yield min = -1).
      // Assumes the scale starts at 0, as the fill-bar maths below does.
      if (minVal < 0) {
        minVal = 0;
        maxVal = MIN_GAP;
      }
      if (maxVal > SIZE_MAX) {
        maxVal = SIZE_MAX;
        minVal = SIZE_MAX - MIN_GAP;
      }
      minSlider.value = minVal;
      maxSlider.value = maxVal;
    }

    // Update labels
    minLabel.textContent = minVal + 'B';
    maxLabel.textContent = maxVal === SIZE_MAX ? '1100B+' : maxVal + 'B';

    // Update fill bar (percentages of the full scale)
    const minPercent = (minVal / SIZE_MAX) * 100;
    const maxPercent = (maxVal / SIZE_MAX) * 100;
    sliderFill.style.left = minPercent + '%';
    sliderFill.style.width = (maxPercent - minPercent) + '%';

    // Apply all filters
    applyAllFilters();
  }

  // Real-time filtering on input (drag)
  minSlider.addEventListener('input', updateSlider);
  maxSlider.addEventListener('input', updateSlider);

  // Initial update
  updateSlider();
}
/**
 * Apply the search box and the size slider together (AND logic) by toggling
 * `style.display` on every model row in #tableBody.
 *
 * Rows with no usable size (missing, non-numeric or <= 0) always pass the
 * size filter.
 */
function applyAllFilters() {
  const SIZE_MAX = 1100; // slider top; its label reads "1100B+"
  const searchTerm = document.getElementById('searchBox').value.toLowerCase();
  const minSize = parseInt(document.getElementById('minSize').value, 10);
  const maxSize = parseInt(document.getElementById('maxSize').value, 10);
  // With the max thumb at the top the label promises "1100B+", so the upper
  // bound must be open. Otherwise models above 1100B are hidden.
  const noUpperBound = maxSize >= SIZE_MAX;

  document.querySelectorAll('#tableBody tr').forEach(row => {
    // Only model rows carry data-name. Leave message rows (e.g. the
    // "no benchmarks selected" notice) alone instead of hiding them.
    if (row.dataset.name === undefined) return;

    const modelName = row.dataset.name || '';
    const modelSize = parseFloat(row.dataset.size);

    const matchesSearch = !searchTerm || modelName.includes(searchTerm);

    const sizeKnown = !isNaN(modelSize) && modelSize > 0;
    const matchesSize = !sizeKnown ||
      (modelSize >= minSize && (noUpperBound || modelSize <= maxSize));

    row.style.display = (matchesSearch && matchesSize) ? '' : 'none';
  });
}
/**
 * Rebuild the table for the given benchmark selection and show only the
 * matching score columns.
 *
 * @param {string[]} selectedBenchmarks - Benchmark keys to display. An empty
 *   list replaces the table body with a "no benchmarks selected" message.
 */
function filterByBenchmarks(selectedBenchmarks) {
  if (selectedBenchmarks.length === 0) {
    // Show empty state
    document.getElementById('tableBody').innerHTML = '<tr><td colspan="16" style="text-align:center;padding:40px;color:var(--text-muted);">No benchmarks selected. Please select at least one benchmark.</td></tr>';
    return;
  }

  // Populate table with selected benchmarks
  populateTableWithBenchmarks(selectedBenchmarks);

  // Column index of each benchmark in the table (column 0 is the model cell)
  const benchmarkMapping = {
    'gsm8k': 1,
    'mmluPro': 2,
    'gpqa': 3,
    'hle': 4,
    'olmOcr': 5,
    'sweVerified': 6,
    'arguana': 7,
    'swePro': 8,
    'aime2026': 9,
    'terminalBench': 10,
    'evasionBench': 11,
    'hmmt2026': 12
  };

  // One pass: each benchmark column is shown or hidden according to the selection
  const headers = document.querySelectorAll('thead th');
  Object.entries(benchmarkMapping).forEach(([benchKey, colIndex]) => {
    const display = selectedBenchmarks.includes(benchKey) ? '' : 'none';
    const th = headers[colIndex];
    if (th) th.style.display = display;
    document.querySelectorAll(`tbody td:nth-child(${colIndex + 1})`).forEach(td => {
      td.style.display = display;
    });
  });

  // The rows were just recreated, which discards the display state set by the
  // search/size filters. Re-apply them so toggling a benchmark does not
  // silently reset an active filter.
  applyAllFilters();
}
/**
 * Create a fixed-position clone of the leaderboard's <thead> that becomes
 * visible once the real header has scrolled above the viewport.
 *
 * The clone is a separate element, so column visibility, sort arrows and
 * header colours set on the real header do not reach it on their own. They
 * are copied across each time the clone is repositioned; without that the
 * clone keeps showing hidden columns and drifts out of alignment.
 */
function initStickyHeader() {
  const table = document.getElementById('leaderboardTable');
  const thead = table.querySelector('thead');
  const tableWrapper = document.querySelector('.tw');

  // Clone the header for sticky version
  const stickyHeader = thead.cloneNode(true);
  stickyHeader.id = 'stickyHeader';
  stickyHeader.style.cssText = 'position: fixed; top: 0; z-index: 1000; visibility: hidden; background: var(--surface-alt); box-shadow: 0 2px 8px rgba(0,0,0,0.15); display: table; table-layout: fixed;';

  // Match position, width and per-column state of the real header
  const updateHeaderPosition = () => {
    const wrapperRect = tableWrapper.getBoundingClientRect();
    stickyHeader.style.left = wrapperRect.left + 'px';
    stickyHeader.style.width = wrapperRect.width + 'px';

    const stickyCells = stickyHeader.querySelectorAll('th');
    thead.querySelectorAll('th').forEach((cell, i) => {
      const twin = stickyCells[i];
      if (!twin) return;
      // Mirror column visibility and sort highlighting
      twin.style.display = cell.style.display;
      twin.style.color = cell.style.color;
      twin.style.width = cell.offsetWidth + 'px';
      const arrow = cell.querySelector('.sa');
      const twinArrow = twin.querySelector('.sa');
      if (arrow && twinArrow) twinArrow.textContent = arrow.textContent;
    });
  };

  document.body.appendChild(stickyHeader);

  // Show the clone only while the real header is off-screen and the table is
  // still (at least 50px) in view
  const handleScroll = () => {
    const tableRect = table.getBoundingClientRect();
    const theadRect = thead.getBoundingClientRect();
    if (theadRect.top < 0 && tableRect.bottom > 50) {
      stickyHeader.style.visibility = 'visible';
      updateHeaderPosition();
    } else {
      stickyHeader.style.visibility = 'hidden';
    }
  };

  // The handler never calls preventDefault, so it can be passive
  window.addEventListener('scroll', handleScroll, { passive: true });
  window.addEventListener('resize', updateHeaderPosition);
}
// Page start-up, called once the leaderboard data is available: sets up the
// benchmark pills, the size slider and the sticky header, restores the saved
// dark-mode choice and schedules the default sort.
function init() {
  // Default selection is the 9 benchmarks listed in selectedBenchmarks
  initBenchmarkFilter();
  initSizeFilter();
  initStickyHeader();

  // Restore dark mode if it was enabled on a previous visit
  const darkWasEnabled = localStorage.getItem('dark') === 'true';
  if (darkWasEnabled) {
    document.body.classList.add('dark');
    document.getElementById('darkBtn').textContent = '☀️ Light';
  }

  // Default ordering: Terminal-Bench 2.0, which is table column 10
  const defaultSortColumn = 10;
  setTimeout(function applyDefaultSort() {
    sortTable(defaultSortColumn);
  }, 100);
}
// Render the table for every category except vision and embedding, then hide
// the columns belonging to those two categories.
function populateTableWithDefaultHidden() {
  const hiddenCategories = ['vision', 'embedding'];
  populateTableWithCategories(['math', 'knowledge', 'coding', 'agent', 'language']);
  hideColumnsByCategory(hiddenCategories);
}
// Escape a value for interpolation into HTML text or a double-quoted
// attribute. Model names, providers and logo URLs come from the loaded
// dataset and must not be able to inject markup.
function escapeCellHtml(value) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value ?? '').replace(/[&<>"']/g, ch => entities[ch]);
}

// Benchmark keys whose category (per BENCHMARK_CATEGORIES) is in `categories`.
function benchmarkKeysForCategories(categories) {
  return Object.keys(BENCHMARK_CATEGORIES)
    .filter(key => categories.includes(BENCHMARK_CATEGORIES[key]));
}

// How many of `benchmarkKeys` the model has a score entry for.
function countModelScores(model, benchmarkKeys) {
  const scores = model.benchmarks || {};
  return benchmarkKeys.filter(key => scores[key]).length;
}

// Build the <tr> for one model: the identity cell followed by one score cell
// per table column (all twelve columns are always emitted; column hiding is
// done separately by the callers' callers).
function buildModelRow(model) {
  // Must match the column order of the table header
  const columnOrder = [
    'gsm8k', 'mmluPro', 'gpqa', 'hle', 'olmOcr', 'sweVerified',
    'arguana', 'swePro', 'aime2026', 'terminalBench', 'evasionBench', 'hmmt2026'
  ];
  const name = String(model.name ?? '');
  const provider = String(model.provider ?? '');
  const scores = model.benchmarks || {};
  const safeName = escapeCellHtml(name);
  const safeProvider = escapeCellHtml(provider);
  const parameters = escapeCellHtml((model.metadata && model.metadata.parameters) || 'Unknown');

  const row = document.createElement('tr');
  if (model.type === 'open' && model.aggregateScore > 80) {
    row.classList.add('hl');
  }
  // data-* attributes are read by the search, size and type filters
  row.dataset.type = model.type;
  row.dataset.name = name.toLowerCase();
  row.dataset.provider = provider.toLowerCase();
  row.dataset.size = getModelSize(model) || '';

  const logo = model.providerLogoUrl
    ? `<img src="${escapeCellHtml(model.providerLogoUrl)}" alt="${safeProvider}" class="provider-logo-inline" title="${safeProvider}" onerror="this.style.display='none';">`
    : `<span class="provider-logo-fallback-inline" title="${safeProvider}">${escapeCellHtml(provider.substring(0, 2).toUpperCase())}</span>`;

  row.innerHTML = `
    <td class="c-model">
      <div class="mc">
        <div class="mn">
          ${logo}
          <a href="https://huggingface.co/${safeName}" target="_blank" rel="noopener noreferrer">${safeName}</a>
        </div>
        <div class="ms">
          <span class="mp">${safeProvider}</span>
          <span class="mp">${parameters}</span>
        </div>
      </div>
    </td>
    ${columnOrder.map(key => renderScore(scores[key], key)).join('')}
  `;
  return row;
}

// Replace the table body with one row per model that has at least one score
// among `benchmarkKeys`, ordered by the ORIGINAL aggregate score (highest
// first; models without a numeric aggregate go last).
function renderModelRows(benchmarkKeys) {
  const tbody = document.getElementById('tableBody');
  const aggregateOf = model => {
    const raw = model.aggregateScore;
    const value = typeof raw === 'number' ? raw : parseFloat(raw);
    return Number.isNaN(value) ? -Infinity : value;
  };

  // filter() returns a new array, so sorting does not reorder LEADERBOARD_DATA
  const modelsToShow = LEADERBOARD_DATA.models
    .filter(model => countModelScores(model, benchmarkKeys) > 0)
    .sort((a, b) => {
      const left = aggregateOf(a);
      const right = aggregateOf(b);
      if (left === right) return 0;
      return right > left ? 1 : -1;
    });

  const fragment = document.createDocumentFragment();
  modelsToShow.forEach(model => fragment.appendChild(buildModelRow(model)));
  tbody.innerHTML = '';
  tbody.appendChild(fragment);
}

// Write "models with at least one score" and "total scores" for the given
// benchmarks into #statModels / #statScores.
function writeBenchmarkStats(benchmarkKeys) {
  let totalModels = 0;
  let totalScores = 0;
  LEADERBOARD_DATA.models.forEach(model => {
    const count = countModelScores(model, benchmarkKeys);
    if (count > 0) {
      totalModels += 1;
      totalScores += count;
    }
  });
  document.getElementById('statModels').textContent = totalModels;
  document.getElementById('statScores').textContent = totalScores;
}

/**
 * Fill the table with every model that has a score in at least one benchmark
 * of the given categories, then refresh the header stats.
 *
 * @param {string[]} categories - Category names as used in BENCHMARK_CATEGORIES.
 */
function populateTableWithCategories(categories) {
  renderModelRows(benchmarkKeysForCategories(categories));
  // Update stats based on visible categories
  updateStatsForCategories(categories);
}

/**
 * Refresh the model/score counters for the benchmarks of the given categories.
 *
 * @param {string[]} categories - Category names as used in BENCHMARK_CATEGORIES.
 */
function updateStatsForCategories(categories) {
  writeBenchmarkStats(benchmarkKeysForCategories(categories));
}

/**
 * Fill the table with every model that has a score in at least one of the
 * selected benchmarks, then refresh the header stats.
 *
 * @param {string[]} selectedBenchmarks - Benchmark keys.
 */
function populateTableWithBenchmarks(selectedBenchmarks) {
  renderModelRows(selectedBenchmarks);
  // Update stats
  updateStatsForBenchmarks(selectedBenchmarks);
}

/**
 * Refresh the model/score counters for the selected benchmarks.
 *
 * @param {string[]} selectedBenchmarks - Benchmark keys.
 */
function updateStatsForBenchmarks(selectedBenchmarks) {
  writeBenchmarkStats(selectedBenchmarks);
}
/**
 * Hide every benchmark column whose header carries a `bm-<category>` class
 * for one of the given categories.
 *
 * Headers without a `bm-` class (such as the model column) are never touched.
 * The earlier fixed "skip the first 4 headers" rule dated from a different
 * layout; benchmark columns start at index 1, so it wrongly protected the
 * first three of them.
 *
 * @param {string[]} categories - Category names to hide.
 */
function hideColumnsByCategory(categories) {
  document.querySelectorAll('thead th').forEach(th => {
    const bmClass = Array.from(th.classList).find(cls => cls.startsWith('bm-'));
    if (!bmClass) return;
    if (!categories.includes(bmClass.slice('bm-'.length))) return;

    th.style.display = 'none';
    // cellIndex is the header's position inside its own row, so it stays
    // correct even when several <thead> elements exist in the document.
    document.querySelectorAll(`tbody td:nth-child(${th.cellIndex + 1})`).forEach(td => {
      td.style.display = 'none';
    });
  });
}
// Switch between light and dark theme, remember the choice in localStorage
// under 'dark' and relabel the toggle button.
function toggleDark() {
  // classList.toggle returns whether the class is present afterwards
  const isDark = document.body.classList.toggle('dark');
  localStorage.setItem('dark', isDark);
  const buttonLabel = isDark ? '☀️ Light' : '🌙 Dark';
  document.getElementById('darkBtn').textContent = buttonLabel;
}
/**
 * Look up the colour scheme for a benchmark's category.
 *
 * @param {*} score - Accepted for call-site compatibility; not used.
 * @param {?string} benchmarkKey - Benchmark key; when null or unknown the
 *   'aggregate' scheme is used.
 * @returns {{textColor: string, gradient: string}} A fresh object with the
 *   category's colours.
 */
function getGrade(score, benchmarkKey = null) {
  const category = (benchmarkKey && BENCHMARK_CATEGORIES[benchmarkKey]) || 'aggregate';
  const { textColor, gradient } = CATEGORY_GRADIENTS[category] || CATEGORY_GRADIENTS['aggregate'];
  return { textColor, gradient };
}
/**
 * HTML badge for a score's confidence level.
 *
 * @param {string} confidence - 'official', 'verified' or 'community'.
 * @returns {string} Badge markup, or '' when the level has no badge.
 */
function getConfidenceBadge(confidence) {
  const badgeByLevel = {
    official: '<span class="conf-badge conf-official">✓✓ Official</span>',
    verified: '<span class="conf-badge conf-verified">✓ Verified</span>',
    community: '<span class="conf-badge conf-community">~ Community</span>'
  };
  const markup = badgeByLevel[confidence];
  return markup ? markup : '';
}
/**
 * Render one score cell of the leaderboard table.
 *
 * @param {?object} benchmarkData - Score entry with `score` and optional
 *   `source` / `date`; falsy when the model has no result.
 * @param {string} benchmarkKey - Benchmark key, used to pick the text colour.
 * @returns {string} `<td>` markup. A dash cell is returned when there is no
 *   entry or its score is not a usable number, instead of throwing on
 *   `toFixed`.
 */
function renderScore(benchmarkData, benchmarkKey) {
  const emptyCell = '<td><div class="sc"><span class="na">—</span></div></td>';
  if (!benchmarkData) {
    return emptyCell;
  }
  const raw = benchmarkData.score;
  const score = typeof raw === 'number' ? raw : parseFloat(raw);
  if (!Number.isFinite(score)) {
    return emptyCell;
  }

  // source/date come from the dataset; escape them before they go into the
  // title attribute so they cannot break out of it.
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeAttr = value => String(value).replace(/[&<>"']/g, ch => entities[ch]);
  const tooltip = `${escapeAttr(benchmarkData.source || '')} (${escapeAttr(benchmarkData.date || 'unknown date')})`;

  const grade = getGrade(score, benchmarkKey);
  return `
    <td>
      <div class="sc" title="${tooltip}">
        <div class="sn" style="color: ${grade.textColor}; font-weight: 700;">${score.toFixed(1)}</div>
      </div>
    </td>
  `;
}
/**
 * Show only rows of the given model type by toggling the `hidden` class.
 *
 * @param {string} type - 'all' or a value matching a row's `data-type`.
 * @param {Event} [evt] - Triggering event; defaults to the legacy global
 *   `window.event` so existing inline `onclick="filterType('…')"` handlers
 *   keep working. When no event is available (programmatic call) no button
 *   is highlighted instead of throwing.
 */
function filterType(type, evt = window.event) {
  currentFilter = type;

  // Update button states
  document.querySelectorAll('.toolbar .fb').forEach(btn => {
    btn.classList.remove('on');
  });
  const clicked = evt && evt.target;
  if (clicked && clicked.classList) {
    clicked.classList.add('on');
  }

  // Filter rows
  document.querySelectorAll('#tableBody tr').forEach(row => {
    const visible = type === 'all' || row.dataset.type === type;
    row.classList.toggle('hidden', !visible);
  });
}
// Search-box entry point. Searching and size filtering share a single
// implementation, so this only forwards to applyAllFilters().
function filterModels() {
  applyAllFilters();
}
/**
 * Restrict the table to one benchmark category (or all of them): rebuilds
 * the rows and shows only the columns of the chosen categories.
 *
 * @param {string} category - 'all' or a category name from BENCHMARK_CATEGORIES.
 * @param {Event} [evt] - Triggering event; defaults to the legacy global
 *   `window.event` so existing inline `onclick` handlers keep working. When
 *   no event is available no button is highlighted instead of throwing.
 */
function filterCategory(category, evt = window.event) {
  currentCategory = category;

  // Update button states
  document.querySelectorAll('.toolbar button[onclick^="filterCategory"]').forEach(btn => {
    btn.classList.remove('on');
  });
  const clicked = evt && evt.target;
  if (clicked && clicked.classList) {
    clicked.classList.add('on');
  }

  // Determine which categories to show
  const categoriesToShow = category === 'all'
    ? [...new Set(Object.values(BENCHMARK_CATEGORIES))]
    : [category];

  // Repopulate table with the models that have scores in those categories
  populateTableWithCategories(categoriesToShow);

  // One pass over the headers. Column 0 (model) is left alone; a header
  // without a `bm-<category>` class is shown; a benchmark header is shown
  // only when its category is selected. Benchmark columns start at index 1,
  // so the former "skip the first 4 headers" rule left three of them visible
  // regardless of category.
  document.querySelectorAll('thead th').forEach(th => {
    if (th.cellIndex === 0) return;
    const bmClass = Array.from(th.classList).find(cls => cls.startsWith('bm-'));
    const visible = !bmClass || categoriesToShow.includes(bmClass.slice('bm-'.length));
    const display = visible ? '' : 'none';
    th.style.display = display;
    // cellIndex is the header's position inside its own row
    document.querySelectorAll(`tbody td:nth-child(${th.cellIndex + 1})`).forEach(td => {
      td.style.display = display;
    });
  });

  // Rows were recreated above; restore the active search/size filter
  applyAllFilters();
}
// Sort state shared by sortTable() and updateSortIndicators()
let currentSortColumn = null;
let currentSortDirection = 'desc';

/**
 * Sort the leaderboard rows by one column and refresh the header arrows.
 * Sorting the same column again flips the direction; a new column starts
 * descending.
 *
 * @param {number} colIndex - 0 sorts by model name (row `data-name`); any
 *   other index sorts numerically by that cell's text.
 */
function sortTable(colIndex) {
  const tbody = document.getElementById('leaderboardTable').querySelector('tbody');
  const rows = Array.from(tbody.querySelectorAll('tr'));

  if (currentSortColumn !== colIndex) {
    currentSortColumn = colIndex;
    currentSortDirection = 'desc';
  } else {
    currentSortDirection = currentSortDirection === 'desc' ? 'asc' : 'desc';
  }

  const compareByName = (rowA, rowB) => {
    const nameA = rowA.dataset.name || '';
    const nameB = rowB.dataset.name || '';
    if (currentSortDirection === 'asc') {
      return nameA.localeCompare(nameB);
    }
    return nameB.localeCompare(nameA);
  };

  // Numeric value of a row's cell in the sorted column; -1 when the row has
  // no such cell, NaN when the cell shows the "—" placeholder.
  const scoreOf = row => {
    const cell = row.cells[colIndex];
    return cell ? parseFloat(cell.textContent.trim()) : -1;
  };

  const compareByScore = (rowA, rowB) => {
    const scoreA = scoreOf(rowA);
    const scoreB = scoreOf(rowB);
    const missingA = isNaN(scoreA);
    const missingB = isNaN(scoreB);
    // Rows without a score sink to the bottom in both directions
    if (missingA && missingB) return 0;
    if (missingA) return 1;
    if (missingB) return -1;
    if (currentSortDirection === 'desc') {
      return scoreB - scoreA;
    }
    return scoreA - scoreB;
  };

  rows.sort(colIndex === 0 ? compareByName : compareByScore);

  // Appending an attached row moves it, so this reorders in place
  for (const row of rows) {
    tbody.appendChild(row);
  }

  updateSortIndicators(colIndex);
}
/**
 * Refresh the sort arrows in the table headers: the sorted column gets a
 * direction arrow and the accent colour, every other header with a `.sa`
 * arrow element is reset to the neutral "↕".
 *
 * @param {number} colIndex - Index of the sorted header.
 */
function updateSortIndicators(colIndex) {
  const activeGlyph = currentSortDirection === 'desc' ? '↓' : '↑';
  Array.from(document.querySelectorAll('thead th')).forEach((th, position) => {
    const arrowEl = th.querySelector('.sa');
    if (!arrowEl) return; // header without a sort arrow
    const isSorted = position === colIndex;
    arrowEl.textContent = isSorted ? activeGlyph : '↕';
    th.style.color = isSorted ? 'var(--ac)' : '';
  });
}
// Bootstrap: once the DOM is parsed, fetch the leaderboard data. The loader
// calls init() itself when the data has arrived. No authentication is
// required because the dataset is public.
window.addEventListener('DOMContentLoaded', function onDomReady() {
  loadLeaderboardData();
});
| </script> | |
| <script type="module"> | |
import { oauthLoginUrl, oauthHandleRedirectIfPresent } from "@huggingface/hub";

// Optional Hugging Face sign-in. The session is kept in localStorage under
// "oauth"; the leaderboard itself works without it (the dataset is public).
console.log("Initializing OAuth...");

// Restore a previously stored session, ignoring unparsable data
let oauthResult = null;
const storedOauth = localStorage.getItem("oauth");
if (storedOauth) {
  try {
    oauthResult = JSON.parse(storedOauth);
  } catch {
    oauthResult = null;
  }
}

// Drop a stored session whose token has expired, so the page does not keep
// presenting the user as signed in with a dead token. The check is skipped
// when the stored result carries no expiry field.
if (oauthResult && oauthResult.accessTokenExpiresAt &&
    new Date(oauthResult.accessTokenExpiresAt) <= new Date()) {
  localStorage.removeItem("oauth");
  oauthResult = null;
}

// No usable stored session: check whether this page load is an OAuth redirect
oauthResult ||= await oauthHandleRedirectIfPresent();

if (oauthResult) {
  // User is logged in. Log the user name only; the result object contains
  // the access token and must not be written to the console.
  console.log("OAuth success:", oauthResult.userInfo?.name);
  localStorage.setItem("oauth", JSON.stringify(oauthResult));

  // Show user info
  document.getElementById("oauthUser").style.display = "flex";
  document.getElementById("oauthAvatar").src = oauthResult.userInfo?.avatarUrl || "";
  document.getElementById("oauthUsername").textContent = oauthResult.userInfo?.name || "User";

  // Setup signout
  document.getElementById("oauthSignout").onclick = async function() {
    localStorage.removeItem("oauth");
    // Load the page without its query string. Navigate when that changes the
    // URL, reload otherwise; doing both raced two navigations.
    const cleanUrl = window.location.href.replace(/\?.*$/, '');
    if (cleanUrl !== window.location.href) {
      window.location.href = cleanUrl;
    } else {
      window.location.reload();
    }
  };

  // Store token globally for API calls (optional - dataset is public)
  window.HF_TOKEN = oauthResult.accessToken;
  console.log("User logged in:", oauthResult.userInfo?.name);
} else {
  // User is not logged in: reveal the sign-in button
  const signinButton = document.getElementById("oauthSignin");
  signinButton.style.display = "inline-block";
  signinButton.onclick = async function() {
    // Scopes configured for the Space win over the built-in default list
    const scopes = window.huggingface?.variables?.OAUTH_SCOPES || "openid profile email read-repos gated-repos";
    window.location.href = (await oauthLoginUrl({scopes: scopes})) + "&prompt=consent";
  };
}
| </script> | |
| </body> | |
| </html> | |