feat: add lerobot-leaderboard to generate interactive eval comparison pages

New CLI tool that fetches eval results from multiple Hub model repos and produces a self-contained HTML leaderboard with sortable columns, per-suite breakdowns, best-in-column highlighting, and filtering. Made-with: Cursor
2026-07-25 10:46:01 +00:00 · 2026-03-16 02:57:37 +01:00
parent 464532ec37
commit 4a04465bb8
2 changed files with 606 additions and 0 deletions
@@ -246,6 +246,7 @@ lerobot-info="lerobot.scripts.lerobot_info:main"
 lerobot-find-joint-limits="lerobot.scripts.lerobot_find_joint_limits:main"
 lerobot-imgtransform-viz="lerobot.scripts.lerobot_imgtransform_viz:main"
 lerobot-edit-dataset="lerobot.scripts.lerobot_edit_dataset:main"
+lerobot-leaderboard="lerobot.scripts.lerobot_leaderboard:main"
 lerobot-setup-can="lerobot.scripts.lerobot_setup_can:main"
 lerobot-benchmark="lerobot.scripts.lerobot_benchmark:main"

@@ -0,0 +1,605 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generate an interactive eval leaderboard from Hub model repos.
+
+Reads eval results (as pushed by ``lerobot-eval --push_to_hub``) from one or
+more Hugging Face model repos and produces a self-contained HTML page with a
+sortable, filterable leaderboard table.
+
+Usage::
+
+    lerobot-leaderboard \
+        --repo-ids user/model_a,user/model_b \
+        --output leaderboard.html
+
+    # Or from a file listing repo IDs (one per line):
+    lerobot-leaderboard \
+        --repo-ids-file models.txt \
+        --output leaderboard.html
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelEntry:
+    repo_id: str
+    policy_type: str = "—"
+    dataset: str = "—"
+    training_steps: str = "—"
+    batch_size: str = "—"
+    # env_type -> {group_name -> pc_success}
+    eval_results: dict[str, dict[str, float]] = field(default_factory=dict)
+    # env_type -> overall pc_success
+    eval_overall: dict[str, float] = field(default_factory=dict)
+    # env_type -> n_episodes
+    eval_n_episodes: dict[str, int] = field(default_factory=dict)
+
+
+def _try_download(repo_id: str, filename: str) -> dict | None:
+    """Download a JSON file from a Hub repo, return parsed dict or None."""
+    from huggingface_hub import hf_hub_download
+    from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
+
+    try:
+        path = hf_hub_download(repo_id, filename, repo_type="model")
+        with open(path) as f:
+            return json.load(f)
+    except (EntryNotFoundError, RepositoryNotFoundError, OSError):
+        return None
+
+
+def _list_eval_dirs(repo_id: str) -> list[str]:
+    """List env_type subdirectories under eval/ in a Hub repo."""
+    from huggingface_hub import HfApi
+    from huggingface_hub.utils import RepositoryNotFoundError
+
+    api = HfApi()
+    try:
+        files = api.list_repo_files(repo_id, repo_type="model")
+    except RepositoryNotFoundError:
+        return []
+
+    env_types = set()
+    for f in files:
+        if f.startswith("eval/") and f.count("/") >= 2:
+            env_types.add(f.split("/")[1])
+    return sorted(env_types)
+
+
+def fetch_model_entry(repo_id: str) -> ModelEntry:
+    """Fetch all available metadata and eval results for a single model."""
+    entry = ModelEntry(repo_id=repo_id)
+
+    # Policy config
+    policy_cfg = _try_download(repo_id, "config.json")
+    if policy_cfg:
+        entry.policy_type = policy_cfg.get("type", "—")
+
+    # Training config
+    train_cfg = _try_download(repo_id, "train_config.json")
+    if train_cfg:
+        ds = train_cfg.get("dataset", {})
+        entry.dataset = ds.get("repo_id", "—") if isinstance(ds, dict) else str(ds)
+        entry.training_steps = str(train_cfg.get("steps", "—"))
+        entry.batch_size = str(train_cfg.get("batch_size", "—"))
+
+    # Eval results per env_type
+    for env_type in _list_eval_dirs(repo_id):
+        eval_info = _try_download(repo_id, f"eval/{env_type}/eval_info.json")
+        if not eval_info:
+            continue
+
+        per_group = eval_info.get("per_group", {})
+        group_results = {}
+        for group_name, stats in per_group.items():
+            group_results[group_name] = stats.get("pc_success", float("nan"))
+
+        entry.eval_results[env_type] = group_results
+
+        overall = eval_info.get("overall", {})
+        entry.eval_overall[env_type] = overall.get("pc_success", float("nan"))
+        entry.eval_n_episodes[env_type] = overall.get("n_episodes", 0)
+
+    return entry
+
+
+def fetch_all(repo_ids: list[str]) -> list[ModelEntry]:
+    entries = []
+    for repo_id in repo_ids:
+        logger.info(f"Fetching {repo_id}...")
+        try:
+            entries.append(fetch_model_entry(repo_id))
+        except Exception as e:
+            logger.warning(f"Failed to fetch {repo_id}: {e}")
+    return entries
+
+
+def collect_all_env_types(entries: list[ModelEntry]) -> list[str]:
+    """Collect all unique env_types across all entries, sorted."""
+    env_types: set[str] = set()
+    for e in entries:
+        env_types.update(e.eval_overall.keys())
+    return sorted(env_types)
+
+
+def collect_all_groups(entries: list[ModelEntry]) -> dict[str, list[str]]:
+    """Collect all unique group names per env_type."""
+    groups: dict[str, set[str]] = {}
+    for e in entries:
+        for env_type, group_results in e.eval_results.items():
+            groups.setdefault(env_type, set()).update(group_results.keys())
+    return {k: sorted(v) for k, v in groups.items()}
+
+
+def build_html(entries: list[ModelEntry], title: str = "LeRobot Eval Leaderboard") -> str:
+    env_types = collect_all_env_types(entries)
+    all_groups = collect_all_groups(entries)
+
+    # Build column structure: fixed cols + per env_type (overall + per-group sub-columns)
+    # We'll build the data as JSON and let JS handle rendering
+    table_data = []
+    for e in entries:
+        row = {
+            "repo_id": e.repo_id,
+            "policy_type": e.policy_type,
+            "dataset": e.dataset,
+            "training_steps": e.training_steps,
+            "batch_size": e.batch_size,
+        }
+        for env_type in env_types:
+            overall = e.eval_overall.get(env_type)
+            row[f"{env_type}__overall"] = round(overall, 1) if overall is not None else None
+            n_ep = e.eval_n_episodes.get(env_type)
+            row[f"{env_type}__n_episodes"] = n_ep if n_ep else None
+            for group in all_groups.get(env_type, []):
+                val = e.eval_results.get(env_type, {}).get(group)
+                row[f"{env_type}__{group}"] = round(val, 1) if val is not None else None
+        table_data.append(row)
+
+    # Build column definitions for the JS table
+    columns_json = json.dumps(_build_column_defs(env_types, all_groups))
+    data_json = json.dumps(table_data)
+
+    return _HTML_TEMPLATE.format(
+        title=title,
+        columns_json=columns_json,
+        data_json=data_json,
+    )
+
+
+def _build_column_defs(env_types: list[str], all_groups: dict[str, list[str]]) -> list[dict]:
+    cols = [
+        {"key": "repo_id", "label": "Model", "group": "Model Info", "sortable": True, "type": "link"},
+        {"key": "policy_type", "label": "Policy", "group": "Model Info", "sortable": True, "type": "text"},
+        {"key": "dataset", "label": "Dataset", "group": "Model Info", "sortable": True, "type": "text"},
+        {
+            "key": "training_steps",
+            "label": "Steps",
+            "group": "Training",
+            "sortable": True,
+            "type": "number",
+        },
+        {
+            "key": "batch_size",
+            "label": "Batch",
+            "group": "Training",
+            "sortable": True,
+            "type": "number",
+        },
+    ]
+    for env_type in env_types:
+        cols.append(
+            {
+                "key": f"{env_type}__overall",
+                "label": "Overall %",
+                "group": env_type,
+                "sortable": True,
+                "type": "pct",
+            }
+        )
+        for group in all_groups.get(env_type, []):
+            cols.append(
+                {
+                    "key": f"{env_type}__{group}",
+                    "label": f"{group} %",
+                    "group": env_type,
+                    "sortable": True,
+                    "type": "pct",
+                }
+            )
+        cols.append(
+            {
+                "key": f"{env_type}__n_episodes",
+                "label": "Episodes",
+                "group": env_type,
+                "sortable": True,
+                "type": "number",
+            }
+        )
+    return cols
+
+
+_HTML_TEMPLATE = """\
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>{title}</title>
+<style>
+  :root {{
+    --bg: #0d1117;
+    --surface: #161b22;
+    --border: #30363d;
+    --text: #e6edf3;
+    --text-muted: #8b949e;
+    --accent: #58a6ff;
+    --green: #3fb950;
+    --yellow: #d29922;
+    --red: #f85149;
+    --header-bg: #1c2128;
+  }}
+  * {{ box-sizing: border-box; margin: 0; padding: 0; }}
+  body {{
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
+    background: var(--bg);
+    color: var(--text);
+    padding: 24px;
+    line-height: 1.5;
+  }}
+  h1 {{
+    font-size: 1.75rem;
+    font-weight: 600;
+    margin-bottom: 8px;
+  }}
+  .subtitle {{
+    color: var(--text-muted);
+    font-size: 0.9rem;
+    margin-bottom: 20px;
+  }}
+  .controls {{
+    display: flex;
+    gap: 12px;
+    margin-bottom: 16px;
+    flex-wrap: wrap;
+    align-items: center;
+  }}
+  .controls input {{
+    background: var(--surface);
+    border: 1px solid var(--border);
+    color: var(--text);
+    padding: 8px 14px;
+    border-radius: 6px;
+    font-size: 0.875rem;
+    width: 280px;
+    outline: none;
+  }}
+  .controls input:focus {{ border-color: var(--accent); }}
+  .controls label {{
+    color: var(--text-muted);
+    font-size: 0.8rem;
+    display: flex;
+    align-items: center;
+    gap: 6px;
+    cursor: pointer;
+  }}
+  .controls input[type="checkbox"] {{
+    width: auto;
+    accent-color: var(--accent);
+  }}
+  .table-wrap {{
+    overflow-x: auto;
+    border: 1px solid var(--border);
+    border-radius: 8px;
+  }}
+  table {{
+    border-collapse: collapse;
+    width: 100%;
+    font-size: 0.85rem;
+    white-space: nowrap;
+  }}
+  thead th {{
+    background: var(--header-bg);
+    color: var(--text-muted);
+    font-weight: 600;
+    text-transform: uppercase;
+    font-size: 0.7rem;
+    letter-spacing: 0.05em;
+    padding: 10px 14px;
+    border-bottom: 1px solid var(--border);
+    cursor: pointer;
+    user-select: none;
+    position: sticky;
+    top: 0;
+    z-index: 2;
+  }}
+  thead th:hover {{ color: var(--text); }}
+  thead th .arrow {{ margin-left: 4px; opacity: 0.4; }}
+  thead th.sorted .arrow {{ opacity: 1; color: var(--accent); }}
+  thead tr.group-header th {{
+    text-align: center;
+    font-size: 0.75rem;
+    letter-spacing: 0.08em;
+    border-right: 1px solid var(--border);
+    cursor: default;
+  }}
+  tbody tr {{ border-bottom: 1px solid var(--border); }}
+  tbody tr:hover {{ background: rgba(88,166,255,0.06); }}
+  td {{
+    padding: 10px 14px;
+    vertical-align: middle;
+  }}
+  td.model-cell a {{
+    color: var(--accent);
+    text-decoration: none;
+    font-weight: 500;
+  }}
+  td.model-cell a:hover {{ text-decoration: underline; }}
+  td.pct {{
+    font-weight: 600;
+    font-variant-numeric: tabular-nums;
+    text-align: right;
+  }}
+  td.number {{
+    text-align: right;
+    font-variant-numeric: tabular-nums;
+  }}
+  .pct-high {{ color: var(--green); }}
+  .pct-mid  {{ color: var(--yellow); }}
+  .pct-low  {{ color: var(--red); }}
+  .pct-na   {{ color: var(--text-muted); font-weight: 400; }}
+  .best-in-col {{ background: rgba(63,185,80,0.12); }}
+  footer {{
+    margin-top: 20px;
+    color: var(--text-muted);
+    font-size: 0.75rem;
+  }}
+  footer a {{ color: var(--accent); text-decoration: none; }}
+</style>
+</head>
+<body>
+
+<h1>&#129302; {title}</h1>
+<p class="subtitle">Click any column header to sort. Filter by typing below.</p>
+
+<div class="controls">
+  <input type="text" id="filter" placeholder="Filter models..." />
+  <label><input type="checkbox" id="toggleGroups" checked /> Show sub-suite columns</label>
+</div>
+
+<div class="table-wrap">
+  <table id="leaderboard">
+    <thead></thead>
+    <tbody></tbody>
+  </table>
+</div>
+
+<footer>
+  Generated by <a href="https://github.com/huggingface/lerobot">LeRobot</a> &middot;
+  Eval results from <a href="https://huggingface.co">Hugging Face Hub</a>
+</footer>
+
+<script>
+const COLUMNS = {columns_json};
+const DATA    = {data_json};
+
+let sortKey = null, sortAsc = true, showGroups = true;
+
+function pctClass(v) {{
+  if (v == null) return 'pct-na';
+  if (v >= 70)   return 'pct-high';
+  if (v >= 40)   return 'pct-mid';
+  return 'pct-low';
+}}
+
+function visibleCols() {{
+  if (showGroups) return COLUMNS;
+  return COLUMNS.filter(c => !c.key.includes('__') || c.key.endsWith('__overall') || c.key.endsWith('__n_episodes'));
+}}
+
+function bestPerCol(rows) {{
+  const best = {{}};
+  for (const c of visibleCols()) {{
+    if (c.type !== 'pct') continue;
+    let max = -Infinity;
+    for (const r of rows) {{
+      const v = r[c.key];
+      if (v != null && v > max) max = v;
+    }}
+    best[c.key] = max === -Infinity ? null : max;
+  }}
+  return best;
+}}
+
+function render() {{
+  const filter = document.getElementById('filter').value.toLowerCase();
+  let rows = DATA.filter(r => r.repo_id.toLowerCase().includes(filter)
+    || (r.policy_type||'').toLowerCase().includes(filter)
+    || (r.dataset||'').toLowerCase().includes(filter));
+
+  if (sortKey) {{
+    rows.sort((a, b) => {{
+      let va = a[sortKey], vb = b[sortKey];
+      if (va == null && vb == null) return 0;
+      if (va == null) return 1;
+      if (vb == null) return -1;
+      if (typeof va === 'string') va = va.toLowerCase();
+      if (typeof vb === 'string') vb = vb.toLowerCase();
+      return sortAsc ? (va < vb ? -1 : va > vb ? 1 : 0) : (va > vb ? -1 : va < vb ? 1 : 0);
+    }});
+  }}
+
+  const cols = visibleCols();
+  const best = bestPerCol(rows);
+
+  // Group header row
+  const groups = [];
+  let lastGroup = null;
+  for (const c of cols) {{
+    if (c.group !== lastGroup) {{
+      groups.push({{ label: c.group, span: 1 }});
+      lastGroup = c.group;
+    }} else {{
+      groups[groups.length - 1].span++;
+    }}
+  }}
+
+  const thead = document.querySelector('#leaderboard thead');
+  thead.innerHTML = '';
+
+  // Group header
+  const gtr = document.createElement('tr');
+  gtr.className = 'group-header';
+  for (const g of groups) {{
+    const th = document.createElement('th');
+    th.colSpan = g.span;
+    th.textContent = g.label;
+    gtr.appendChild(th);
+  }}
+  thead.appendChild(gtr);
+
+  // Column headers
+  const htr = document.createElement('tr');
+  for (const c of cols) {{
+    const th = document.createElement('th');
+    th.innerHTML = c.label + ' <span class="arrow">' + (sortKey === c.key ? (sortAsc ? '&#9650;' : '&#9660;') : '&#8693;') + '</span>';
+    if (sortKey === c.key) th.classList.add('sorted');
+    th.addEventListener('click', () => {{
+      if (sortKey === c.key) {{ sortAsc = !sortAsc; }}
+      else {{ sortKey = c.key; sortAsc = c.type === 'pct' ? false : true; }}
+      render();
+    }});
+    htr.appendChild(th);
+  }}
+  thead.appendChild(htr);
+
+  // Body
+  const tbody = document.querySelector('#leaderboard tbody');
+  tbody.innerHTML = '';
+  for (const r of rows) {{
+    const tr = document.createElement('tr');
+    for (const c of cols) {{
+      const td = document.createElement('td');
+      const v = r[c.key];
+      if (c.type === 'link') {{
+        td.className = 'model-cell';
+        const a = document.createElement('a');
+        a.href = 'https://huggingface.co/' + v;
+        a.target = '_blank';
+        a.textContent = v;
+        td.appendChild(a);
+      }} else if (c.type === 'pct') {{
+        td.className = 'pct ' + pctClass(v);
+        td.textContent = v != null ? v.toFixed(1) : '—';
+        if (v != null && best[c.key] != null && v === best[c.key] && rows.length > 1) {{
+          td.classList.add('best-in-col');
+        }}
+      }} else if (c.type === 'number') {{
+        td.className = 'number';
+        td.textContent = v != null ? v : '—';
+      }} else {{
+        td.textContent = v || '—';
+      }}
+      tr.appendChild(td);
+    }}
+    tbody.appendChild(tr);
+  }}
+}}
+
+document.getElementById('filter').addEventListener('input', render);
+document.getElementById('toggleGroups').addEventListener('change', (e) => {{
+  showGroups = e.target.checked;
+  render();
+}});
+
+render();
+</script>
+</body>
+</html>
+"""
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Generate an interactive eval leaderboard from Hub model repos.",
+    )
+    p.add_argument(
+        "--repo-ids",
+        type=str,
+        default=None,
+        help="Comma-separated list of HF model repo IDs.",
+    )
+    p.add_argument(
+        "--repo-ids-file",
+        type=str,
+        default=None,
+        help="Path to a text file with one repo ID per line.",
+    )
+    p.add_argument(
+        "--output",
+        type=str,
+        default="leaderboard.html",
+        help="Output HTML file path (default: leaderboard.html).",
+    )
+    p.add_argument(
+        "--title",
+        type=str,
+        default="LeRobot Eval Leaderboard",
+        help="Title shown in the leaderboard page.",
+    )
+    return p.parse_args(argv)
+
+
+def main(argv: list[str] | None = None):
+    args = parse_args(argv)
+
+    repo_ids: list[str] = []
+    if args.repo_ids:
+        repo_ids.extend(r.strip() for r in args.repo_ids.split(",") if r.strip())
+    if args.repo_ids_file:
+        path = Path(args.repo_ids_file)
+        if not path.exists():
+            logger.error(f"File not found: {path}")
+            sys.exit(1)
+        repo_ids.extend(line.strip() for line in path.read_text().splitlines() if line.strip())
+    if not repo_ids:
+        logger.error("No repo IDs provided. Use --repo-ids or --repo-ids-file.")
+        sys.exit(1)
+
+    entries = fetch_all(repo_ids)
+    if not entries:
+        logger.error("No valid entries found.")
+        sys.exit(1)
+
+    html = build_html(entries, title=args.title)
+    out = Path(args.output)
+    out.write_text(html)
+    logger.info(f"Leaderboard written to {out.resolve()}")
+
+
+if __name__ == "__main__":
+    main()