feat(benchmarks): add matrix runner and leaderboard

2026-07-24 02:06:15 +00:00 · 2026-04-15 21:31:33 +02:00
parent dab511dbb1
commit 2ab59a3099
21 changed files with 2096 additions and 50 deletions
@@ -0,0 +1,27 @@
+---
+title: LeRobot Benchmark Leaderboard
+emoji: 🤖
+colorFrom: yellow
+colorTo: orange
+sdk: gradio
+sdk_version: 5.29.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: Benchmark history for LeRobot policy x benchmark runs
+---
+
+# LeRobot Benchmark Leaderboard
+
+This Space reads immutable benchmark rows from a Hugging Face dataset and shows:
+
+- Latest result per policy and benchmark
+- Historical trends over time
+- Direct links to uploaded eval and config artifacts
+
+## Configuration
+
+Set the `BENCHMARK_RESULTS_REPO` environment variable in the Space settings to
+point the UI at a different public dataset. The default is:
+
+- `lerobot/benchmark-history`
@@ -0,0 +1,226 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any
+
+import gradio as gr
+import pandas as pd
+import plotly.express as px
+from huggingface_hub import HfApi, hf_hub_download
+
+# Dataset repo the benchmark rows are read from; overridable via the environment.
+RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
+# Directory passed to `hf_hub_download` as its cache dir; created at import time.
+CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+# Number of seconds a cached DataFrame is reused before `load_rows` refetches.
+CACHE_TTL_S = 300
+
+# In-process cache: cache key -> (time.monotonic() when stored, parsed rows).
+_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
+
+
+def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
+    """Flatten one nested benchmark row into a flat record.
+
+    Values are read from the ``eval.overall``, ``resources``, ``timings`` and
+    ``artifact_urls`` sections of ``row`` plus a few top-level keys. Any key
+    that is absent yields ``None`` in the record, so every record has the same
+    set of keys.
+
+    Args:
+        row: Parsed JSON object of a single benchmark row.
+
+    Returns:
+        A flat dict with one entry per leaderboard column.
+    """
+
+    def _section(container: dict[str, Any], key: str) -> dict[str, Any]:
+        # `dict.get(key, {})` only covers a *missing* key. A section that is
+        # present but null (or otherwise not an object) would make the chained
+        # `.get` calls raise, so treat it like an absent section.
+        value = container.get(key)
+        return value if isinstance(value, dict) else {}
+
+    overall = _section(_section(row, "eval"), "overall")
+    resources = _section(row, "resources")
+    timings = _section(row, "timings")
+    artifact_urls = _section(row, "artifact_urls")
+    return {
+        "created_at": row.get("created_at"),
+        "benchmark": row.get("benchmark"),
+        "policy": row.get("policy"),
+        # The leaderboard's "success_rate" column is the row's `pc_success`.
+        "success_rate": overall.get("pc_success"),
+        "n_episodes": overall.get("n_episodes"),
+        "avg_sum_reward": overall.get("avg_sum_reward"),
+        "train_wall_time_s": timings.get("train_wall_time_s"),
+        "eval_wall_time_s": timings.get("eval_wall_time_s"),
+        "total_wall_time_s": timings.get("total_wall_time_s"),
+        "num_gpus": resources.get("num_gpus"),
+        "microbatch_per_gpu": resources.get("microbatch_per_gpu"),
+        "gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
+        "effective_batch_size": resources.get("effective_batch_size"),
+        "git_commit": row.get("git_commit"),
+        "row_url": artifact_urls.get("row"),
+        "eval_info_url": artifact_urls.get("eval_info"),
+        "train_config_url": artifact_urls.get("train_config"),
+    }
+
+
+def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
+    """Load every benchmark row of a dataset repo into a DataFrame.
+
+    Files whose path starts with ``rows/`` are downloaded, parsed as JSON and
+    flattened with ``_row_to_record``. The resulting frame is kept in the
+    module-level ``_CACHE`` and reused for ``CACHE_TTL_S`` seconds.
+
+    Args:
+        repo_id: Hugging Face dataset repo to read from.
+
+    Returns:
+        One row per benchmark run, newest ``created_at`` first. The frame
+        always carries the full set of record columns, even when the repo
+        contains no rows, so callers can index columns unconditionally.
+    """
+    cache_key = f"rows::{repo_id}"
+    cached = _CACHE.get(cache_key)
+    if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
+        return cached[1]
+
+    api = HfApi()
+    files = [path for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")]
+    records: list[dict[str, Any]] = []
+    for path_in_repo in sorted(files, reverse=True):
+        local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
+        # JSON is UTF-8; do not let the platform's locale encoding decide.
+        with open(local_path, encoding="utf-8") as f:
+            row = json.load(f)
+        records.append(_row_to_record(row))
+
+    # Derive the schema from an empty row so that a repo without any rows
+    # still produces a frame with the expected (empty) columns instead of a
+    # column-less frame on which `df["benchmark"]` would raise KeyError.
+    columns = list(_row_to_record({}))
+    df = pd.DataFrame.from_records(records, columns=columns)
+    if not df.empty:
+        df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
+        df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
+    _CACHE[cache_key] = (time.monotonic(), df)
+    return df
+
+
+def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
+    """Build the "latest result per benchmark and policy" table.
+
+    Args:
+        df: Flattened benchmark rows with a ``created_at`` column.
+
+    Returns:
+        One row per (benchmark, policy) pair, taken from the run with the most
+        recent ``created_at``. Rows are ordered by benchmark, then by
+        descending success rate (missing rates last), then by policy. An empty
+        input is returned unchanged.
+    """
+    if df.empty:
+        return df
+    keys = ["benchmark", "policy"]
+    latest = (
+        # Rows without a benchmark or policy cannot be attributed to a pair.
+        df.dropna(subset=keys)
+        .sort_values("created_at", ascending=False, kind="stable")
+        # Keep the newest row *as a whole*. `groupby(...).first()` would pick
+        # the first non-null value per column instead, silently combining a
+        # new run's commit with an older run's success rate whenever the new
+        # run has a missing metric.
+        .drop_duplicates(subset=keys, keep="first")
+        .sort_values(
+            ["benchmark", "success_rate", "policy"],
+            ascending=[True, False, True],
+            na_position="last",
+        )
+        .reset_index(drop=True)
+    )
+    return latest[
+        [
+            "benchmark",
+            "policy",
+            "success_rate",
+            "n_episodes",
+            "train_wall_time_s",
+            "eval_wall_time_s",
+            "num_gpus",
+            "effective_batch_size",
+            "git_commit",
+            "row_url",
+            "eval_info_url",
+            "train_config_url",
+        ]
+    ]
+
+
+def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
+    """Plot success rate over time for one benchmark.
+
+    Args:
+        df: Flattened benchmark rows.
+        benchmark: Benchmark whose runs are plotted.
+        policy: Restrict the plot to this policy; ``None``, an empty string or
+            ``"All"`` plots every policy as its own coloured line.
+
+    Returns:
+        A Plotly figure. When no row matches, an empty figure titled
+        "No benchmark rows found" is returned instead.
+    """
+    # An empty frame may have no columns at all, in which case indexing
+    # `df["benchmark"]` would raise KeyError; bail out before touching columns.
+    if df.empty:
+        return px.line(title="No benchmark rows found")
+    filtered = df[df["benchmark"] == benchmark]
+    if policy and policy != "All":
+        filtered = filtered[filtered["policy"] == policy]
+    if filtered.empty:
+        return px.line(title="No benchmark rows found")
+    fig = px.line(
+        # Chronological order so each line is drawn left to right.
+        filtered.sort_values("created_at"),
+        x="created_at",
+        y="success_rate",
+        color="policy",
+        markers=True,
+        hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
+        title=f"{benchmark} success rate history",
+    )
+    fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
+    return fig
+
+
+def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
+    """Describe the most recent matching run as Markdown.
+
+    Args:
+        df: Flattened benchmark rows.
+        benchmark: Benchmark to summarise.
+        policy: Restrict to this policy; ``None``, an empty string or ``"All"``
+            considers every policy of the benchmark.
+
+    Returns:
+        Paragraphs separated by blank lines with the run's key figures and a
+        link for each artifact URL that is present, or "No matching runs yet."
+        when nothing matches.
+    """
+    # An empty frame may have no columns at all, in which case indexing
+    # `df["benchmark"]` would raise KeyError; bail out before touching columns.
+    if df.empty:
+        return "No matching runs yet."
+    filtered = df[df["benchmark"] == benchmark]
+    if policy and policy != "All":
+        filtered = filtered[filtered["policy"] == policy]
+    if filtered.empty:
+        return "No matching runs yet."
+    latest = filtered.sort_values("created_at", ascending=False).iloc[0]
+    lines = [
+        f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
+        f"Success rate: `{latest['success_rate']}`",
+        f"GPUs: `{latest['num_gpus']}`",
+        f"Effective batch size: `{latest['effective_batch_size']}`",
+        f"Commit: `{latest['git_commit']}`",
+    ]
+    artifact_links = (
+        ("Row JSON", "row_url"),
+        ("Eval Info", "eval_info_url"),
+        ("Train Config", "train_config_url"),
+    )
+    for label, column in artifact_links:
+        link = latest[column]
+        # Skip missing (None/NaN) as well as empty URLs.
+        if pd.notna(link) and link:
+            lines.append(f"{label}: [open]({link})")
+    return "\n\n".join(lines)
+
+
+def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
+    """Recompute every output of the page for the current dropdown values.
+
+    A benchmark that is not among the loaded rows falls back to the first
+    known benchmark, and a policy that is not offered for that benchmark
+    falls back to ``"All"``.
+
+    Args:
+        benchmark: Currently selected benchmark.
+        policy: Currently selected policy, or ``"All"``.
+
+    Returns:
+        The latest-results table, a Gradio update carrying the policy choices
+        and selected value, the history figure, and the Markdown summary.
+    """
+    rows = load_rows()
+    has_rows = not rows.empty
+    table = make_latest_table(rows)
+
+    known_benchmarks = sorted(rows["benchmark"].dropna().unique().tolist()) if has_rows else []
+    if known_benchmarks and benchmark not in known_benchmarks:
+        benchmark = known_benchmarks[0]
+
+    # "All" is always offered first, followed by the benchmark's own policies.
+    policy_choices = ["All"]
+    if benchmark and has_rows:
+        on_benchmark = rows[rows["benchmark"] == benchmark]
+        policy_choices += sorted(on_benchmark["policy"].dropna().unique().tolist())
+    if policy not in policy_choices:
+        policy = "All"
+
+    figure = make_history_figure(rows, benchmark, policy)
+    markdown = make_run_markdown(rows, benchmark, policy)
+    policy_update = gr.update(choices=policy_choices, value=policy)
+    return table, policy_update, figure, markdown
+
+
+# Page layout and event wiring for the leaderboard.
+with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
+    gr.Markdown(
+        f"""
+# LeRobot Benchmark Leaderboard
+
+Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
+"""
+    )
+
+    # Controls: benchmark selector, policy selector and a manual refresh.
+    with gr.Row():
+        benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
+        policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
+        refresh_button = gr.Button("Refresh")
+
+    # Outputs: latest-results table, history plot and a Markdown summary.
+    latest_table = gr.Dataframe(label="Latest Results", interactive=False)
+    history_plot = gr.Plot(label="History")
+    latest_summary = gr.Markdown()
+
+    def _initial_state():
+        """Populate all components on page load, selecting the first benchmark."""
+        frame = load_rows()
+        benchmarks = [] if frame.empty else sorted(frame["benchmark"].dropna().unique().tolist())
+        first_benchmark = benchmarks[0] if benchmarks else ""
+        latest, policy_update, history, summary = refresh_view(first_benchmark, "All")
+        benchmark_update = gr.update(choices=benchmarks, value=first_benchmark)
+        return benchmark_update, policy_update, latest, history, summary
+
+    demo.load(
+        _initial_state,
+        outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
+    )
+
+    # The refresh button and both dropdowns all re-render the same outputs
+    # from the same inputs, so register them with one shared call.
+    for register_event in (refresh_button.click, benchmark_dropdown.change, policy_dropdown.change):
+        register_event(
+            refresh_view,
+            inputs=[benchmark_dropdown, policy_dropdown],
+            outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
+        )
+
+
+# Launch the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()
@@ -0,0 +1,4 @@
+gradio>=5.0.0,<6.0.0
+plotly>=5.18.0
+pandas>=2.0.0
+huggingface-hub>=1.0.0,<2.0.0