mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 03:59:42 +00:00
feat(benchmarks): add matrix runner and leaderboard
This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
---
|
||||
title: LeRobot Benchmark Leaderboard
|
||||
emoji: 🤖
|
||||
colorFrom: yellow
|
||||
colorTo: orange
|
||||
sdk: gradio
|
||||
sdk_version: 5.29.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: apache-2.0
|
||||
short_description: Benchmark history for LeRobot policy x benchmark runs
|
||||
---
|
||||
|
||||
# LeRobot Benchmark Leaderboard
|
||||
|
||||
This Space reads immutable benchmark rows from a Hugging Face dataset and shows:
|
||||
|
||||
- Latest result per policy and benchmark
|
||||
- Historical trends over time
|
||||
- Direct links to uploaded eval and config artifacts
|
||||
|
||||
## Configuration
|
||||
|
||||
Set `BENCHMARK_RESULTS_REPO` in the Space settings if you want to point the UI
|
||||
at a different public dataset. The default is:
|
||||
|
||||
- `lerobot/benchmark-history`
|
||||
@@ -0,0 +1,226 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import gradio as gr
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from huggingface_hub import HfApi, hf_hub_download
|
||||
|
||||
RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
|
||||
CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
|
||||
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CACHE_TTL_S = 300
|
||||
|
||||
_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
|
||||
|
||||
|
||||
def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
|
||||
overall = row.get("eval", {}).get("overall", {})
|
||||
resources = row.get("resources", {})
|
||||
timings = row.get("timings", {})
|
||||
artifact_urls = row.get("artifact_urls", {})
|
||||
return {
|
||||
"created_at": row.get("created_at"),
|
||||
"benchmark": row.get("benchmark"),
|
||||
"policy": row.get("policy"),
|
||||
"success_rate": overall.get("pc_success"),
|
||||
"n_episodes": overall.get("n_episodes"),
|
||||
"avg_sum_reward": overall.get("avg_sum_reward"),
|
||||
"train_wall_time_s": timings.get("train_wall_time_s"),
|
||||
"eval_wall_time_s": timings.get("eval_wall_time_s"),
|
||||
"total_wall_time_s": timings.get("total_wall_time_s"),
|
||||
"num_gpus": resources.get("num_gpus"),
|
||||
"microbatch_per_gpu": resources.get("microbatch_per_gpu"),
|
||||
"gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
|
||||
"effective_batch_size": resources.get("effective_batch_size"),
|
||||
"git_commit": row.get("git_commit"),
|
||||
"row_url": artifact_urls.get("row"),
|
||||
"eval_info_url": artifact_urls.get("eval_info"),
|
||||
"train_config_url": artifact_urls.get("train_config"),
|
||||
}
|
||||
|
||||
|
||||
def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
|
||||
cache_key = f"rows::{repo_id}"
|
||||
cached = _CACHE.get(cache_key)
|
||||
if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
|
||||
return cached[1]
|
||||
|
||||
api = HfApi()
|
||||
files = [path for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")]
|
||||
records: list[dict[str, Any]] = []
|
||||
for path_in_repo in sorted(files, reverse=True):
|
||||
local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
|
||||
with open(local_path) as f:
|
||||
row = json.load(f)
|
||||
records.append(_row_to_record(row))
|
||||
|
||||
df = pd.DataFrame.from_records(records)
|
||||
if not df.empty:
|
||||
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
|
||||
df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
|
||||
_CACHE[cache_key] = (time.monotonic(), df)
|
||||
return df
|
||||
|
||||
|
||||
def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
|
||||
if df.empty:
|
||||
return df
|
||||
latest = (
|
||||
df.sort_values("created_at", ascending=False)
|
||||
.groupby(["benchmark", "policy"], as_index=False)
|
||||
.first()
|
||||
.sort_values(["benchmark", "success_rate"], ascending=[True, False], na_position="last")
|
||||
)
|
||||
return latest[
|
||||
[
|
||||
"benchmark",
|
||||
"policy",
|
||||
"success_rate",
|
||||
"n_episodes",
|
||||
"train_wall_time_s",
|
||||
"eval_wall_time_s",
|
||||
"num_gpus",
|
||||
"effective_batch_size",
|
||||
"git_commit",
|
||||
"row_url",
|
||||
"eval_info_url",
|
||||
"train_config_url",
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
|
||||
filtered = df[df["benchmark"] == benchmark]
|
||||
if policy and policy != "All":
|
||||
filtered = filtered[filtered["policy"] == policy]
|
||||
if filtered.empty:
|
||||
return px.line(title="No benchmark rows found")
|
||||
fig = px.line(
|
||||
filtered.sort_values("created_at"),
|
||||
x="created_at",
|
||||
y="success_rate",
|
||||
color="policy",
|
||||
markers=True,
|
||||
hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
|
||||
title=f"{benchmark} success rate history",
|
||||
)
|
||||
fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
|
||||
return fig
|
||||
|
||||
|
||||
def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
|
||||
filtered = df[df["benchmark"] == benchmark]
|
||||
if policy and policy != "All":
|
||||
filtered = filtered[filtered["policy"] == policy]
|
||||
if filtered.empty:
|
||||
return "No matching runs yet."
|
||||
latest = filtered.sort_values("created_at", ascending=False).iloc[0]
|
||||
row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None
|
||||
eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None
|
||||
train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None
|
||||
lines = [
|
||||
f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
|
||||
f"Success rate: `{latest['success_rate']}`",
|
||||
f"GPUs: `{latest['num_gpus']}`",
|
||||
f"Effective batch size: `{latest['effective_batch_size']}`",
|
||||
f"Commit: `{latest['git_commit']}`",
|
||||
]
|
||||
if row_link:
|
||||
lines.append(f"Row JSON: [open]({row_link})")
|
||||
if eval_link:
|
||||
lines.append(f"Eval Info: [open]({eval_link})")
|
||||
if train_link:
|
||||
lines.append(f"Train Config: [open]({train_link})")
|
||||
return "\n\n".join(lines)
|
||||
|
||||
|
||||
def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
|
||||
df = load_rows()
|
||||
latest_table = make_latest_table(df)
|
||||
benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
|
||||
if benchmark not in benchmark_names and benchmark_names:
|
||||
benchmark = benchmark_names[0]
|
||||
policy_choices = ["All"]
|
||||
if benchmark and not df.empty:
|
||||
policy_choices.extend(sorted(df[df["benchmark"] == benchmark]["policy"].dropna().unique().tolist()))
|
||||
if policy not in policy_choices:
|
||||
policy = "All"
|
||||
history = make_history_figure(df, benchmark, policy)
|
||||
summary = make_run_markdown(df, benchmark, policy)
|
||||
return latest_table, gr.update(choices=policy_choices, value=policy), history, summary
|
||||
|
||||
|
||||
with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
|
||||
gr.Markdown(
|
||||
f"""
|
||||
# LeRobot Benchmark Leaderboard
|
||||
|
||||
Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
|
||||
"""
|
||||
)
|
||||
|
||||
with gr.Row():
|
||||
benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
|
||||
policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
|
||||
refresh_button = gr.Button("Refresh")
|
||||
|
||||
latest_table = gr.Dataframe(label="Latest Results", interactive=False)
|
||||
history_plot = gr.Plot(label="History")
|
||||
latest_summary = gr.Markdown()
|
||||
|
||||
def _initial_state():
|
||||
df = load_rows()
|
||||
benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
|
||||
benchmark = benchmarks[0] if benchmarks else ""
|
||||
latest, policy_choices, history, summary = refresh_view(benchmark, "All")
|
||||
return (
|
||||
gr.update(choices=benchmarks, value=benchmark),
|
||||
policy_choices,
|
||||
latest,
|
||||
history,
|
||||
summary,
|
||||
)
|
||||
|
||||
demo.load(
|
||||
_initial_state,
|
||||
outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
|
||||
)
|
||||
refresh_button.click(
|
||||
refresh_view,
|
||||
inputs=[benchmark_dropdown, policy_dropdown],
|
||||
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
|
||||
)
|
||||
benchmark_dropdown.change(
|
||||
refresh_view,
|
||||
inputs=[benchmark_dropdown, policy_dropdown],
|
||||
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
|
||||
)
|
||||
policy_dropdown.change(
|
||||
refresh_view,
|
||||
inputs=[benchmark_dropdown, policy_dropdown],
|
||||
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo.launch()
|
||||
@@ -0,0 +1,4 @@
|
||||
gradio>=5.0.0,<6.0.0
|
||||
plotly>=5.18.0
|
||||
pandas>=2.0.0
|
||||
huggingface-hub>=1.0.0,<2.0.0
|
||||
Reference in New Issue
Block a user