Files
lerobot/spaces/benchmark-leaderboard/app.py
T

227 lines
8.3 KiB
Python

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import json
import os
import time
from pathlib import Path
from typing import Any
import gradio as gr
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi, hf_hub_download
RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_TTL_S = 300
_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
overall = row.get("eval", {}).get("overall", {})
resources = row.get("resources", {})
timings = row.get("timings", {})
artifact_urls = row.get("artifact_urls", {})
return {
"created_at": row.get("created_at"),
"benchmark": row.get("benchmark"),
"policy": row.get("policy"),
"success_rate": overall.get("pc_success"),
"n_episodes": overall.get("n_episodes"),
"avg_sum_reward": overall.get("avg_sum_reward"),
"train_wall_time_s": timings.get("train_wall_time_s"),
"eval_wall_time_s": timings.get("eval_wall_time_s"),
"total_wall_time_s": timings.get("total_wall_time_s"),
"num_gpus": resources.get("num_gpus"),
"microbatch_per_gpu": resources.get("microbatch_per_gpu"),
"gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
"effective_batch_size": resources.get("effective_batch_size"),
"git_commit": row.get("git_commit"),
"row_url": artifact_urls.get("row"),
"eval_info_url": artifact_urls.get("eval_info"),
"train_config_url": artifact_urls.get("train_config"),
}
def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
cache_key = f"rows::{repo_id}"
cached = _CACHE.get(cache_key)
if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
return cached[1]
api = HfApi()
files = [path for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")]
records: list[dict[str, Any]] = []
for path_in_repo in sorted(files, reverse=True):
local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
with open(local_path) as f:
row = json.load(f)
records.append(_row_to_record(row))
df = pd.DataFrame.from_records(records)
if not df.empty:
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
_CACHE[cache_key] = (time.monotonic(), df)
return df
def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
if df.empty:
return df
latest = (
df.sort_values("created_at", ascending=False)
.groupby(["benchmark", "policy"], as_index=False)
.first()
.sort_values(["benchmark", "success_rate"], ascending=[True, False], na_position="last")
)
return latest[
[
"benchmark",
"policy",
"success_rate",
"n_episodes",
"train_wall_time_s",
"eval_wall_time_s",
"num_gpus",
"effective_batch_size",
"git_commit",
"row_url",
"eval_info_url",
"train_config_url",
]
]
def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
filtered = df[df["benchmark"] == benchmark]
if policy and policy != "All":
filtered = filtered[filtered["policy"] == policy]
if filtered.empty:
return px.line(title="No benchmark rows found")
fig = px.line(
filtered.sort_values("created_at"),
x="created_at",
y="success_rate",
color="policy",
markers=True,
hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
title=f"{benchmark} success rate history",
)
fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
return fig
def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
filtered = df[df["benchmark"] == benchmark]
if policy and policy != "All":
filtered = filtered[filtered["policy"] == policy]
if filtered.empty:
return "No matching runs yet."
latest = filtered.sort_values("created_at", ascending=False).iloc[0]
row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None
eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None
train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None
lines = [
f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
f"Success rate: `{latest['success_rate']}`",
f"GPUs: `{latest['num_gpus']}`",
f"Effective batch size: `{latest['effective_batch_size']}`",
f"Commit: `{latest['git_commit']}`",
]
if row_link:
lines.append(f"Row JSON: [open]({row_link})")
if eval_link:
lines.append(f"Eval Info: [open]({eval_link})")
if train_link:
lines.append(f"Train Config: [open]({train_link})")
return "\n\n".join(lines)
def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
df = load_rows()
latest_table = make_latest_table(df)
benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
if benchmark not in benchmark_names and benchmark_names:
benchmark = benchmark_names[0]
policy_choices = ["All"]
if benchmark and not df.empty:
policy_choices.extend(sorted(df[df["benchmark"] == benchmark]["policy"].dropna().unique().tolist()))
if policy not in policy_choices:
policy = "All"
history = make_history_figure(df, benchmark, policy)
summary = make_run_markdown(df, benchmark, policy)
return latest_table, gr.update(choices=policy_choices, value=policy), history, summary
with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
gr.Markdown(
f"""
# LeRobot Benchmark Leaderboard
Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
"""
)
with gr.Row():
benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
refresh_button = gr.Button("Refresh")
latest_table = gr.Dataframe(label="Latest Results", interactive=False)
history_plot = gr.Plot(label="History")
latest_summary = gr.Markdown()
def _initial_state():
df = load_rows()
benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
benchmark = benchmarks[0] if benchmarks else ""
latest, policy_choices, history, summary = refresh_view(benchmark, "All")
return (
gr.update(choices=benchmarks, value=benchmark),
policy_choices,
latest,
history,
summary,
)
demo.load(
_initial_state,
outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
)
refresh_button.click(
refresh_view,
inputs=[benchmark_dropdown, policy_dropdown],
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
)
benchmark_dropdown.change(
refresh_view,
inputs=[benchmark_dropdown, policy_dropdown],
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
)
policy_dropdown.change(
refresh_view,
inputs=[benchmark_dropdown, policy_dropdown],
outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
)
if __name__ == "__main__":
demo.launch()