mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 11:09:59 +00:00
227 lines
8.3 KiB
Python
227 lines
8.3 KiB
Python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import gradio as gr
|
|
import pandas as pd
|
|
import plotly.express as px
|
|
from huggingface_hub import HfApi, hf_hub_download
|
|
|
|
# Hub dataset repo that holds the benchmark rows; can be overridden through the
# BENCHMARK_RESULTS_REPO environment variable.
RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
# Local directory passed to hf_hub_download as its cache_dir. It is created at
# import time so downloads never hit a missing directory.
CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Maximum age, in seconds, of an in-memory entry before load_rows re-fetches it.
CACHE_TTL_S = 300

# In-process cache: cache key -> (time.monotonic() value when stored, DataFrame).
_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
|
|
|
|
|
|
def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
|
|
overall = row.get("eval", {}).get("overall", {})
|
|
resources = row.get("resources", {})
|
|
timings = row.get("timings", {})
|
|
artifact_urls = row.get("artifact_urls", {})
|
|
return {
|
|
"created_at": row.get("created_at"),
|
|
"benchmark": row.get("benchmark"),
|
|
"policy": row.get("policy"),
|
|
"success_rate": overall.get("pc_success"),
|
|
"n_episodes": overall.get("n_episodes"),
|
|
"avg_sum_reward": overall.get("avg_sum_reward"),
|
|
"train_wall_time_s": timings.get("train_wall_time_s"),
|
|
"eval_wall_time_s": timings.get("eval_wall_time_s"),
|
|
"total_wall_time_s": timings.get("total_wall_time_s"),
|
|
"num_gpus": resources.get("num_gpus"),
|
|
"microbatch_per_gpu": resources.get("microbatch_per_gpu"),
|
|
"gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
|
|
"effective_batch_size": resources.get("effective_batch_size"),
|
|
"git_commit": row.get("git_commit"),
|
|
"row_url": artifact_urls.get("row"),
|
|
"eval_info_url": artifact_urls.get("eval_info"),
|
|
"train_config_url": artifact_urls.get("train_config"),
|
|
}
|
|
|
|
|
|
def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
    """Fetch all benchmark rows of a results dataset as one DataFrame.

    Every file under ``rows/`` in the dataset repo is downloaded, parsed as
    JSON and flattened with ``_row_to_record``. The result is memoised in
    ``_CACHE`` and reused until it is older than ``CACHE_TTL_S`` seconds.

    Args:
        repo_id: Hub dataset repository to read from.

    Returns:
        One row per run, newest first, with ``created_at`` parsed as UTC
        timestamps. When the repo holds no rows the frame is empty but still
        carries the record columns, so callers can filter on them safely.
    """
    cache_key = f"rows::{repo_id}"
    cached = _CACHE.get(cache_key)
    if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
        return cached[1]

    api = HfApi()
    files = [
        path
        for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        if path.startswith("rows/")
    ]
    records: list[dict[str, Any]] = []
    for path_in_repo in sorted(files, reverse=True):
        local_path = hf_hub_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=path_in_repo,
            cache_dir=CACHE_DIR,
        )
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(local_path, encoding="utf-8") as f:
            row = json.load(f)
        records.append(_row_to_record(row))

    if records:
        df = pd.DataFrame.from_records(records)
        df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
        df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
    else:
        # Keep the schema on an empty result: a column-less frame would make
        # later lookups such as df["benchmark"] raise KeyError.
        df = pd.DataFrame(columns=list(_row_to_record({})))
    _CACHE[cache_key] = (time.monotonic(), df)
    return df
|
|
|
|
|
|
def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
    """Build the leaderboard table: the most recent run per (benchmark, policy).

    Args:
        df: Run records with at least the columns selected below plus
            ``created_at``. An empty frame is returned unchanged.

    Returns:
        One row per (benchmark, policy) pair, taken entirely from that pair's
        newest run, ordered by benchmark and then by descending success rate
        (missing success rates last, ties broken by policy name). The index is
        a fresh 0..n-1 range.
    """
    if df.empty:
        return df

    key_columns = ["benchmark", "policy"]
    latest = (
        # Rows without a benchmark or policy cannot be placed on the board.
        df.dropna(subset=key_columns)
        .sort_values("created_at", ascending=False, kind="stable")
        # Keep the whole newest row per pair. GroupBy.first() is unsuitable
        # here: it picks the first non-null value per column and can therefore
        # blend fields from several different runs into one row.
        .drop_duplicates(subset=key_columns, keep="first")
        .sort_values(
            ["benchmark", "success_rate", "policy"],
            ascending=[True, False, True],
            na_position="last",
        )
        .reset_index(drop=True)
    )
    return latest[
        [
            "benchmark",
            "policy",
            "success_rate",
            "n_episodes",
            "train_wall_time_s",
            "eval_wall_time_s",
            "num_gpus",
            "effective_batch_size",
            "git_commit",
            "row_url",
            "eval_info_url",
            "train_config_url",
        ]
    ]
|
|
|
|
|
|
def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
    """Plot success rate over time for one benchmark.

    Args:
        df: Run records; may be empty (even without any columns).
        benchmark: Benchmark whose runs are plotted.
        policy: Restrict the plot to this policy. ``None``, an empty string
            or ``"All"`` keeps every policy.

    Returns:
        A Plotly line figure with one line per policy, or an empty figure
        titled "No benchmark rows found" when nothing matches.
    """
    # An empty frame may carry no columns at all, in which case indexing
    # df["benchmark"] would raise KeyError; answer with the placeholder instead.
    if df.empty:
        return px.line(title="No benchmark rows found")

    filtered = df[df["benchmark"] == benchmark]
    if policy and policy != "All":
        filtered = filtered[filtered["policy"] == policy]
    if filtered.empty:
        return px.line(title="No benchmark rows found")

    fig = px.line(
        filtered.sort_values("created_at"),
        x="created_at",
        y="success_rate",
        color="policy",
        markers=True,
        hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
        title=f"{benchmark} success rate history",
    )
    fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
    return fig
|
|
|
|
|
|
def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
|
|
filtered = df[df["benchmark"] == benchmark]
|
|
if policy and policy != "All":
|
|
filtered = filtered[filtered["policy"] == policy]
|
|
if filtered.empty:
|
|
return "No matching runs yet."
|
|
latest = filtered.sort_values("created_at", ascending=False).iloc[0]
|
|
row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None
|
|
eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None
|
|
train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None
|
|
lines = [
|
|
f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
|
|
f"Success rate: `{latest['success_rate']}`",
|
|
f"GPUs: `{latest['num_gpus']}`",
|
|
f"Effective batch size: `{latest['effective_batch_size']}`",
|
|
f"Commit: `{latest['git_commit']}`",
|
|
]
|
|
if row_link:
|
|
lines.append(f"Row JSON: [open]({row_link})")
|
|
if eval_link:
|
|
lines.append(f"Eval Info: [open]({eval_link})")
|
|
if train_link:
|
|
lines.append(f"Train Config: [open]({train_link})")
|
|
return "\n\n".join(lines)
|
|
|
|
|
|
def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
    """Recompute every output of the page for the current dropdown selection.

    Args:
        benchmark: Selected benchmark. Replaced by the first known benchmark
            when it is not among the loaded ones and at least one exists.
        policy: Selected policy. Falls back to ``"All"`` when it is not
            offered for the effective benchmark.

    Returns:
        ``(latest results table, policy dropdown update, history figure,
        summary markdown)``.
    """
    df = load_rows()
    has_rows = not df.empty
    table = make_latest_table(df)

    known_benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if has_rows else []
    if known_benchmarks and benchmark not in known_benchmarks:
        benchmark = known_benchmarks[0]

    # "All" is always offered; the concrete policies depend on the benchmark.
    policy_choices = ["All"]
    if benchmark and has_rows:
        policies_on_benchmark = df.loc[df["benchmark"] == benchmark, "policy"]
        policy_choices += sorted(policies_on_benchmark.dropna().unique().tolist())
    if policy not in policy_choices:
        policy = "All"

    figure = make_history_figure(df, benchmark, policy)
    markdown = make_run_markdown(df, benchmark, policy)
    dropdown_update = gr.update(choices=policy_choices, value=policy)
    return table, dropdown_update, figure, markdown
|
|
|
|
|
|
# Page layout and event wiring. Components are created in display order, and
# all event listeners are registered inside the Blocks context.
with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
    gr.Markdown(
        f"""
        # LeRobot Benchmark Leaderboard

        Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
        """
    )

    # Selection controls; the real choices are filled in on page load.
    with gr.Row():
        benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
        policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
        refresh_button = gr.Button("Refresh")

    latest_table = gr.Dataframe(label="Latest Results", interactive=False)
    history_plot = gr.Plot(label="History")
    latest_summary = gr.Markdown()

    def _initial_state():
        """Compute the values for all five outputs when the page first loads.

        Selects the alphabetically first benchmark (or an empty string when no
        rows exist) together with the "All" policy.
        """
        df = load_rows()
        benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
        benchmark = benchmarks[0] if benchmarks else ""
        # Despite its name, `policy_choices` is the policy-dropdown update
        # object returned by refresh_view, not a plain list of choices.
        latest, policy_choices, history, summary = refresh_view(benchmark, "All")
        return (
            gr.update(choices=benchmarks, value=benchmark),
            policy_choices,
            latest,
            history,
            summary,
        )

    # Initial population on page load.
    demo.load(
        _initial_state,
        outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
    )
    # The button and both dropdowns all trigger the same refresh with the
    # same inputs and outputs.
    refresh_button.click(
        refresh_view,
        inputs=[benchmark_dropdown, policy_dropdown],
        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
    )
    benchmark_dropdown.change(
        refresh_view,
        inputs=[benchmark_dropdown, policy_dropdown],
        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
    )
    policy_dropdown.change(
        refresh_view,
        inputs=[benchmark_dropdown, policy_dropdown],
        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
    )
|
|
|
|
|
|
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|