lerobot/spaces/benchmark-leaderboard/app.py

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import json
import os
import time
from pathlib import Path
from typing import Any

import gradio as gr
import pandas as pd
import plotly.express as px
from huggingface_hub import HfApi, hf_hub_download

RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_TTL_S = 300

_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}


def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
    overall = row.get("eval", {}).get("overall", {})
    resources = row.get("resources", {})
    timings = row.get("timings", {})
    artifact_urls = row.get("artifact_urls", {})
    return {
        "created_at": row.get("created_at"),
        "benchmark": row.get("benchmark"),
        "policy": row.get("policy"),
        "success_rate": overall.get("pc_success"),
        "n_episodes": overall.get("n_episodes"),
        "avg_sum_reward": overall.get("avg_sum_reward"),
        "train_wall_time_s": timings.get("train_wall_time_s"),
        "eval_wall_time_s": timings.get("eval_wall_time_s"),
        "total_wall_time_s": timings.get("total_wall_time_s"),
        "num_gpus": resources.get("num_gpus"),
        "microbatch_per_gpu": resources.get("microbatch_per_gpu"),
        "gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
        "effective_batch_size": resources.get("effective_batch_size"),
        "git_commit": row.get("git_commit"),
        "row_url": artifact_urls.get("row"),
        "eval_info_url": artifact_urls.get("eval_info"),
        "train_config_url": artifact_urls.get("train_config"),
    }


def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
    cache_key = f"rows::{repo_id}"
    cached = _CACHE.get(cache_key)
    if cached is not None and (time.monotonic() - cached[0]) < CACHE_TTL_S:
        return cached[1]

    api = HfApi()
    files = [path for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset") if path.startswith("rows/")]
    records: list[dict[str, Any]] = []
    for path_in_repo in sorted(files, reverse=True):
        local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
        with open(local_path) as f:
            row = json.load(f)
        records.append(_row_to_record(row))

    df = pd.DataFrame.from_records(records)
    if not df.empty:
        df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
        df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
    _CACHE[cache_key] = (time.monotonic(), df)
    return df


def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    latest = (
        df.sort_values("created_at", ascending=False)
        .groupby(["benchmark", "policy"], as_index=False)
        .first()
        .sort_values(["benchmark", "success_rate"], ascending=[True, False], na_position="last")
    )
    return latest[
        [
            "benchmark",
            "policy",
            "success_rate",
            "n_episodes",
            "train_wall_time_s",
            "eval_wall_time_s",
            "num_gpus",
            "effective_batch_size",
            "git_commit",
            "row_url",
            "eval_info_url",
            "train_config_url",
        ]
    ]


def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
    filtered = df[df["benchmark"] == benchmark]
    if policy and policy != "All":
        filtered = filtered[filtered["policy"] == policy]
    if filtered.empty:
        return px.line(title="No benchmark rows found")
    fig = px.line(
        filtered.sort_values("created_at"),
        x="created_at",
        y="success_rate",
        color="policy",
        markers=True,
        hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
        title=f"{benchmark} success rate history",
    )
    fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
    return fig


def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
    filtered = df[df["benchmark"] == benchmark]
    if policy and policy != "All":
        filtered = filtered[filtered["policy"] == policy]
    if filtered.empty:
        return "No matching runs yet."
    latest = filtered.sort_values("created_at", ascending=False).iloc[0]
    row_link = latest["row_url"] if pd.notna(latest["row_url"]) else None
    eval_link = latest["eval_info_url"] if pd.notna(latest["eval_info_url"]) else None
    train_link = latest["train_config_url"] if pd.notna(latest["train_config_url"]) else None
    lines = [
        f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
        f"Success rate: `{latest['success_rate']}`",
        f"GPUs: `{latest['num_gpus']}`",
        f"Effective batch size: `{latest['effective_batch_size']}`",
        f"Commit: `{latest['git_commit']}`",
    ]
    if row_link:
        lines.append(f"Row JSON: [open]({row_link})")
    if eval_link:
        lines.append(f"Eval Info: [open]({eval_link})")
    if train_link:
        lines.append(f"Train Config: [open]({train_link})")
    return "\n\n".join(lines)


def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
    df = load_rows()
    latest_table = make_latest_table(df)
    benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
    if benchmark not in benchmark_names and benchmark_names:
        benchmark = benchmark_names[0]
    policy_choices = ["All"]
    if benchmark and not df.empty:
        policy_choices.extend(sorted(df[df["benchmark"] == benchmark]["policy"].dropna().unique().tolist()))
    if policy not in policy_choices:
        policy = "All"
    history = make_history_figure(df, benchmark, policy)
    summary = make_run_markdown(df, benchmark, policy)
    return latest_table, gr.update(choices=policy_choices, value=policy), history, summary


with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
    gr.Markdown(
        f"""
# LeRobot Benchmark Leaderboard

Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
"""
    )

    with gr.Row():
        benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
        policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
        refresh_button = gr.Button("Refresh")

    latest_table = gr.Dataframe(label="Latest Results", interactive=False)
    history_plot = gr.Plot(label="History")
    latest_summary = gr.Markdown()

    def _initial_state():
        df = load_rows()
        benchmarks = sorted(df["benchmark"].dropna().unique().tolist()) if not df.empty else []
        benchmark = benchmarks[0] if benchmarks else ""
        latest, policy_choices, history, summary = refresh_view(benchmark, "All")
        return (
            gr.update(choices=benchmarks, value=benchmark),
            policy_choices,
            latest,
            history,
            summary,
        )

    demo.load(
        _initial_state,
        outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
    )
    refresh_button.click(
        refresh_view,
        inputs=[benchmark_dropdown, policy_dropdown],
        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
    )
    benchmark_dropdown.change(
        refresh_view,
        inputs=[benchmark_dropdown, policy_dropdown],
        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
    )
    policy_dropdown.change(
        refresh_view,
        inputs=[benchmark_dropdown, policy_dropdown],
        outputs=[latest_table, policy_dropdown, history_plot, latest_summary],
    )


if __name__ == "__main__":
    demo.launch()