feat(benchmarks): add matrix runner and leaderboard

2026-07-22 01:11:57 +00:00 · 2026-04-15 21:31:33 +02:00
parent dab511dbb1
commit 2ab59a3099
21 changed files with 2096 additions and 50 deletions
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+from benchmarks.run_benchmark_matrix import (
+    PlannedJob,
+    compute_gradient_accumulation_steps,
+    plan_jobs,
+    render_sbatch_script,
+    write_manifest,
+)
+
+
+def _one_job(job_list: list[PlannedJob]) -> PlannedJob:
+    """Return the only job in ``job_list``.
+
+    The calling test fails with an ``AssertionError`` when the list does not
+    hold exactly one job.
+    """
+    job_count = len(job_list)
+    assert job_count == 1
+    only_job = job_list[0]
+    return only_job
+
+
+def test_compute_gradient_accumulation_steps_for_fixed_effective_batch():
+    """Check accumulation steps for a fixed effective batch of 256 and microbatch of 32.
+
+    The expected values below equal 256 / (num_gpus * 32) for each GPU count.
+    """
+    # GPU count -> expected accumulation steps, checked in this order.
+    expected_steps_by_gpu_count = {8: 1, 4: 2, 1: 8}
+
+    for gpu_count, expected_steps in expected_steps_by_gpu_count.items():
+        steps = compute_gradient_accumulation_steps(
+            effective_batch_size=256,
+            num_gpus=gpu_count,
+            microbatch_per_gpu=32,
+        )
+        assert steps == expected_steps
+
+
+def test_plan_jobs_filters_libero_plus_only(tmp_path):
+    """Requesting only ``libero_plus`` yields one job per policy, in request order."""
+    planned = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0", "act"],
+        benchmarks=["libero_plus"],
+    )
+
+    benchmark_names = [planned_job.benchmark for planned_job in planned]
+    assert benchmark_names == ["libero_plus", "libero_plus"]
+
+    policy_names = [planned_job.policy for planned_job in planned]
+    assert policy_names == ["pi0", "act"]
+
+
+def test_plan_jobs_includes_libero_plus_and_robomme(tmp_path):
+    """One policy across two benchmarks gives two jobs, each with effective batch 256."""
+    planned = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0"],
+        benchmarks=["libero_plus", "robomme"],
+    )
+
+    benchmark_names = [planned_job.benchmark for planned_job in planned]
+    assert benchmark_names == ["libero_plus", "robomme"]
+
+    # Both planned jobs are expected to share the same effective batch size.
+    for position in (0, 1):
+        assert planned[position].effective_batch_size == 256
+
+
+def test_plan_jobs_sets_expected_gpu_and_accumulation(tmp_path):
+    """Each policy is planned with its expected GPU count and accumulation steps."""
+    planned = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0", "xvla", "act"],
+        benchmarks=["robomme"],
+    )
+
+    # Index the planned jobs by policy name so each can be checked directly.
+    job_for_policy = {}
+    for planned_job in planned:
+        job_for_policy[planned_job.policy] = planned_job
+
+    # (policy, expected num_gpus, expected gradient_accumulation_steps)
+    expectations = (
+        ("pi0", 8, 1),
+        ("xvla", 4, 2),
+        ("act", 1, 8),
+    )
+    for policy_name, expected_gpus, expected_accumulation in expectations:
+        candidate = job_for_policy[policy_name]
+        assert candidate.num_gpus == expected_gpus
+        assert candidate.gradient_accumulation_steps == expected_accumulation
+
+
+def test_render_sbatch_script_contains_train_eval_and_publish(tmp_path):
+    """The rendered sbatch script for ``pi0_fast`` on ``robomme`` contains the expected snippets."""
+    planned = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0_fast"],
+        benchmarks=["robomme"],
+    )
+    job = _one_job(planned)
+
+    script = render_sbatch_script(
+        job=job,
+        output_dir=tmp_path,
+        results_repo_id="lerobot/benchmark-history",
+        git_commit="deadbeef",
+    )
+
+    # The script must not reference the Dockerfile path.
+    assert "docker/Dockerfile" not in script
+
+    # Every snippet below must appear verbatim in the rendered script.
+    required_snippets = (
+        "lerobot-benchmark-robomme:latest",
+        '--dataset.repo_id="lerobot/robomme"',
+        '--env.type="robomme"',
+        "--gradient_accumulation_steps=1",
+        "lerobot-train-tokenizer",
+        "benchmarks/publish_benchmark_result.py",
+    )
+    for snippet in required_snippets:
+        assert snippet in script
+
+
+def test_write_manifest_records_job_metadata(tmp_path):
+    """The manifest written to disk records the commit, results repo and planned benchmarks."""
+    planned = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0"],
+        benchmarks=["libero_plus", "robomme"],
+    )
+
+    manifest_file = write_manifest(
+        output_dir=tmp_path,
+        jobs=planned,
+        git_commit="deadbeef",
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+    )
+
+    # Read the manifest back from disk and inspect the parsed JSON.
+    manifest = json.loads(manifest_file.read_text())
+    assert manifest["git_commit"] == "deadbeef"
+    assert manifest["results_repo"] == "lerobot/benchmark-history"
+
+    recorded_benchmarks = [entry["benchmark"] for entry in manifest["jobs"]]
+    assert recorded_benchmarks == ["libero_plus", "robomme"]
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import sys
+from types import ModuleType
+from unittest.mock import MagicMock
+
+import numpy as np
+
+
+# ``sys.modules`` keys that the RoboMME stub occupies while installed.
+_ROBOMME_MODULE_NAMES = ("robomme", "robomme.env_record_wrapper")
+
+# Entries displaced by the stub, keyed by module name. A value of ``None``
+# records that the name was absent from ``sys.modules`` before installation.
+_displaced_robomme_modules = {}
+
+
+def _install_robomme_stub():
+    """Register a fake ``robomme`` package in ``sys.modules``.
+
+    The fake package exposes ``robomme.env_record_wrapper.BenchmarkEnvBuilder``
+    as a minimal builder returning mocked environments. Any module already
+    registered under those names is remembered so that
+    ``_uninstall_robomme_stub`` can put it back instead of evicting it.
+    """
+    stub = ModuleType("robomme")
+    wrapper_stub = ModuleType("robomme.env_record_wrapper")
+
+    class FakeBuilder:
+        """Stand-in builder that accepts any keyword arguments and ignores them."""
+
+        def __init__(self, **kwargs):
+            pass
+
+        def make_env_for_episode(self, episode_idx: int, max_steps: int):
+            """Return a ``MagicMock`` env whose ``reset``/``step`` yield a fixed observation."""
+            env = MagicMock()
+            obs = {
+                "front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
+                "wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
+                "joint_state_list": [np.zeros(7, dtype=np.float32)],
+                "gripper_state_list": [np.zeros(2, dtype=np.float32)],
+            }
+            env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"})
+            env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""})
+            return env
+
+    wrapper_stub.BenchmarkEnvBuilder = FakeBuilder
+    stub.env_record_wrapper = wrapper_stub
+
+    for module_name, module in zip(_ROBOMME_MODULE_NAMES, (stub, wrapper_stub)):
+        # setdefault keeps the first displaced entry, so installing twice in a
+        # row never records our own stub as the module to restore.
+        _displaced_robomme_modules.setdefault(module_name, sys.modules.get(module_name))
+        sys.modules[module_name] = module
+
+
+def _uninstall_robomme_stub():
+    """Remove the fake ``robomme`` modules and restore whatever they displaced.
+
+    Calling this without a prior install simply drops the two names from
+    ``sys.modules`` if they are present.
+    """
+    for module_name in _ROBOMME_MODULE_NAMES:
+        previous = _displaced_robomme_modules.pop(module_name, None)
+        if previous is None:
+            sys.modules.pop(module_name, None)
+        else:
+            sys.modules[module_name] = previous
+
+
+def test_robomme_env_config_defaults():
+    """A default-constructed ``RoboMMEEnv`` carries the expected field values."""
+    from lerobot.envs.configs import RoboMMEEnv
+
+    default_cfg = RoboMMEEnv()
+
+    # Field name -> expected default, checked in this order.
+    expected_defaults = {
+        "task": "PickXtimes",
+        "fps": 10,
+        "episode_length": 300,
+        "action_space": "joint_angle",
+        "dataset_split": "test",
+    }
+    for field_name, expected_value in expected_defaults.items():
+        assert getattr(default_cfg, field_name) == expected_value
+
+    assert default_cfg.task_ids is None
+
+
+def test_robomme_features_map():
+    """The default ``RoboMMEEnv.features_map`` maps each raw key to the expected feature name."""
+    from lerobot.envs.configs import RoboMMEEnv
+    from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
+
+    cfg = RoboMMEEnv()
+
+    # (key in features_map, expected mapped name)
+    expected_mapping = (
+        (ACTION, ACTION),
+        ("image", f"{OBS_IMAGES}.image"),
+        ("wrist_image", f"{OBS_IMAGES}.wrist_image"),
+        (OBS_STATE, OBS_STATE),
+    )
+    for source_key, mapped_name in expected_mapping:
+        assert cfg.features_map[source_key] == mapped_name
+
+
+def test_convert_obs_list_format():
+    """``_convert_obs`` on list-valued observations returns the last entry of each list.
+
+    The expected state is the 7 joint values followed by the first gripper value.
+    """
+    _install_robomme_stub()
+    try:
+        from lerobot.envs.robomme import RoboMMEGymEnv
+
+        # Allocate the env without running ``__init__``; only ``_convert_obs`` is exercised.
+        bare_env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
+
+        latest_front = np.full((256, 256, 3), 42, dtype=np.uint8)
+        latest_wrist = np.full((256, 256, 3), 7, dtype=np.uint8)
+        latest_joints = np.arange(7, dtype=np.float32)
+        latest_gripper = np.array([0.5, 0.5], dtype=np.float32)
+
+        # Each list holds an all-zero older entry followed by the distinctive latest entry.
+        obs_raw = {
+            "front_rgb_list": [np.zeros_like(latest_front), latest_front],
+            "wrist_rgb_list": [np.zeros_like(latest_wrist), latest_wrist],
+            "joint_state_list": [np.zeros(7, dtype=np.float32), latest_joints],
+            "gripper_state_list": [np.zeros(2, dtype=np.float32), latest_gripper],
+        }
+
+        converted = bare_env._convert_obs(obs_raw)
+
+        np.testing.assert_array_equal(converted["image"], latest_front)
+        np.testing.assert_array_equal(converted["wrist_image"], latest_wrist)
+        assert converted["state"].shape == (8,)
+        np.testing.assert_array_almost_equal(converted["state"][:7], latest_joints)
+        assert converted["state"][7] == latest_gripper[0]
+    finally:
+        _uninstall_robomme_stub()
+
+
+def test_create_robomme_envs_multi_task():
+    """A comma-separated ``task`` string produces one result entry per task name."""
+    _install_robomme_stub()
+    try:
+        from lerobot.envs.robomme import create_robomme_envs
+
+        # Stand-in env class: calling it just hands back another mock.
+        fake_env_cls = MagicMock(return_value=MagicMock())
+
+        envs_by_task = create_robomme_envs(
+            task="PickXtimes,BinFill,StopCube",
+            n_envs=1,
+            env_cls=fake_env_cls,
+        )
+
+        expected_tasks = {"PickXtimes", "BinFill", "StopCube"}
+        assert set(envs_by_task.keys()) == expected_tasks
+    finally:
+        _uninstall_robomme_stub()