feat(benchmarks): add matrix runner and leaderboard

2026-07-23 01:41:54 +00:00 · 2026-04-15 21:31:33 +02:00
parent dab511dbb1
commit 2ab59a3099
21 changed files with 2096 additions and 50 deletions
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+from benchmarks.run_benchmark_matrix import (
+    PlannedJob,
+    compute_gradient_accumulation_steps,
+    plan_jobs,
+    render_sbatch_script,
+    write_manifest,
+)
+
+
+def _one_job(job_list: list[PlannedJob]) -> PlannedJob:
+    assert len(job_list) == 1
+    return job_list[0]
+
+
+def test_compute_gradient_accumulation_steps_for_fixed_effective_batch():
+    assert compute_gradient_accumulation_steps(
+        effective_batch_size=256,
+        num_gpus=8,
+        microbatch_per_gpu=32,
+    ) == 1
+    assert compute_gradient_accumulation_steps(
+        effective_batch_size=256,
+        num_gpus=4,
+        microbatch_per_gpu=32,
+    ) == 2
+    assert compute_gradient_accumulation_steps(
+        effective_batch_size=256,
+        num_gpus=1,
+        microbatch_per_gpu=32,
+    ) == 8
+
+
+def test_plan_jobs_filters_libero_plus_only(tmp_path):
+    jobs = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0", "act"],
+        benchmarks=["libero_plus"],
+    )
+
+    assert [job.benchmark for job in jobs] == ["libero_plus", "libero_plus"]
+    assert [job.policy for job in jobs] == ["pi0", "act"]
+
+
+def test_plan_jobs_includes_libero_plus_and_robomme(tmp_path):
+    jobs = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0"],
+        benchmarks=["libero_plus", "robomme"],
+    )
+
+    assert [job.benchmark for job in jobs] == ["libero_plus", "robomme"]
+    assert jobs[0].effective_batch_size == 256
+    assert jobs[1].effective_batch_size == 256
+
+
+def test_plan_jobs_sets_expected_gpu_and_accumulation(tmp_path):
+    jobs = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0", "xvla", "act"],
+        benchmarks=["robomme"],
+    )
+    by_policy = {job.policy: job for job in jobs}
+
+    assert by_policy["pi0"].num_gpus == 8
+    assert by_policy["pi0"].gradient_accumulation_steps == 1
+    assert by_policy["xvla"].num_gpus == 4
+    assert by_policy["xvla"].gradient_accumulation_steps == 2
+    assert by_policy["act"].num_gpus == 1
+    assert by_policy["act"].gradient_accumulation_steps == 8
+
+
+def test_render_sbatch_script_contains_train_eval_and_publish(tmp_path):
+    job = _one_job(
+        plan_jobs(
+            output_dir=tmp_path,
+            hub_org="lerobot",
+            results_repo="lerobot/benchmark-history",
+            policies=["pi0_fast"],
+            benchmarks=["robomme"],
+        )
+    )
+
+    script = render_sbatch_script(
+        job=job,
+        output_dir=tmp_path,
+        results_repo_id="lerobot/benchmark-history",
+        git_commit="deadbeef",
+    )
+
+    assert "docker/Dockerfile" not in script
+    assert "lerobot-benchmark-robomme:latest" in script
+    assert '--dataset.repo_id="lerobot/robomme"' in script
+    assert '--env.type="robomme"' in script
+    assert "--gradient_accumulation_steps=1" in script
+    assert "lerobot-train-tokenizer" in script
+    assert "benchmarks/publish_benchmark_result.py" in script
+
+
+def test_write_manifest_records_job_metadata(tmp_path):
+    jobs = plan_jobs(
+        output_dir=tmp_path,
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+        policies=["pi0"],
+        benchmarks=["libero_plus", "robomme"],
+    )
+    manifest_path = write_manifest(
+        output_dir=tmp_path,
+        jobs=jobs,
+        git_commit="deadbeef",
+        hub_org="lerobot",
+        results_repo="lerobot/benchmark-history",
+    )
+
+    manifest = json.loads(manifest_path.read_text())
+    assert manifest["git_commit"] == "deadbeef"
+    assert manifest["results_repo"] == "lerobot/benchmark-history"
+    assert [job["benchmark"] for job in manifest["jobs"]] == ["libero_plus", "robomme"]