feat(benchmarks): add matrix runner and leaderboard

2026-05-23 12:40:08 +00:00 · 2026-04-15 21:31:33 +02:00
parent dab511dbb1
commit 2ab59a3099
21 changed files with 2096 additions and 50 deletions
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Publish benchmark rows and lightweight artifacts to a Hub dataset."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from lerobot.utils.history_repo import UploadTarget, make_hub_file_url, upload_targets, utc_timestamp_slug
+
+
+def load_json_if_exists(path: Path) -> dict[str, Any] | None:
+    if not path.exists():
+        return None
+    return json.loads(path.read_text())
+
+
+def find_latest_train_config_path(run_root: Path) -> Path | None:
+    checkpoints_dir = run_root / "train" / "checkpoints"
+    if not checkpoints_dir.exists():
+        return None
+    candidates = sorted(
+        checkpoints_dir.glob("*/pretrained_model/train_config.json"),
+        key=lambda path: path.parts[-3],
+    )
+    return candidates[-1] if candidates else None
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--benchmark", required=True)
+    parser.add_argument("--policy", required=True)
+    parser.add_argument("--run_root", required=True, type=Path)
+    parser.add_argument("--results_repo", required=True)
+    parser.add_argument("--git_commit", required=True)
+    parser.add_argument("--num_gpus", required=True, type=int)
+    parser.add_argument("--microbatch_per_gpu", required=True, type=int)
+    parser.add_argument("--gradient_accumulation_steps", required=True, type=int)
+    parser.add_argument("--effective_batch_size", required=True, type=int)
+    parser.add_argument("--train_wall_time_s", required=True, type=float)
+    parser.add_argument("--eval_wall_time_s", required=True, type=float)
+    parser.add_argument("--slurm_job_id", default="")
+    parser.add_argument("--docker_image", required=True)
+    return parser.parse_args()
+
+
+def build_row(args: argparse.Namespace) -> tuple[dict[str, Any], list[UploadTarget]]:
+    now = datetime.now(UTC)
+    created_at = now.isoformat()
+    timestamp = utc_timestamp_slug(now)
+    run_id = f"{timestamp}__{args.benchmark}__{args.policy}__{args.slurm_job_id or 'manual'}"
+    eval_info = load_json_if_exists(args.run_root / "eval" / "eval_info.json") or {}
+    train_config_path = find_latest_train_config_path(args.run_root)
+    train_config = load_json_if_exists(train_config_path) or {}
+
+    artifact_prefix = f"artifacts/{args.benchmark}/{args.policy}/{run_id}"
+    row_path_in_repo = f"rows/{args.benchmark}/{args.policy}/{run_id}.json"
+
+    row = {
+        "schema_version": 1,
+        "created_at": created_at,
+        "run_id": run_id,
+        "benchmark": args.benchmark,
+        "policy": args.policy,
+        "git_commit": args.git_commit,
+        "slurm_job_id": args.slurm_job_id or None,
+        "docker_image": args.docker_image,
+        "resources": {
+            "num_gpus": args.num_gpus,
+            "microbatch_per_gpu": args.microbatch_per_gpu,
+            "gradient_accumulation_steps": args.gradient_accumulation_steps,
+            "effective_batch_size": args.effective_batch_size,
+        },
+        "timings": {
+            "train_wall_time_s": args.train_wall_time_s,
+            "eval_wall_time_s": args.eval_wall_time_s,
+            "total_wall_time_s": args.train_wall_time_s + args.eval_wall_time_s,
+        },
+        "eval": {
+            "overall": eval_info.get("overall", {}),
+            "per_group": eval_info.get("per_group", {}),
+            "per_task_count": len(eval_info.get("per_task", [])),
+        },
+        "paths": {
+            "run_root": str(args.run_root),
+            "train_dir": str(args.run_root / "train"),
+            "eval_dir": str(args.run_root / "eval"),
+        },
+        "train_config": train_config,
+        "artifact_urls": {
+            "row": make_hub_file_url(args.results_repo, row_path_in_repo),
+        },
+    }
+
+    row_path = args.run_root / "benchmark_row.json"
+    row_path.parent.mkdir(parents=True, exist_ok=True)
+    upload_list = [UploadTarget(local_path=row_path, path_in_repo=row_path_in_repo)]
+
+    eval_info_path = args.run_root / "eval" / "eval_info.json"
+    if eval_info_path.exists():
+        row["artifact_urls"]["eval_info"] = make_hub_file_url(
+            args.results_repo, f"{artifact_prefix}/eval_info.json"
+        )
+        upload_list.append(
+            UploadTarget(local_path=eval_info_path, path_in_repo=f"{artifact_prefix}/eval_info.json")
+        )
+
+    if train_config_path is not None and train_config_path.exists():
+        row["artifact_urls"]["train_config"] = make_hub_file_url(
+            args.results_repo, f"{artifact_prefix}/train_config.json"
+        )
+        upload_list.append(
+            UploadTarget(local_path=train_config_path, path_in_repo=f"{artifact_prefix}/train_config.json")
+        )
+
+    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
+    return row, upload_list
+
+
+def main() -> int:
+    args = parse_args()
+    row, upload_list = build_row(args)
+    uploaded = upload_targets(
+        repo_id=args.results_repo,
+        targets=upload_list,
+        repo_type="dataset",
+        private=False,
+        commit_message=f"Add benchmark row {row['run_id']}",
+    )
+    row["uploaded_paths"] = uploaded
+    row_path = args.run_root / "benchmark_row.json"
+    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
+    print(json.dumps(row, indent=2, sort_keys=True))
+    return 0
+
+
+# Script entry point: exit with the integer status returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())