mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 20:19:43 +00:00
452d9abaa4
- spaces/health-dashboard/app.py: Gradio Space that queries the GitHub Actions API directly (no extra datastore). Shows benchmark status badges, success-rate and duration trend charts, and embeds the latest rollout video per benchmark. Results cached 5 min in-memory; video files cached on disk by artifact ID so downloads only happen once. - spaces/health-dashboard/requirements.txt + README.md: Space card with setup instructions for the GITHUB_RO_TOKEN secret (actions:read, metadata:read only). - scripts/ci/parse_eval_metrics.py: runs on the CI host after each eval, reads eval_info.json written by lerobot-eval, extracts pc_success and n_episodes, and writes metrics.json to the artifacts dir. - .github/workflows/benchmark_tests.yml: add "Parse … metrics" and "Upload … metrics" steps (if: always()) after each eval so the dashboard has data even when the eval fails. The Space should be deployed as a private Space under the huggingface org. Required secret: GITHUB_RO_TOKEN (fine-grained, read-only). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
114 lines
3.8 KiB
Python
114 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Parse lerobot-eval output into a small metrics.json artifact.
|
|
|
|
Reads eval_info.json written by lerobot-eval --output_dir and extracts the
|
|
key metrics needed by the health dashboard. Handles both single-task and
|
|
multi-task eval output formats.
|
|
|
|
Usage:
|
|
python scripts/ci/parse_eval_metrics.py \\
|
|
--artifacts-dir /tmp/libero-artifacts \\
|
|
--env libero \\
|
|
--task libero_spatial \\
|
|
--policy pepijn223/smolvla_libero
|
|
|
|
Writes <artifacts-dir>/metrics.json. The CI workflow then uploads this file
|
|
as a GitHub Actions artifact named "<env>-metrics".
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import math
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def _extract_pc_success(info: dict) -> tuple[float | None, int | None]:
|
|
"""Extract (pc_success, n_episodes) from eval_info.json.
|
|
|
|
Handles two output shapes:
|
|
- Single-task: {"aggregated": {"pc_success": 80.0, ...}}
|
|
- Multi-task: {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}
|
|
"""
|
|
# Single-task path
|
|
if "aggregated" in info:
|
|
agg = info["aggregated"]
|
|
pc = agg.get("pc_success")
|
|
n = agg.get("n_episodes") # may be absent in older format
|
|
if pc is not None and not math.isnan(pc):
|
|
return float(pc), int(n) if n is not None else None
|
|
|
|
# Multi-task path
|
|
if "overall" in info:
|
|
overall = info["overall"]
|
|
pc = overall.get("pc_success")
|
|
n = overall.get("n_episodes")
|
|
if pc is not None and not math.isnan(pc):
|
|
return float(pc), int(n) if n is not None else None
|
|
|
|
return None, None
|
|
|
|
|
|
def main() -> int:
    """Read eval_info.json from the artifacts dir and write metrics.json.

    Command-line arguments supply the artifacts directory plus the env, task
    and policy labels that are copied verbatim into the output. A missing,
    unreadable or malformed eval_info.json only produces a warning on stderr;
    metrics.json is still written with ``pc_success`` and ``n_episodes`` set
    to null. The artifacts directory is created if it does not exist yet.

    Returns:
        Process exit code; 0 whenever metrics.json was written.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
    args = parser.parse_args()

    artifacts_dir = Path(args.artifacts_dir)
    eval_info_path = artifacts_dir / "eval_info.json"

    pc_success: float | None = None
    n_episodes: int | None = None

    # Attempt the read directly instead of exists()-then-read, so a file that
    # vanishes or is unreadable yields a warning rather than a traceback.
    try:
        raw = eval_info_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        print(
            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
            file=sys.stderr,
        )
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError from a corrupt / binary file.
        print(f"[parse_eval_metrics] Warning: could not read {eval_info_path}: {exc}", file=sys.stderr)
    else:
        try:
            info = json.loads(raw)
            pc_success, n_episodes = _extract_pc_success(info)
        except (ValueError, KeyError, TypeError, AttributeError, OverflowError) as exc:
            # json.JSONDecodeError is a ValueError; the remaining types cover
            # unexpected shapes or values inside an otherwise valid document.
            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
        else:
            if pc_success is None:
                print(
                    "[parse_eval_metrics] Warning: no usable pc_success found in eval_info.json.",
                    file=sys.stderr,
                )

    metrics = {
        "env": args.env,
        "task": args.task,
        "policy": args.policy,
        "pc_success": pc_success,
        "n_episodes": n_episodes,
    }

    # The eval may have failed before creating the directory; the dashboard
    # still needs a metrics.json to be uploaded.
    artifacts_dir.mkdir(parents=True, exist_ok=True)
    out_path = artifacts_dir / "metrics.json"
    rendered = json.dumps(metrics, indent=2)
    out_path.write_text(rendered, encoding="utf-8")
    print(f"[parse_eval_metrics] Written: {out_path}")
    print(rendered)

    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|