diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
index eed3d4800..5a4b58bc5 100644
--- a/.github/workflows/benchmark_tests.yml
+++ b/.github/workflows/benchmark_tests.yml
@@ -137,6 +137,15 @@ jobs:
           docker cp libero-eval:/tmp/eval-artifacts/. /tmp/libero-artifacts/ 2>/dev/null || true
           docker rm -f libero-eval || true
 
+      - name: Parse Libero eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/libero-artifacts \
+            --env libero \
+            --task libero_spatial \
+            --policy pepijn223/smolvla_libero
+
       - name: Upload Libero rollout video
         if: always()
         uses: actions/upload-artifact@v4
@@ -145,6 +154,14 @@ jobs:
           path: /tmp/libero-artifacts/videos/
           if-no-files-found: warn
 
+      - name: Upload Libero eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: libero-metrics
+          path: /tmp/libero-artifacts/metrics.json
+          if-no-files-found: warn
+
   # ── METAWORLD ─────────────────────────────────────────────────────────────
   # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain)
   metaworld-integration-test:
@@ -206,6 +223,15 @@ jobs:
           docker cp metaworld-eval:/tmp/eval-artifacts/. /tmp/metaworld-artifacts/ 2>/dev/null || true
           docker rm -f metaworld-eval || true
 
+      - name: Parse MetaWorld eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/metaworld-artifacts \
+            --env metaworld \
+            --task metaworld-push-v3 \
+            --policy pepijn223/smolvla_metaworld
+
       - name: Upload MetaWorld rollout video
         if: always()
         uses: actions/upload-artifact@v4
@@ -213,3 +239,11 @@ jobs:
           name: metaworld-rollout-video
           path: /tmp/metaworld-artifacts/videos/
           if-no-files-found: warn
+
+      - name: Upload MetaWorld eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: metaworld-metrics
+          path: /tmp/metaworld-artifacts/metrics.json
+          if-no-files-found: warn
diff --git a/scripts/ci/parse_eval_metrics.py b/scripts/ci/parse_eval_metrics.py
new file mode 100644
index 000000000..b10be7170
--- /dev/null
+++ b/scripts/ci/parse_eval_metrics.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Parse lerobot-eval output into a small metrics.json artifact.
+
+Reads eval_info.json written by lerobot-eval --output_dir and extracts the
+key metrics needed by the health dashboard. Handles both single-task and
+multi-task eval output formats.
+
+Usage:
+    python scripts/ci/parse_eval_metrics.py \\
+        --artifacts-dir /tmp/libero-artifacts \\
+        --env libero \\
+        --task libero_spatial \\
+        --policy pepijn223/smolvla_libero
+
+Writes <artifacts-dir>/metrics.json. The CI workflow then uploads this file
+as a GitHub Actions artifact named "<env>-metrics".
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+
+
+def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None, float | None]:
+    """Extract (pc_success, n_episodes, avg_sum_reward, eval_s) from eval_info.json.
+
+    Handles two output shapes:
+    - Single-task: {"aggregated": {"pc_success": 80.0, ...}}
+    - Multi-task: {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}
+    """
+    for key in ("aggregated", "overall"):
+        if key not in info:
+            continue
+        agg = info[key]
+        pc = agg.get("pc_success")
+        n = agg.get("n_episodes")
+        reward = agg.get("avg_sum_reward")
+        eval_s = agg.get("eval_s")
+        if pc is not None and not math.isnan(pc):
+            return (
+                float(pc),
+                int(n) if n is not None else None,
+                float(reward) if reward is not None else None,
+                float(eval_s) if eval_s is not None else None,
+            )
+
+    return None, None, None, None
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
+    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
+    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
+    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
+    args = parser.parse_args()
+
+    artifacts_dir = Path(args.artifacts_dir)
+    eval_info_path = artifacts_dir / "eval_info.json"
+
+    pc_success: float | None = None
+    n_episodes: int | None = None
+    avg_sum_reward: float | None = None
+    eval_s: float | None = None
+
+    if eval_info_path.exists():
+        try:
+            info = json.loads(eval_info_path.read_text())
+            pc_success, n_episodes, avg_sum_reward, eval_s = _extract_metrics(info)
+        except (json.JSONDecodeError, KeyError, TypeError) as exc:
+            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
+    else:
+        print(
+            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
+            file=sys.stderr,
+        )
+
+    metrics = {
+        "env": args.env,
+        "task": args.task,
+        "policy": args.policy,
+        "pc_success": pc_success,
+        "n_episodes": n_episodes,
+        "avg_sum_reward": avg_sum_reward,
+        "eval_s": eval_s,
+    }
+
+    out_path = artifacts_dir / "metrics.json"
+    out_path.write_text(json.dumps(metrics, indent=2))
+    print(f"[parse_eval_metrics] Written: {out_path}")
+    print(json.dumps(metrics, indent=2))
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
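Reviewer note: below is a minimal, illustrative check of the two eval_info.json shapes _extract_metrics accepts. The numeric values are made up for the example, not real eval results, and it assumes you run it from the repo root so scripts/ci is importable:

    import sys

    sys.path.insert(0, "scripts/ci")
    from parse_eval_metrics import _extract_metrics

    # Single-task shape: metrics live under "aggregated".
    single = {"aggregated": {"pc_success": 80.0, "n_episodes": 5, "avg_sum_reward": 1.8, "eval_s": 312.4}}
    # Multi-task shape: metrics live under "overall".
    multi = {"overall": {"pc_success": 60.0, "n_episodes": 10, "avg_sum_reward": 0.9, "eval_s": 640.0}}

    print(_extract_metrics(single))  # -> (80.0, 5, 1.8, 312.4)
    print(_extract_metrics(multi))   # -> (60.0, 10, 0.9, 640.0)

Either way the function returns (None, None, None, None) when neither key holds a non-NaN pc_success, which is what lets the CI step still emit a metrics.json (with null fields) after a failed eval.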