feat(ci): add parse_eval_metrics step to benchmark workflow

Adds scripts/ci/parse_eval_metrics.py and wires it into both the Libero and MetaWorld jobs so the dashboard can read pc_success, avg_sum_reward and eval_s from the metrics artifact instead of relying on GitHub step timing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-22 12:09:42 +00:00 · 2026-04-09 10:04:30 +02:00
parent 0dd0a8f11a
commit 3534331fcc
2 changed files with 151 additions and 0 deletions
@@ -137,6 +137,15 @@ jobs:
          docker cp libero-eval:/tmp/eval-artifacts/. /tmp/libero-artifacts/ 2>/dev/null || true
          docker rm -f libero-eval || true
      - name: Parse Libero eval metrics
        if: always()
        run: |
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/libero-artifacts \
            --env libero \
            --task libero_spatial \
            --policy pepijn223/smolvla_libero
      - name: Upload Libero rollout video
        if: always()
        uses: actions/upload-artifact@v4
@@ -145,6 +154,14 @@ jobs:
          path: /tmp/libero-artifacts/videos/
          if-no-files-found: warn
      - name: Upload Libero eval metrics
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: libero-metrics
          path: /tmp/libero-artifacts/metrics.json
          if-no-files-found: warn
  # ── METAWORLD ─────────────────────────────────────────────────────────────
  # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain)
  metaworld-integration-test:
@@ -206,6 +223,15 @@ jobs:
          docker cp metaworld-eval:/tmp/eval-artifacts/. /tmp/metaworld-artifacts/ 2>/dev/null || true
          docker rm -f metaworld-eval || true
      - name: Parse MetaWorld eval metrics
        if: always()
        run: |
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/metaworld-artifacts \
            --env metaworld \
            --task metaworld-push-v3 \
            --policy pepijn223/smolvla_metaworld
      - name: Upload MetaWorld rollout video
        if: always()
        uses: actions/upload-artifact@v4
@@ -213,3 +239,11 @@ jobs:
          name: metaworld-rollout-video
          path: /tmp/metaworld-artifacts/videos/
          if-no-files-found: warn
      - name: Upload MetaWorld eval metrics
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: metaworld-metrics
          path: /tmp/metaworld-artifacts/metrics.json
          if-no-files-found: warn
@@ -0,0 +1,117 @@
 #!/usr/bin/env python3
 # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Parse lerobot-eval output into a small metrics.json artifact.
 Reads eval_info.json written by lerobot-eval --output_dir and extracts the
 key metrics needed by the health dashboard. Handles both single-task and
 multi-task eval output formats.
 Usage:
    python scripts/ci/parse_eval_metrics.py \\
        --artifacts-dir /tmp/libero-artifacts \\
        --env libero \\
        --task libero_spatial \\
        --policy pepijn223/smolvla_libero
 Writes <artifacts-dir>/metrics.json. The CI workflow then uploads this file
 as a GitHub Actions artifact named "<env>-metrics".
 """
 from __future__ import annotations
 import argparse
 import json
 import math
 import sys
 from pathlib import Path
 def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None, float | None]:
    """Extract (pc_success, n_episodes, avg_sum_reward, eval_s) from eval_info.json.
    Handles two output shapes:
      - Single-task: {"aggregated": {"pc_success": 80.0, ...}}
      - Multi-task:  {"overall": {"pc_success": 80.0, "n_episodes": 5, ...}}
    """
    for key in ("aggregated", "overall"):
        if key not in info:
            continue
        agg = info[key]
        pc = agg.get("pc_success")
        n = agg.get("n_episodes")
        reward = agg.get("avg_sum_reward")
        eval_s = agg.get("eval_s")
        if pc is not None and not math.isnan(pc):
            return (
                float(pc),
                int(n) if n is not None else None,
                float(reward) if reward is not None else None,
                float(eval_s) if eval_s is not None else None,
            )
    return None, None, None, None
 def main() -> int:
    """Parse eval_info.json from the artifacts dir and write metrics.json next to it.

    The script is deliberately best-effort: a missing, unreadable or malformed
    eval_info.json only produces a warning on stderr, and metrics.json is still
    written with the metric fields set to null.

    Returns:
        Process exit code; always 0 once metrics.json has been written.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--artifacts-dir", required=True, help="Path to the mounted artifacts volume")
    parser.add_argument("--env", required=True, help="Environment name (e.g. libero)")
    parser.add_argument("--task", required=True, help="Task name (e.g. libero_spatial)")
    parser.add_argument("--policy", required=True, help="Policy hub path (e.g. pepijn223/smolvla_libero)")
    args = parser.parse_args()

    artifacts_dir = Path(args.artifacts_dir)
    eval_info_path = artifacts_dir / "eval_info.json"

    pc_success: float | None = None
    n_episodes: int | None = None
    avg_sum_reward: float | None = None
    eval_s: float | None = None

    if eval_info_path.exists():
        try:
            info = json.loads(eval_info_path.read_text(encoding="utf-8"))
            pc_success, n_episodes, avg_sum_reward, eval_s = _extract_metrics(info)
        except (OSError, ValueError, KeyError, TypeError, AttributeError, OverflowError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError; the
            # remaining types cover unexpected shapes/values inside valid JSON.
            print(f"[parse_eval_metrics] Warning: could not parse eval_info.json: {exc}", file=sys.stderr)
    else:
        print(
            f"[parse_eval_metrics] Warning: {eval_info_path} not found — eval may have failed.",
            file=sys.stderr,
        )

    metrics = {
        "env": args.env,
        "task": args.task,
        "policy": args.policy,
        "pc_success": pc_success,
        "n_episodes": n_episodes,
        "avg_sum_reward": avg_sum_reward,
        "eval_s": eval_s,
    }
    # NaN/Infinity are not valid JSON; emit null so strict parsers can read the artifact.
    metrics = {
        name: None if isinstance(value, float) and not math.isfinite(value) else value
        for name, value in metrics.items()
    }
    payload = json.dumps(metrics, indent=2, allow_nan=False)

    # The directory may be absent when copying artifacts out of the container failed.
    artifacts_dir.mkdir(parents=True, exist_ok=True)
    out_path = artifacts_dir / "metrics.json"
    out_path.write_text(payload, encoding="utf-8")

    print(f"[parse_eval_metrics] Written: {out_path}")
    print(payload)
    return 0
 # Script entry point: propagate main()'s return value as the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())