From c505a71f7817c7427777c5aac758537dbbac556d Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Fri, 10 Apr 2026 12:47:58 +0200
Subject: [PATCH] fix(ci): address PR review feedback for benchmark smoke tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Security:
- Remove "Login to Hugging Face" step — it was a no-op (ephemeral
  --rm container) that exposed the HF token via CLI argument in
  docker inspect / /proc/*/cmdline. The eval step already
  re-authenticates via env var.

Functional:
- Remove feat/benchmark-ci from push trigger branches (won't exist
  post-merge).

Dockerfiles:
- Pin uv to 0.8.0 (was unpinned, so each build fetched the latest release).
- Add comment explaining the chmod +x ptxas workaround (Triton
  packaging bug — ships ptxas without execute bit).

Scripts:
- parse_eval_metrics.py: add note that it runs on bare host and must
  stay stdlib-only.
- parse_eval_metrics.py: add NaN guard for avg_sum_reward and eval_s
  (was only guarding pc_success).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark_tests.yml |  9 ---------
 docker/Dockerfile.benchmark.libero    |  5 ++++-
 docker/Dockerfile.benchmark.metaworld |  5 ++++-
 scripts/ci/parse_eval_metrics.py      | 14 ++++++++++++--
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
index c38d124dc..612c403c0 100644
--- a/.github/workflows/benchmark_tests.yml
+++ b/.github/workflows/benchmark_tests.yml
@@ -31,7 +31,6 @@ on:
 
   push:
     branches:
-      - feat/benchmark-ci
       - main
     paths:
       - "src/lerobot/envs/**"
@@ -101,14 +100,6 @@ jobs:
           load: true
           tags: lerobot-benchmark-libero:ci
 
-      - name: Login to Hugging Face
-        if: env.HF_USER_TOKEN != ''
-        run: |
-          docker run --rm \
-            -e HF_HOME=/tmp/hf \
-            lerobot-benchmark-libero:ci \
-            bash -c "hf auth login --token '$HF_USER_TOKEN' --add-to-git-credential && hf auth whoami"
-
       - name: Run Libero smoke eval (1 episode)
         run: |
           # Named container (no --rm) so we can docker cp artifacts out.
diff --git a/docker/Dockerfile.benchmark.libero b/docker/Dockerfile.benchmark.libero
index 878f8b473..9037fd87e 100644
--- a/docker/Dockerfile.benchmark.libero
+++ b/docker/Dockerfile.benchmark.libero
@@ -43,7 +43,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-venv \
        python${PYTHON_VERSION}-dev \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && curl -LsSf https://astral.sh/uv/0.8.0/install.sh | sh \
     && mv /root/.local/bin/uv /usr/local/bin/uv \
     && useradd --create-home --shell /bin/bash user_lerobot \
     && usermod -aG sudo user_lerobot \
@@ -84,6 +84,9 @@ snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
     printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
     > /home/user_lerobot/.libero/config.yaml
 
+# Workaround: Triton ships ptxas without the execute bit set.
+# Without this chmod, any JIT compilation (e.g. torch.compile) fails
+# with "Permission denied". Upstream tracker: https://github.com/triton-lang/triton/issues
 RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
 
 # ── Source layer (rebuilds in seconds on code-only changes) ─────────────────
diff --git a/docker/Dockerfile.benchmark.metaworld b/docker/Dockerfile.benchmark.metaworld
index 0f4d8a988..fd4ebd142 100644
--- a/docker/Dockerfile.benchmark.metaworld
+++ b/docker/Dockerfile.benchmark.metaworld
@@ -43,7 +43,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-venv \
        python${PYTHON_VERSION}-dev \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && curl -LsSf https://astral.sh/uv/0.8.0/install.sh | sh \
     && mv /root/.local/bin/uv /usr/local/bin/uv \
     && useradd --create-home --shell /bin/bash user_lerobot \
     && usermod -aG sudo user_lerobot \
@@ -69,6 +69,9 @@ RUN mkdir -p src/lerobot && touch src/lerobot/__init__.py src/lerobot/py.typed
 
 RUN uv sync --locked --extra metaworld --extra smolvla --no-cache
 
+# Workaround: Triton ships ptxas without the execute bit set.
+# Without this chmod, any JIT compilation (e.g. torch.compile) fails
+# with "Permission denied". Upstream tracker: https://github.com/triton-lang/triton/issues
 RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
 
 # ── Source layer (rebuilds in seconds on code-only changes) ─────────────────
diff --git a/scripts/ci/parse_eval_metrics.py b/scripts/ci/parse_eval_metrics.py
index 6386a3e3d..7666a7a5a 100644
--- a/scripts/ci/parse_eval_metrics.py
+++ b/scripts/ci/parse_eval_metrics.py
@@ -19,6 +19,9 @@ Reads eval_info.json written by lerobot-eval --output_dir and extracts the
 key metrics needed by the health dashboard. Handles both single-task and
 multi-task eval output formats.
 
+NOTE: This script runs on the bare CI runner (not inside Docker), so it
+must use only Python stdlib modules. Do not add third-party imports.
+
 Usage:
     python scripts/ci/parse_eval_metrics.py \\
         --artifacts-dir /tmp/libero-artifacts \\
@@ -54,12 +57,19 @@ def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None
         n = agg.get("n_episodes")
         reward = agg.get("avg_sum_reward")
         eval_s = agg.get("eval_s")
+
+        def _safe_float(v: float | int | None) -> float | None:
+            if v is None:
+                return None
+            f = float(v)
+            return None if math.isnan(f) else f
+
         if pc is not None and not math.isnan(pc):
             return (
                 float(pc),
                 int(n) if n is not None else None,
-                float(reward) if reward is not None else None,
-                float(eval_s) if eval_s is not None else None,
+                _safe_float(reward),
+                _safe_float(eval_s),
             )
 
     return None, None, None, None