From c505a71f7817c7427777c5aac758537dbbac556d Mon Sep 17 00:00:00 2001 From: Pepijn Date: Fri, 10 Apr 2026 12:47:58 +0200 Subject: [PATCH] fix(ci): address PR review feedback for benchmark smoke tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security: - Remove "Login to Hugging Face" step — it was a no-op (ephemeral --rm container) that exposed the HF token via CLI argument in docker inspect / /proc/*/cmdline. The eval step already re-authenticates via env var. Functional: - Remove feat/benchmark-ci from push trigger branches (won't exist post-merge). Dockerfiles: - Pin uv to 0.8.0 (was unpinned, fetching whatever latest ships). - Add comment explaining the chmod +x ptxas workaround (Triton packaging bug — ships ptxas without execute bit). Scripts: - parse_eval_metrics.py: add note that it runs on bare host and must stay stdlib-only. - parse_eval_metrics.py: add NaN guard for avg_sum_reward and eval_s (was only guarding pc_success). Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark_tests.yml | 9 --------- docker/Dockerfile.benchmark.libero | 5 ++++- docker/Dockerfile.benchmark.metaworld | 5 ++++- scripts/ci/parse_eval_metrics.py | 14 ++++++++++++-- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index c38d124dc..612c403c0 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -31,7 +31,6 @@ on: push: branches: - - feat/benchmark-ci - main paths: - "src/lerobot/envs/**" @@ -101,14 +100,6 @@ jobs: load: true tags: lerobot-benchmark-libero:ci - - name: Login to Hugging Face - if: env.HF_USER_TOKEN != '' - run: | - docker run --rm \ - -e HF_HOME=/tmp/hf \ - lerobot-benchmark-libero:ci \ - bash -c "hf auth login --token '$HF_USER_TOKEN' --add-to-git-credential && hf auth whoami" - - name: Run Libero smoke eval (1 episode) run: | # Named container (no --rm) so 
we can docker cp artifacts out. diff --git a/docker/Dockerfile.benchmark.libero b/docker/Dockerfile.benchmark.libero index 878f8b473..9037fd87e 100644 --- a/docker/Dockerfile.benchmark.libero +++ b/docker/Dockerfile.benchmark.libero @@ -43,7 +43,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python${PYTHON_VERSION} \ python${PYTHON_VERSION}-venv \ python${PYTHON_VERSION}-dev \ - && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && curl -LsSf https://astral.sh/uv/0.8.0/install.sh | sh \ && mv /root/.local/bin/uv /usr/local/bin/uv \ && useradd --create-home --shell /bin/bash user_lerobot \ && usermod -aG sudo user_lerobot \ @@ -84,6 +84,9 @@ snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \ printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \ > /home/user_lerobot/.libero/config.yaml +# Workaround: Triton ships ptxas without the execute bit set. +# Without this chmod, any JIT compilation (e.g. torch.compile) fails +# with "Permission denied". 
See the Triton issue tracker: https://github.com/triton-lang/triton/issues (TODO: add the exact issue number; the original link's "2due" is not a valid issue ID) RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas # ── Source layer (rebuilds in seconds on code-only changes) ───────────────── diff --git a/docker/Dockerfile.benchmark.metaworld b/docker/Dockerfile.benchmark.metaworld index 0f4d8a988..fd4ebd142 100644 --- a/docker/Dockerfile.benchmark.metaworld +++ b/docker/Dockerfile.benchmark.metaworld @@ -43,7 +43,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python${PYTHON_VERSION} \ python${PYTHON_VERSION}-venv \ python${PYTHON_VERSION}-dev \ - && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && curl -LsSf https://astral.sh/uv/0.8.0/install.sh | sh \ && mv /root/.local/bin/uv /usr/local/bin/uv \ && useradd --create-home --shell /bin/bash user_lerobot \ && usermod -aG sudo user_lerobot \ @@ -69,6 +69,9 @@ RUN mkdir -p src/lerobot && touch src/lerobot/__init__.py src/lerobot/py.typed RUN uv sync --locked --extra metaworld --extra smolvla --no-cache +# Workaround: Triton ships ptxas without the execute bit set. +# Without this chmod, any JIT compilation (e.g. torch.compile) fails +# with "Permission denied". See the Triton issue tracker: https://github.com/triton-lang/triton/issues (TODO: add the exact issue number; the original link's "2due" is not a valid issue ID) RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas # ── Source layer (rebuilds in seconds on code-only changes) ───────────────── diff --git a/scripts/ci/parse_eval_metrics.py b/scripts/ci/parse_eval_metrics.py index 6386a3e3d..7666a7a5a 100644 --- a/scripts/ci/parse_eval_metrics.py +++ b/scripts/ci/parse_eval_metrics.py @@ -19,6 +19,9 @@ Reads eval_info.json written by lerobot-eval --output_dir and extracts the key metrics needed by the health dashboard. Handles both single-task and multi-task eval output formats. +NOTE: This script runs on the bare CI runner (not inside Docker), so it +must use only Python stdlib modules. Do not add third-party imports. 
+ Usage: python scripts/ci/parse_eval_metrics.py \\ --artifacts-dir /tmp/libero-artifacts \\ @@ -54,12 +57,19 @@ def _extract_metrics(info: dict) -> tuple[float | None, int | None, float | None n = agg.get("n_episodes") reward = agg.get("avg_sum_reward") eval_s = agg.get("eval_s") + + def _safe_float(v: float | int | None) -> float | None: + if v is None: + return None + f = float(v) + return None if math.isnan(f) else f + if pc is not None and not math.isnan(pc): return ( float(pc), int(n) if n is not None else None, - float(reward) if reward is not None else None, - float(eval_s) if eval_s is not None else None, + _safe_float(reward), + _safe_float(eval_s), ) return None, None, None, None