# lerobot/.github/workflows/benchmark_tests.yml

# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Integration tests: build an isolated Docker image per benchmark and run a
# 1-episode smoke eval. Each benchmark gets its own image so incompatible
# dependency trees (e.g. hf-libero vs metaworld==3.0.0) can never collide.
#
# To add a new benchmark:
#   1. Add docker/Dockerfile.benchmark.<name>  (install only lerobot[<name>])
#   2. Copy one of the jobs below and adjust the image name and eval command.
name: Benchmark Integration Tests

# Triggers. The `paths` filters list every file whose change can alter the
# outcome of the jobs below: the env implementations, the eval entry point,
# the benchmark Dockerfiles, the metrics parser invoked by the jobs, this
# workflow itself, and the dependency manifest.
on:
  # Run manually from the Actions tab
  workflow_dispatch:

  push:
    branches:
      - feat/benchmark-ci
      - main
    paths:
      - "src/lerobot/envs/**"
      - "src/lerobot/scripts/lerobot_eval.py"
      - "docker/Dockerfile.benchmark.*"
      # Run by the "Parse ... eval metrics" steps, so edits must re-trigger CI.
      - "scripts/ci/parse_eval_metrics.py"
      - ".github/workflows/benchmark_tests.yml"
      - "pyproject.toml"

  pull_request:
    branches:
      - main
    paths:
      - "src/lerobot/envs/**"
      - "src/lerobot/scripts/lerobot_eval.py"
      - "docker/Dockerfile.benchmark.*"
      # Run by the "Parse ... eval metrics" steps, so edits must re-trigger CI.
      - "scripts/ci/parse_eval_metrics.py"
      - ".github/workflows/benchmark_tests.yml"
      - "pyproject.toml"

# Least privilege: the workflow token only gets read access to repository
# contents; no other scope is requested.
permissions:
  contents: read

# Workflow-level environment, exported to the `run` steps of every job.
# Both values are quoted so they stay strings rather than being read as numbers.
# NOTE(review): neither variable is referenced by the steps visible in this
# file (the images are built from their own Dockerfiles) — confirm whether
# they are still needed.
env:
  UV_VERSION: "0.8.0"
  PYTHON_VERSION: "3.12"

# Cancel in-flight runs for the same pull request: `github.head_ref` is only
# populated on pull_request events, so all runs of one PR share a group.
# Push and manual runs fall back to the unique `github.run_id`, which gives
# each of them its own group — they are never cancelled by a newer run.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # ── LIBERO ────────────────────────────────────────────────────────────────
  # Isolated image: lerobot[libero] only (hf-libero, dm-control, mujoco chain)
  #
  # Flow: build the image → (optional) validate the HF token → run a 1-episode
  # eval with /tmp/libero-artifacts mounted at /artifacts → parse metrics on
  # the host → upload the rollout video and metrics.json.
  libero-integration-test:
    name: Libero — build image + 1-episode eval
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      # Empty when the secret is unavailable (e.g. pull requests from forks).
      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false
          lfs: true

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          cache-binary: false

      # Build the benchmark-specific image. The layer cache is a local
      # directory on the runner (/tmp/.buildx-cache-libero), so it is only
      # reused when a later run lands on the same machine. The cache is
      # exported to a fresh "-new" directory and rotated in the next step.
      - name: Build Libero benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
        with:
          context: .
          file: docker/Dockerfile.benchmark.libero
          push: false
          load: true
          tags: lerobot-benchmark-libero:ci
          cache-from: type=local,src=/tmp/.buildx-cache-libero
          cache-to: type=local,dest=/tmp/.buildx-cache-libero-new,mode=max

      # type=local caches are never pruned: exporting into the directory that
      # is also imported from would make it grow on every build. Replace the
      # old cache with the freshly exported one instead.
      - name: Rotate Libero build cache
        run: |
          rm -rf /tmp/.buildx-cache-libero
          mv /tmp/.buildx-cache-libero-new /tmp/.buildx-cache-libero

      # Early token check. The container is removed afterwards (--rm) and no
      # volume holds HF_HOME, so nothing is persisted — the eval step logs in
      # again inside its own container.
      - name: Login to Hugging Face
        if: env.HF_USER_TOKEN != ''
        run: |
          # `-e HF_USER_TOKEN` forwards the variable from the step environment
          # and the single-quoted script is expanded inside the container, so
          # the secret never appears on the docker command line.
          docker run --rm \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN \
            lerobot-benchmark-libero:ci \
            bash -c 'hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential && hf auth whoami'

      - name: Prepare Libero artifact directory
        run: |
          mkdir -p /tmp/libero-artifacts
          # Run as root inside the container so the cleanup and chmod succeed
          # regardless of any host/container UID mismatch. Wiping first keeps
          # files left by an earlier run on this machine out of the parse and
          # upload steps when the eval below fails.
          docker run --rm --user root \
            -v /tmp/libero-artifacts:/artifacts \
            lerobot-benchmark-libero:ci \
            bash -c "find /artifacts -mindepth 1 -delete && mkdir -p /artifacts/videos && chmod -R 777 /artifacts"

      - name: Run Libero smoke eval (1 episode)
        run: |
          # HF_USER_TOKEN is forwarded by name (not by value) to keep the
          # secret off the command line. The in-container login is best
          # effort: it fails harmlessly when no token is configured.
          docker run --rm --gpus all \
            --shm-size=4g \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN \
            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
            -v /tmp/libero-artifacts:/artifacts \
            lerobot-benchmark-libero:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
                --policy.path=pepijn223/smolvla_libero \
                --env.type=libero \
                --env.task=libero_spatial \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
                --policy.device=cuda \
                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
                --policy.empty_cameras=1 \
                --output_dir=/artifacts
            "

      - name: Fix Libero artifact permissions
        if: always()
        run: |
          # New files written by user_lerobot inside the container inherit a
          # restrictive umask; re-chmod as root so the runner can read them.
          docker run --rm --user root \
            -v /tmp/libero-artifacts:/artifacts \
            lerobot-benchmark-libero:ci \
            bash -c "chmod -R 777 /artifacts"

      # Runs on the host (not in the container) against the mounted directory.
      - name: Parse Libero eval metrics
        if: always()
        run: |
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/libero-artifacts \
            --env libero \
            --task libero_spatial \
            --policy pepijn223/smolvla_libero

      # Uploads run even after a failure (always) and only warn when the
      # expected files are missing.
      - name: Upload Libero rollout video
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: libero-rollout-video
          path: /tmp/libero-artifacts/videos/
          if-no-files-found: warn

      - name: Upload Libero eval metrics
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: libero-metrics
          path: /tmp/libero-artifacts/metrics.json
          if-no-files-found: warn

  # ── METAWORLD ─────────────────────────────────────────────────────────────
  # Isolated image: lerobot[metaworld] only (metaworld==3.0.0, mujoco>=3 chain)
  #
  # Flow: build the image → run a 1-episode eval with /tmp/metaworld-artifacts
  # mounted at /artifacts → parse metrics on the host → upload the rollout
  # video and metrics.json.
  metaworld-integration-test:
    name: MetaWorld — build image + 1-episode eval
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      # Empty when the secret is unavailable (e.g. pull requests from forks).
      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false
          lfs: true

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
        with:
          cache-binary: false

      # Build the benchmark-specific image. The layer cache is a local
      # directory on the runner (/tmp/.buildx-cache-metaworld), so it is only
      # reused when a later run lands on the same machine. The cache is
      # exported to a fresh "-new" directory and rotated in the next step.
      - name: Build MetaWorld benchmark image
        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
        with:
          context: .
          file: docker/Dockerfile.benchmark.metaworld
          push: false
          load: true
          tags: lerobot-benchmark-metaworld:ci
          cache-from: type=local,src=/tmp/.buildx-cache-metaworld
          cache-to: type=local,dest=/tmp/.buildx-cache-metaworld-new,mode=max

      # type=local caches are never pruned: exporting into the directory that
      # is also imported from would make it grow on every build. Replace the
      # old cache with the freshly exported one instead.
      - name: Rotate MetaWorld build cache
        run: |
          rm -rf /tmp/.buildx-cache-metaworld
          mv /tmp/.buildx-cache-metaworld-new /tmp/.buildx-cache-metaworld

      - name: Prepare MetaWorld artifact directory
        run: |
          mkdir -p /tmp/metaworld-artifacts
          # Run as root inside the container so the cleanup and chmod succeed
          # regardless of any host/container UID mismatch. Wiping first keeps
          # files left by an earlier run on this machine out of the parse and
          # upload steps when the eval below fails.
          docker run --rm --user root \
            -v /tmp/metaworld-artifacts:/artifacts \
            lerobot-benchmark-metaworld:ci \
            bash -c "find /artifacts -mindepth 1 -delete && mkdir -p /artifacts/videos && chmod -R 777 /artifacts"

      - name: Run MetaWorld smoke eval (1 episode)
        run: |
          # HF_USER_TOKEN is forwarded by name (not by value) to keep the
          # secret off the command line. The in-container login is best
          # effort: it fails harmlessly when no token is configured.
          docker run --rm --gpus all \
            --shm-size=4g \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN \
            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
            -v /tmp/metaworld-artifacts:/artifacts \
            lerobot-benchmark-metaworld:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              lerobot-eval \
                --policy.path=pepijn223/smolvla_metaworld \
                --env.type=metaworld \
                --env.task=metaworld-push-v3 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
                --policy.device=cuda \
                '--rename_map={\"observation.image\": \"observation.images.camera1\"}' \
                --policy.empty_cameras=2 \
                --output_dir=/artifacts
            "

      - name: Fix MetaWorld artifact permissions
        if: always()
        run: |
          # Re-chmod as root so the runner can read whatever the container
          # user wrote into the mounted directory.
          docker run --rm --user root \
            -v /tmp/metaworld-artifacts:/artifacts \
            lerobot-benchmark-metaworld:ci \
            bash -c "chmod -R 777 /artifacts"

      # Runs on the host (not in the container) against the mounted directory.
      - name: Parse MetaWorld eval metrics
        if: always()
        run: |
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/metaworld-artifacts \
            --env metaworld \
            --task metaworld-push-v3 \
            --policy pepijn223/smolvla_metaworld

      # Uploads run even after a failure (always) and only warn when the
      # expected files are missing.
      - name: Upload MetaWorld rollout video
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: metaworld-rollout-video
          path: /tmp/metaworld-artifacts/videos/
          if-no-files-found: warn

      - name: Upload MetaWorld eval metrics
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: metaworld-metrics
          path: /tmp/metaworld-artifacts/metrics.json
          if-no-files-found: warn