feat(docker): add per-benchmark evaluation containers

Add Dockerfile.benchmark (parameterized via ARG BENCHMARK), a
docker-compose.benchmark.yml with services for libero, libero_plus,
robomme, and robocasa, and a smoke_test_benchmark.sh that verifies
imports and CLI entry-points in each container.

Also add the missing `robocasa` optional dep group to pyproject.toml
(the docs already referenced `pip install ".[robocasa]"` but the group
was not defined).

Build a specific benchmark image:
  docker build --build-arg BENCHMARK=robomme \
    -f docker/Dockerfile.benchmark -t lerobot-benchmark-robomme .

Build all via compose:
  docker compose -f docker/docker-compose.benchmark.yml build

Smoke-test inside a container:
  docker compose -f docker/docker-compose.benchmark.yml run --rm robomme \
    bash docker/smoke_test_benchmark.sh

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-03-20 21:43:04 -07:00
parent 285c500aef
commit 39cf11d5dc
4 changed files with 300 additions and 0 deletions
+110
View File
@@ -0,0 +1,110 @@
# syntax=docker/dockerfile:1
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Benchmark evaluation container — one image per benchmark, built via BENCHMARK arg.
#
# Supported values for BENCHMARK:
#   libero      — LIBERO suite (spatial / object / goal / 10 / 90)
#   libero_plus — LIBERO-plus extended benchmark (requires robosuite, bddl, robomimic)
#   robomme     — RoboMME memory-augmented manipulation benchmark
#   robocasa    — RoboCasa kitchen composite-task benchmark
#
# Build:
#   docker build --build-arg BENCHMARK=libero -f docker/Dockerfile.benchmark \
#     -t lerobot-benchmark-libero .
#
# Run (interactive):
#   docker run --gpus all --rm -it lerobot-benchmark-libero
# Run eval:
#   docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval --help

ARG CUDA_VERSION=12.4.1
ARG OS_VERSION=22.04
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}

ARG PYTHON_VERSION=3.12
ARG BENCHMARK=libero
# Build-time only: suppress apt prompts during the build without baking
# DEBIAN_FRONTEND into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime defaults shared by all benchmark images. BENCHMARK is exported so
# in-container tooling (e.g. smoke_test_benchmark.sh) knows which suite this
# image was built for.
ENV MUJOCO_GL=egl \
    PATH=/lerobot/.venv/bin:$PATH \
    CUDA_VISIBLE_DEVICES=0 \
    DEVICE=cuda \
    BENCHMARK=${BENCHMARK}

# Fail piped RUN commands (e.g. the `curl | sh` below) on the first broken
# stage instead of masking upstream failures (hadolint DL4006).
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# ── Base system deps (shared across all benchmarks) ───────────────────────────
# NOTE(review): the uv installer is fetched unpinned over the network; consider
# pinning a uv release and/or verifying a checksum for reproducible builds.
# NOTE(review): user_lerobot is added to the `sudo` group but the sudo binary
# is never installed — membership is inert unless sudo is added later; confirm
# intent.
RUN apt-get update && apt-get install -y --no-install-recommends \
    software-properties-common build-essential git curl \
    libglib2.0-0 libgl1-mesa-glx libegl1-mesa libegl1-mesa-dev \
    libglew-dev libglfw3-dev libgl1-mesa-dri \
    ffmpeg libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
    cmake pkg-config ninja-build \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
    python${PYTHON_VERSION} \
    python${PYTHON_VERSION}-venv \
    python${PYTHON_VERSION}-dev \
    && curl -LsSf https://astral.sh/uv/install.sh | sh \
    && mv /root/.local/bin/uv /usr/local/bin/uv \
    && useradd --create-home --shell /bin/bash user_lerobot \
    && usermod -aG sudo user_lerobot \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# ── Benchmark-specific system deps ────────────────────────────────────────────
# libero_plus: the `wand` Python package requires ImageMagick headers.
# The case falls through silently for every other benchmark.
RUN case "${BENCHMARK}" in \
    libero_plus) \
        apt-get update && apt-get install -y --no-install-recommends \
            libmagickwand-dev \
        && apt-get clean && rm -rf /var/lib/apt/lists/* ;; \
    esac

WORKDIR /lerobot
RUN chown -R user_lerobot:user_lerobot /lerobot
USER user_lerobot

# Per-user cache locations under the non-root home so downloaded weights and
# compiled kernels persist where user_lerobot can write them.
ENV HOME=/home/user_lerobot \
    HF_HOME=/home/user_lerobot/.cache/huggingface \
    HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
    TORCH_HOME=/home/user_lerobot/.cache/torch \
    TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton

RUN uv venv --python python${PYTHON_VERSION}

# Copy the dependency manifests before the rest of the repo. NOTE: src/ must
# also be present for `uv pip install .` below, so the install layer is still
# invalidated by source edits — only manifest-only changes benefit from the
# split; the final `COPY . .` stays cheap either way.
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml README.md MANIFEST.in ./
COPY --chown=user_lerobot:user_lerobot src/ src/

# When UNBOUND_DEPS=true, strip upper version bounds (", <x.y.z") from
# pyproject.toml so the image installs the newest release of each dependency
# (used to detect breakage from upstream updates).
ARG UNBOUND_DEPS=false
RUN if [ "$UNBOUND_DEPS" = "true" ]; then \
        sed -i 's/,[[:space:]]*<[0-9\.]*//g' pyproject.toml; \
        echo "Dependencies unbound:" && cat pyproject.toml; \
    fi

# Install lerobot core + the selected benchmark extra.
# Git-based deps (libero_plus, robomme) require network access at build time.
RUN uv pip install --no-cache ".[${BENCHMARK}]"

# Triton requires its ptxas binary to be executable (NVIDIA-specific).
RUN if [ -f /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas ]; then \
        chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas; \
    fi

# Copy full source (tests, examples, configs, etc.)
# NOTE(review): make sure a .dockerignore excludes .git, caches, and local env
# files so they are not baked into the image by this broad copy.
COPY --chown=user_lerobot:user_lerobot . .

CMD ["/bin/bash"]
+68
View File
@@ -0,0 +1,68 @@
# Benchmark evaluation services for LeRobot.
#
# Each service builds Dockerfile.benchmark with the matching BENCHMARK arg,
# giving an isolated environment per benchmark suite.
#
# Usage:
#   Build all:       docker compose -f docker/docker-compose.benchmark.yml build
#   Build one:       docker compose -f docker/docker-compose.benchmark.yml build libero
#   Run smoke tests: docker compose -f docker/docker-compose.benchmark.yml run --rm libero bash docker/smoke_test_benchmark.sh
#   Interactive:     docker compose -f docker/docker-compose.benchmark.yml run --rm libero

# Shared runtime settings. NOTE: YAML merge (`<<:`) is shallow — a service's
# own `build:` mapping would replace the anchor's wholesale — so every service
# declares its complete `build:` block itself and the anchor carries only
# runtime configuration (no dead `image:`/`build:` defaults here).
x-benchmark-base: &benchmark-base
  environment:
    - MUJOCO_GL=egl
    - DEVICE=cuda
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: 1
            capabilities: [gpu]
  volumes:
    # Mount the HuggingFace cache from the host so model weights are not
    # re-downloaded each run.
    - ${HF_HOME:-~/.cache/huggingface}:/home/user_lerobot/.cache/huggingface
  stdin_open: true
  tty: true

services:
  libero:
    <<: *benchmark-base
    image: lerobot-benchmark-libero
    build:
      # Relative build paths resolve against this compose file's directory
      # (docker/), so the repo root — which the Dockerfile COPYs setup.py and
      # src/ from — is `..`, and the dockerfile path is relative to that
      # context.
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: libero

  libero_plus:
    <<: *benchmark-base
    image: lerobot-benchmark-libero-plus
    build:
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: libero_plus

  robomme:
    <<: *benchmark-base
    image: lerobot-benchmark-robomme
    build:
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: robomme

  robocasa:
    <<: *benchmark-base
    image: lerobot-benchmark-robocasa
    build:
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: robocasa
+114
View File
@@ -0,0 +1,114 @@
#!/usr/bin/env bash
# Smoke-test a benchmark container: verifies imports and CLI entry-points.
#
# Run inside the container (the BENCHMARK env var selects the suite and
# defaults to "libero" when unset):
#   bash docker/smoke_test_benchmark.sh
#
# Or run all benchmarks via docker compose:
#   for svc in libero libero_plus robomme robocasa; do
#     docker compose -f docker/docker-compose.benchmark.yml run --rm "$svc" \
#       bash docker/smoke_test_benchmark.sh
#   done

# Abort on unhandled command failures, unset variables, and broken pipelines.
set -euo pipefail

# Suite under test; the benchmark image exports BENCHMARK, with a local
# fallback so the script also runs outside a container.
BENCHMARK="${BENCHMARK:-libero}"

# Running totals for the summary; any failure makes the script exit 1 at the end.
PASS=0
FAIL=0

# ok/fail: print a tagged result line and bump the matching counter.
ok() { echo "[PASS] $*"; PASS=$((PASS + 1)); }
fail() { echo "[FAIL] $*"; FAIL=$((FAIL + 1)); }
# Check that `import <module>` succeeds in the container's Python environment,
# recording the outcome via ok/fail. Import errors are silenced; only the
# tagged result line is printed.
python_import() {
    local mod="$1"
    if python -c "import ${mod}" 2>/dev/null; then
        ok "import ${mod}"
    else
        fail "import ${mod}"
    fi
}
# Check that a console-script entry point is installed and runnable by invoking
# its --help, discarding all output; records the outcome via ok/fail.
cli_help() {
    local entry_point="$1"
    if "${entry_point}" --help > /dev/null 2>&1; then
        ok "${entry_point} --help"
    else
        fail "${entry_point} --help"
    fi
}
# ── Main check sequence ───────────────────────────────────────────────────────
echo "=== Smoke test: benchmark=${BENCHMARK} ==="

# ── lerobot core ──────────────────────────────────────────────────────────────
# These must pass in every benchmark image regardless of the extra installed.
python_import "lerobot"
python_import "lerobot.envs"
python_import "lerobot.configs.eval"
cli_help "lerobot-eval"

# ── Benchmark-specific env import ─────────────────────────────────────────────
# Beyond bare imports, each arm instantiates the env config dataclass, which
# surfaces missing transitive dependencies that a plain import can miss.
case "${BENCHMARK}" in
    libero)
        python_import "lerobot.envs.libero"
        python -c "
from lerobot.envs.configs import LiberoEnv
cfg = LiberoEnv(task='libero_spatial/KITCHEN_SCENE1_open_the_bottom_drawer_of_the_cabinet')
print(' LiberoEnv config OK:', cfg.type)
" && ok "LiberoEnv config instantiation" || fail "LiberoEnv config instantiation"
        ;;
    libero_plus)
        python_import "lerobot.envs.libero"
        python -c "
from lerobot.envs.configs import LiberoPlusEnv
cfg = LiberoPlusEnv()
print(' LiberoPlusEnv config OK:', cfg.type)
" && ok "LiberoPlusEnv config instantiation" || fail "LiberoPlusEnv config instantiation"
        # Verify the LIBERO-plus package itself is importable
        python_import "libero"
        python_import "robosuite"
        ;;
    robomme)
        python_import "lerobot.envs.robomme"
        # NOTE(review): assumes the installed robomme release ships exactly 16
        # tasks — revisit this assertion on benchmark upgrades.
        python -c "
from lerobot.envs.robomme import ROBOMME_TASKS, RoboMMEGymEnv
assert len(ROBOMME_TASKS) == 16, f'Expected 16 tasks, got {len(ROBOMME_TASKS)}'
print(' ROBOMME_TASKS OK:', ROBOMME_TASKS[:3], '...')
" && ok "RoboMME task list" || fail "RoboMME task list"
        python -c "
from lerobot.envs.configs import RoboMMEEnv
cfg = RoboMMEEnv(task='PickXtimes')
print(' RoboMMEEnv config OK:', cfg.type)
" && ok "RoboMMEEnv config instantiation" || fail "RoboMMEEnv config instantiation"
        python_import "robomme"
        ;;
    robocasa)
        python_import "lerobot.envs.robocasa"
        # NOTE(review): ACTION_DIM=12 / STATE_DIM=16 mirror the values exported
        # by lerobot.envs.robocasa — keep in sync if that module changes.
        python -c "
from lerobot.envs.robocasa import ACTION_DIM, STATE_DIM
assert ACTION_DIM == 12, f'Expected ACTION_DIM=12, got {ACTION_DIM}'
assert STATE_DIM == 16, f'Expected STATE_DIM=16, got {STATE_DIM}'
print(' ACTION_DIM:', ACTION_DIM, ' STATE_DIM:', STATE_DIM)
" && ok "RoboCasa constants" || fail "RoboCasa constants"
        python -c "
from lerobot.envs.configs import RoboCasaEnv
cfg = RoboCasaEnv(task='PickPlaceCounterToCabinet')
print(' RoboCasaEnv config OK:', cfg.type)
" && ok "RoboCasaEnv config instantiation" || fail "RoboCasaEnv config instantiation"
        python_import "robocasa"
        python_import "robosuite"
        ;;
    *)
        # Unknown suite: bail out immediately rather than reporting a
        # misleading pass/fail summary.
        echo "Unknown BENCHMARK='${BENCHMARK}'. Valid values: libero, libero_plus, robomme, robocasa"
        exit 1
        ;;
esac

# ── Summary ───────────────────────────────────────────────────────────────────
echo ""
echo "=== Results: ${PASS} passed, ${FAIL} failed ==="
# Propagate failure to the caller (CI, compose run) if any check failed.
if [ "${FAIL}" -gt 0 ]; then
    exit 1
fi
+8
View File
@@ -194,6 +194,14 @@ libero-plus = ["lerobot[libero_plus]"]
# RoboMME ships from GitHub only, Linux-only.
# NOTE(review): pinned to the moving `main` branch — consider pinning a
# tag/commit so benchmark images stay reproducible.
robomme = [
    "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main ; sys_platform == 'linux'",
]
# RoboCasa extra referenced by the docs as `pip install ".[robocasa]"`.
robocasa = [
    # NOTE(review): robocasa itself is unpinned — pin a known-good version
    # once validated.
    "robocasa; sys_platform == 'linux'",
    # robocasa's setup does not declare all runtime deps; list them here explicitly.
    "robosuite>=1.4.0,<1.5.0; sys_platform == 'linux'",
    "easydict>=1.9; sys_platform == 'linux'",
    "scikit-image>=0.20.0; sys_platform == 'linux'",
    # Self-referential extra pulling the shared scipy pin, as metaworld does below.
    "lerobot[scipy-dep]",
]
metaworld = ["metaworld==3.0.0", "lerobot[scipy-dep]"]
# All