feat(docker): add per-benchmark evaluation containers

Add Dockerfile.benchmark (parameterized via ARG BENCHMARK), a docker-compose.benchmark.yml with services for libero, libero_plus, robomme, and robocasa, and a smoke_test_benchmark.sh that verifies imports and CLI entry-points in each container. Also add the missing `robocasa` optional dep group to pyproject.toml (the docs already referenced `pip install ".[robocasa]"` but the group was not defined). Build a specific benchmark image: docker build --build-arg BENCHMARK=robomme \ -f docker/Dockerfile.benchmark -t lerobot-benchmark-robomme . Build all via compose: docker compose -f docker/docker-compose.benchmark.yml build Smoke-test inside a container: docker compose -f docker/docker-compose.benchmark.yml run --rm robomme \ bash docker/smoke_test_benchmark.sh Co-Authored-By: Claude <noreply@anthropic.com>
2026-07-24 18:26:11 +00:00 · 2026-03-20 21:43:04 -07:00
parent 285c500aef
commit 39cf11d5dc
4 changed files with 300 additions and 0 deletions
@@ -0,0 +1,110 @@
 # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Benchmark evaluation container — one image per benchmark, built via BENCHMARK arg.
 #
 # Supported values for BENCHMARK:
 #   libero       — LIBERO suite (spatial / object / goal / 10 / 90)
 #   libero_plus  — LIBERO-plus extended benchmark (requires robosuite, bddl, robomimic)
 #   robomme      — RoboMME memory-augmented manipulation benchmark
 #   robocasa     — RoboCasa kitchen composite-task benchmark
 #
 # Build:
 #   docker build --build-arg BENCHMARK=libero -f docker/Dockerfile.benchmark \
 #                -t lerobot-benchmark-libero .
 #
 # Run (interactive):
 #   docker run --gpus all --rm -it lerobot-benchmark-libero
 # Run eval:
 #   docker run --gpus all --rm lerobot-benchmark-libero lerobot-eval --help
 # Base image selection. These two ARGs are declared before FROM, so they are
 # only visible to the FROM line itself.
 ARG CUDA_VERSION=12.4.1
 ARG OS_VERSION=22.04
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
 ARG PYTHON_VERSION=3.12
 # Run every shell-form RUN under bash with pipefail so that a failing download
 # in `curl … | sh` aborts the build instead of being masked by the pipe.
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 # Runtime defaults shared by all benchmarks. BENCHMARK is deliberately NOT set
 # here: see the note below the shared system layer.
 ENV DEBIAN_FRONTEND=noninteractive \
    MUJOCO_GL=egl \
    PATH=/lerobot/.venv/bin:$PATH \
    CUDA_VISIBLE_DEVICES=0 \
    DEVICE=cuda
 # ── Base system deps (shared across all benchmarks) ───────────────────────────
 # Installs the build toolchain, GL/EGL libraries, Python ${PYTHON_VERSION} from
 # the deadsnakes PPA, the uv installer, and creates the unprivileged user.
 # apt lists are removed in the same layer so they do not persist in the image.
 RUN apt-get update && apt-get install -y --no-install-recommends \
    software-properties-common build-essential git curl \
    libglib2.0-0 libgl1-mesa-glx libegl1-mesa libegl1-mesa-dev \
    libglew-dev libglfw3-dev libgl1-mesa-dri \
    ffmpeg libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
    cmake pkg-config ninja-build \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
       python${PYTHON_VERSION} \
       python${PYTHON_VERSION}-venv \
       python${PYTHON_VERSION}-dev \
    && curl -LsSf https://astral.sh/uv/install.sh | sh \
    && mv /root/.local/bin/uv /usr/local/bin/uv \
    && useradd --create-home --shell /bin/bash user_lerobot \
    && usermod -aG sudo user_lerobot \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
 # BENCHMARK is introduced only after the shared layer above. Declaring it (and
 # the ENV derived from it) earlier would give every benchmark a different cache
 # key for the expensive system layer, so the per-benchmark images could not
 # reuse it. The value is also exported so it is available inside the container.
 ARG BENCHMARK=libero
 ENV BENCHMARK=${BENCHMARK}
 # ── Benchmark-specific system deps ────────────────────────────────────────────
 # Only libero_plus needs extra OS packages: its `wand` Python dependency
 # requires the ImageMagick headers. For every other benchmark this step is a
 # no-op.
 RUN if [ "${BENCHMARK}" = "libero_plus" ]; then \
        apt-get update \
        && apt-get install -y --no-install-recommends libmagickwand-dev \
        && apt-get clean \
        && rm -rf /var/lib/apt/lists/*; \
    fi
 # Project directory; handed over to the unprivileged user before switching to
 # it so the virtualenv and later installs can be written without root.
 WORKDIR /lerobot
 RUN chown -R user_lerobot:user_lerobot /lerobot
 USER user_lerobot
 # Point HOME and the HuggingFace / Torch / Triton caches at the user's home.
 # HOME is set in its own instruction so the cache paths below can refer to it.
 ENV HOME=/home/user_lerobot
 ENV HF_HOME=${HOME}/.cache/huggingface \
    HF_LEROBOT_HOME=${HOME}/.cache/huggingface/lerobot \
    TORCH_HOME=${HOME}/.cache/torch \
    TRITON_CACHE_DIR=${HOME}/.cache/triton
 # Create the virtualenv in the working directory using the selected interpreter.
 RUN uv venv --python python${PYTHON_VERSION}
 # Copy the packaging metadata and the package sources (src/) needed to install
 # lerobot. The rest of the repository is copied in a later step, so changes
 # outside these paths do not invalidate the dependency-install layer.
 # Note: src/ is part of this step, so edits under src/ DO invalidate it.
 COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml README.md MANIFEST.in ./
 COPY --chown=user_lerobot:user_lerobot src/ src/
 # UNBOUND_DEPS=true strips every `, <X.Y.Z` upper-bound fragment from
 # pyproject.toml in place before installation and prints the resulting file to
 # the build log. Any other value leaves pyproject.toml untouched.
 ARG UNBOUND_DEPS=false
 RUN if [ "$UNBOUND_DEPS" = "true" ]; then \
    sed -i 's/,[[:space:]]*<[0-9\.]*//g' pyproject.toml; \
    echo "Dependencies unbound:" && cat pyproject.toml; \
    fi
 # Install lerobot together with the extra named after the selected benchmark.
 # Some extras (libero_plus, robomme) pull Git-based dependencies, so the build
 # needs network access at this point.
 RUN uv pip install --no-cache ".[${BENCHMARK}]"
 # Triton's bundled ptxas binary (NVIDIA-specific) must be executable; fix the
 # mode only when the file is actually present in the virtualenv.
 RUN PTXAS=/lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas; \
    if [ -f "${PTXAS}" ]; then \
        chmod +x "${PTXAS}"; \
    fi
 # Bring in the remainder of the build context (tests, examples, configs, etc.).
 COPY --chown=user_lerobot:user_lerobot . .
 # Default to an interactive shell; pass a command to `docker run` to override.
 CMD ["/bin/bash"]
@@ -0,0 +1,68 @@
 # Benchmark evaluation services for LeRobot.
 #
 # Each service builds Dockerfile.benchmark with the matching BENCHMARK arg,
 # giving an isolated environment per benchmark suite.
 #
 # Usage:
 #   Build all:        docker compose -f docker/docker-compose.benchmark.yml build
 #   Build one:        docker compose -f docker/docker-compose.benchmark.yml build libero
 #   Run smoke tests:  docker compose -f docker/docker-compose.benchmark.yml run --rm libero bash docker/smoke_test_benchmark.sh
 #   Interactive:      docker compose -f docker/docker-compose.benchmark.yml run --rm libero
 # Settings shared by every benchmark service (merged in via `<<: *benchmark-base`).
 # Services override `image` and `build`; everything else is inherited as-is.
 x-benchmark-base: &benchmark-base
  build:
    # Relative paths are resolved from this file's directory (docker/), so the
    # repository root — which the Dockerfile needs as its build context for
    # pyproject.toml, src/, etc. — is one level up. `dockerfile` is then
    # resolved relative to that context.
    context: ..
    dockerfile: docker/Dockerfile.benchmark
  image: lerobot-benchmark-${BENCHMARK:-libero}
  environment:
    - MUJOCO_GL=egl
    - DEVICE=cuda
  deploy:
    resources:
      reservations:
        devices:
          # Reserve one NVIDIA GPU for the container.
          - driver: nvidia
            count: 1
            capabilities: [gpu]
  volumes:
    # Mount HuggingFace cache from host so model weights are not re-downloaded each run.
    - ${HF_HOME:-~/.cache/huggingface}:/home/user_lerobot/.cache/huggingface
  # Keep stdin open and allocate a TTY so `run --rm <service>` gives an interactive shell.
  stdin_open: true
  tty: true
 # One service per benchmark. Each overrides `image` and `build` from the shared
 # base so that the matching BENCHMARK build arg is passed to the Dockerfile.
 #
 # Build context: relative paths are resolved from this file's directory
 # (docker/), so `..` is the repository root that the Dockerfile copies
 # pyproject.toml, src/, etc. from; `dockerfile` is relative to that context.
 services:
  libero:
    <<: *benchmark-base
    image: lerobot-benchmark-libero
    build:
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: libero
  libero_plus:
    <<: *benchmark-base
    image: lerobot-benchmark-libero-plus
    build:
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: libero_plus
  robomme:
    <<: *benchmark-base
    image: lerobot-benchmark-robomme
    build:
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: robomme
  robocasa:
    <<: *benchmark-base
    image: lerobot-benchmark-robocasa
    build:
      context: ..
      dockerfile: docker/Dockerfile.benchmark
      args:
        BENCHMARK: robocasa
@@ -0,0 +1,114 @@
 #!/usr/bin/env bash
 # Smoke-test a benchmark container: verifies imports and CLI entry-points.
 #
 # Run inside the container (BENCHMARK env var must be set):
 #   bash docker/smoke_test_benchmark.sh
 #
 # Or run all benchmarks via docker compose:
 #   for svc in libero libero_plus robomme robocasa; do
 #     docker compose -f docker/docker-compose.benchmark.yml run --rm "$svc" \
 #       bash docker/smoke_test_benchmark.sh
 #   done
 # Strict mode: abort on errors, on unset variables, and on failures inside pipelines.
 set -euo pipefail
 # Benchmark under test; falls back to "libero" when unset or empty.
 : "${BENCHMARK:=libero}"
 # Running tallies, updated by ok() / fail() and reported in the final summary.
 PASS=0
 FAIL=0
 # Record a passing check and print it. All arguments form the check's label.
 ok() {
    printf '[PASS] %s\n' "$*"
    PASS=$(( PASS + 1 ))
 }
 # Record a failing check and print it. All arguments form the check's label.
 fail() {
    printf '[FAIL] %s\n' "$*"
    FAIL=$(( FAIL + 1 ))
 }
 # Check that a Python module can be imported and record the result.
 #
 # Arguments:
 #   $1 — dotted module name passed to `import`.
 #
 # The import's stderr is captured rather than discarded, and is printed
 # (indented, on stderr) when the import fails, so the cause — e.g. a missing
 # module versus a missing shared library — is visible in the test log.
 # The import's stdout passes through unchanged.
 python_import() {
    local module="$1"
    local err status=0
    # fd 3 is a copy of the caller's stdout: inside the substitution stderr is
    # captured into $err while stdout is routed back out through fd 3.
    # `|| status=$?` keeps `set -e` from aborting the script on a failed import.
    { err="$(python -c "import ${module}" 2>&1 1>&3 3>&-)" || status=$?; } 3>&1
    if [ "${status}" -eq 0 ]; then
        ok "import ${module}"
    else
        fail "import ${module}"
        if [ -n "${err}" ]; then
            printf '%s\n' "${err}" | sed 's/^/       /' >&2
        fi
    fi
 }
 # Check that a CLI entry-point runs `--help` successfully and record the result.
 #
 # Arguments:
 #   $1 — command name to invoke.
 #
 # Output of the command is suppressed on success. On failure the captured
 # stdout/stderr is printed (indented, on stderr) so the reason is not lost.
 cli_help() {
    local cmd="$1"
    local output status=0
    # `|| status=$?` keeps `set -e` from aborting the script on a failing command.
    output="$("${cmd}" --help 2>&1)" || status=$?
    if [ "${status}" -eq 0 ]; then
        ok "${cmd} --help"
    else
        fail "${cmd} --help"
        if [ -n "${output}" ]; then
            printf '%s\n' "${output}" | sed 's/^/       /' >&2
        fi
    fi
 }
 # Run an inline Python snippet and record the outcome under the given label.
 #   $1 — label reported via ok() / fail()
 #   $2 — Python source passed to `python -c`
 check_snippet() {
    local label="$1"
    local code="$2"
    if python -c "${code}"; then
        ok "${label}"
    else
        fail "${label}"
    fi
 }
 echo "=== Smoke test: benchmark=${BENCHMARK} ==="
 # ── lerobot core ──────────────────────────────────────────────────────────────
 # These checks run for every benchmark image.
 for core_module in "lerobot" "lerobot.envs" "lerobot.configs.eval"; do
    python_import "${core_module}"
 done
 cli_help "lerobot-eval"
 # ── Benchmark-specific env import ─────────────────────────────────────────────
 # Each branch imports the lerobot env wrapper, exercises its config/constants,
 # and then imports the third-party benchmark packages themselves.
 case "${BENCHMARK}" in
    libero)
        python_import "lerobot.envs.libero"
        check_snippet "LiberoEnv config instantiation" "
 from lerobot.envs.configs import LiberoEnv
 cfg = LiberoEnv(task='libero_spatial/KITCHEN_SCENE1_open_the_bottom_drawer_of_the_cabinet')
 print('  LiberoEnv config OK:', cfg.type)
 "
        ;;
    libero_plus)
        python_import "lerobot.envs.libero"
        check_snippet "LiberoPlusEnv config instantiation" "
 from lerobot.envs.configs import LiberoPlusEnv
 cfg = LiberoPlusEnv()
 print('  LiberoPlusEnv config OK:', cfg.type)
 "
        # The LIBERO-plus package and its simulator must be importable too.
        python_import "libero"
        python_import "robosuite"
        ;;
    robomme)
        python_import "lerobot.envs.robomme"
        check_snippet "RoboMME task list" "
 from lerobot.envs.robomme import ROBOMME_TASKS, RoboMMEGymEnv
 assert len(ROBOMME_TASKS) == 16, f'Expected 16 tasks, got {len(ROBOMME_TASKS)}'
 print('  ROBOMME_TASKS OK:', ROBOMME_TASKS[:3], '...')
 "
        check_snippet "RoboMMEEnv config instantiation" "
 from lerobot.envs.configs import RoboMMEEnv
 cfg = RoboMMEEnv(task='PickXtimes')
 print('  RoboMMEEnv config OK:', cfg.type)
 "
        python_import "robomme"
        ;;
    robocasa)
        python_import "lerobot.envs.robocasa"
        check_snippet "RoboCasa constants" "
 from lerobot.envs.robocasa import ACTION_DIM, STATE_DIM
 assert ACTION_DIM == 12, f'Expected ACTION_DIM=12, got {ACTION_DIM}'
 assert STATE_DIM == 16, f'Expected STATE_DIM=16, got {STATE_DIM}'
 print('  ACTION_DIM:', ACTION_DIM, '  STATE_DIM:', STATE_DIM)
 "
        check_snippet "RoboCasaEnv config instantiation" "
 from lerobot.envs.configs import RoboCasaEnv
 cfg = RoboCasaEnv(task='PickPlaceCounterToCabinet')
 print('  RoboCasaEnv config OK:', cfg.type)
 "
        python_import "robocasa"
        python_import "robosuite"
        ;;
    *)
        # Unsupported value: report it and stop without printing a summary.
        echo "Unknown BENCHMARK='${BENCHMARK}'. Valid values: libero, libero_plus, robomme, robocasa"
        exit 1
        ;;
 esac
 # ── Summary ───────────────────────────────────────────────────────────────────
 # Print the tallies after a blank separator line; exit non-zero when any check failed.
 printf '\n'
 printf '=== Results: %s passed, %s failed ===\n' "${PASS}" "${FAIL}"
 if (( FAIL > 0 )); then
    exit 1
 fi
@@ -194,6 +194,14 @@ libero-plus = ["lerobot[libero_plus]"]
 # RoboMME benchmark extra: installed directly from the upstream Git repository,
 # on Linux only.
 # NOTE(review): the requirement tracks the `main` branch rather than a tag or
 # commit, so two builds may resolve different code — consider pinning.
 robomme = [
    "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main ; sys_platform == 'linux'",
 ]
 # RoboCasa benchmark extra. All third-party requirements are restricted to
 # Linux via environment markers.
 # NOTE(review): `robocasa` is unpinned and resolved from the package index —
 # confirm it is published there and that it is compatible with the robosuite
 # range below.
 robocasa = [
    "robocasa; sys_platform == 'linux'",
    # robocasa's setup does not declare all runtime deps; list them here explicitly.
    "robosuite>=1.4.0,<1.5.0; sys_platform == 'linux'",
    "easydict>=1.9; sys_platform == 'linux'",
    "scikit-image>=0.20.0; sys_platform == 'linux'",
    # Shared scipy requirement, reused through lerobot's own `scipy-dep` extra.
    "lerobot[scipy-dep]",
 ]
 # MetaWorld benchmark extra: pinned to the exact 3.0.0 release, plus lerobot's
 # shared `scipy-dep` extra.
 metaworld = ["metaworld==3.0.0", "lerobot[scipy-dep]"]
 # All