diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
index 70258c7f6..00806f990 100644
--- a/.github/workflows/benchmark_tests.yml
+++ b/.github/workflows/benchmark_tests.yml
@@ -425,3 +425,103 @@ jobs:
name: robotwin-metrics
path: /tmp/robotwin-artifacts/metrics.json
if-no-files-found: warn
+
+ # ── ROBOCASA365 ──────────────────────────────────────────────────────────
+ # Isolated image: robocasa + robosuite installed manually as editable
+ # clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins
+ # `lerobot==0.3.3`, which would shadow this repo's lerobot).
+ robocasa-integration-test:
+ name: RoboCasa365 — build image + 1-episode eval
+ runs-on:
+ group: aws-g6-4xlarge-plus
+ env:
+ HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+ lfs: true
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+ with:
+ cache-binary: false
+
+ - name: Login to Docker Hub
+ if: ${{ env.DOCKERHUB_USERNAME != '' }}
+ uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
+ with:
+ username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+ password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
+ env:
+ DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
+
+ - name: Build RoboCasa365 benchmark image
+ uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+ with:
+ context: .
+ file: docker/Dockerfile.benchmark.robocasa
+ push: false
+ load: true
+ tags: lerobot-benchmark-robocasa:ci
+
+ - name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each)
+ if: env.HF_USER_TOKEN != ''
+ run: |
+ docker run --name robocasa-eval --gpus all \
+ --shm-size=4g \
+ -e HF_HOME=/tmp/hf \
+ -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+ -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+ -e MUJOCO_GL=egl \
+ lerobot-benchmark-robocasa:ci \
+ bash -c "
+ hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+ lerobot-eval \
+ --policy.path=lerobot/smolvla_robocasa \
+ --env.type=robocasa \
+ --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+ --eval.batch_size=1 \
+ --eval.n_episodes=1 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ '--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \
+ --output_dir=/tmp/eval-artifacts
+ python scripts/ci/extract_task_descriptions.py \
+ --env robocasa \
+ --task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
+ --output /tmp/eval-artifacts/task_descriptions.json
+ "
+
+ - name: Copy RoboCasa365 artifacts from container
+ if: always()
+ run: |
+ mkdir -p /tmp/robocasa-artifacts
+ docker cp robocasa-eval:/tmp/eval-artifacts/. /tmp/robocasa-artifacts/ 2>/dev/null || true
+ docker rm -f robocasa-eval || true
+
+ - name: Parse RoboCasa365 eval metrics
+ if: always()
+ run: |
+ python3 scripts/ci/parse_eval_metrics.py \
+ --artifacts-dir /tmp/robocasa-artifacts \
+ --env robocasa \
+ --task atomic_smoke_10 \
+ --policy lerobot/smolvla_robocasa
+
+ - name: Upload RoboCasa365 rollout video
+ if: always()
+ uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+ with:
+ name: robocasa-rollout-video
+ path: /tmp/robocasa-artifacts/videos/
+ if-no-files-found: warn
+
+ - name: Upload RoboCasa365 eval metrics
+ if: always()
+ uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+ with:
+ name: robocasa-metrics
+ path: /tmp/robocasa-artifacts/metrics.json
+ if-no-files-found: warn
diff --git a/docker/Dockerfile.benchmark.robocasa b/docker/Dockerfile.benchmark.robocasa
new file mode 100644
index 000000000..9de1612cb
--- /dev/null
+++ b/docker/Dockerfile.benchmark.robocasa
@@ -0,0 +1,71 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image for RoboCasa365 integration tests.
+# Extends the nightly GPU image (which already has all extras installed)
+# with the PR's source code and RoboCasa-specific asset setup.
+#
+# Build: docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa .
+# Run: docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ...
+
+FROM huggingface/lerobot-gpu:latest
+
+# Install robocasa + robosuite as editable clones. pip-installing from git
+# omits data files like robocasa/models/assets/box_links/box_links_assets.json
+# (not declared in package_data), which download_kitchen_assets needs at import.
+#
+# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3`
+# in install_requires, which would shadow the editable lerobot baked into
+# this image. We install robocasa's actual runtime deps explicitly instead.
+# Pinned SHAs for reproducible benchmark runs. Bump when you need an
+# upstream fix; don't rely on `main`/`master` drift.
+ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681
+ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab
+RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \
+ git -C ~/robocasa checkout ${ROBOCASA_SHA} && \
+ git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \
+ git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \
+ uv pip install --no-cache -e ~/robocasa --no-deps && \
+ uv pip install --no-cache -e ~/robosuite && \
+ uv pip install --no-cache \
+ "numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \
+ "pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \
+ "pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \
+ "imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \
+ "tianshou==0.4.10" "gymnasium==1.2.3"
+
+# Set up robocasa macros and download kitchen assets. We need:
+# - tex : base environment textures
+# - tex_generative : AI-generated textures; kitchen fixture XMLs embed
+# refs to generative_textures/wall/tex*.png
+# unconditionally, so MjModel.from_xml_string fails
+# at reset time without them (even if the env is
+# constructed with generative_textures=None).
+# - fixtures_lw : lightwheel kitchen fixtures (fridge, counters...)
+# - objs_lw : lightwheel object meshes (stools, misc props)
+# We skip the objaverse/aigen object packs (~30GB combined) by pairing
+# this with --env.obj_registries=["lightwheel"] on the lerobot side.
+# The download script prompts interactively, so pipe 'y' to auto-accept.
+RUN python -m robocasa.scripts.setup_macros && \
+ yes y | python -m robocasa.scripts.download_kitchen_assets \
+ --type tex tex_generative fixtures_lw objs_lw
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . .
+
+# Re-install lerobot editably so the new source (with RoboCasaEnv registration)
+# replaces the stale package baked into the nightly image.
+RUN uv pip install --no-cache --no-deps -e .
+
+CMD ["/bin/bash"]
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 13405decf..bb0dad1bf 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -81,6 +81,8 @@
title: Meta-World
- local: robotwin
title: RoboTwin 2.0
+ - local: robocasa
+ title: RoboCasa365
- local: envhub_isaaclab_arena
title: NVIDIA IsaacLab Arena Environments
title: "Benchmarks"
diff --git a/docs/source/robocasa.mdx b/docs/source/robocasa.mdx
new file mode 100644
index 000000000..f6a784e72
--- /dev/null
+++ b/docs/source/robocasa.mdx
@@ -0,0 +1,188 @@
+# RoboCasa365
+
+[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base).
+
+- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523)
+- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa)
+- Project website: [robocasa.ai](https://robocasa.ai)
+- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa)
+- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge)
+
+
+
+## Available tasks
+
+RoboCasa365 organizes its 365 tasks into two families and three upstream benchmark groups that LeRobot exposes as first-class `--env.task` shortcuts:
+
+| Family | Tasks | Description |
+| --------- | ----- | ------------------------------------------------------------------------------- |
+| Atomic | ~65 | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control |
+| Composite | ~300 | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc. |
+
+**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`.
+
+**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more.
+
+`--env.task` accepts three forms:
+
+- a single task name (`CloseFridge`)
+- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`)
+- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`).
+
+## Installation
+
+RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its conflicting `lerobot` pin):
+
+```bash
+# After following the standard LeRobot installation instructions.
+
+git clone https://github.com/robocasa/robocasa.git ~/robocasa
+git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite
+pip install -e ~/robocasa --no-deps
+pip install -e ~/robosuite
+
+# Robocasa's runtime deps (the ones its setup.py would have pulled, minus
+# the bad lerobot pin).
+pip install numpy numba scipy mujoco pygame Pillow opencv-python \
+ pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \
+ tianshou gymnasium
+
+python -m robocasa.scripts.setup_macros
+# Lightweight assets (lightwheel object meshes + textures). Enough for
+# the default env out of the box.
+python -m robocasa.scripts.download_kitchen_assets \
+ --type tex tex_generative fixtures_lw objs_lw
+# Optional: full objaverse/aigen registries (~30GB) for richer object
+# variety. Enable at eval time via --env.obj_registries (see below).
+# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse
+```
+
+
+RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation:
+
+```bash
+export MUJOCO_GL=egl # for headless servers (HPC, cloud)
+```
+
+
+
+### Object registries
+
+By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime:
+
+```bash
+--env.obj_registries='[objaverse,lightwheel]'
+```
+
+## Evaluation
+
+All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on.
+
+### Single-task evaluation (recommended for quick iteration)
+
+```bash
+lerobot-eval \
+ --policy.path=lerobot/smolvla_robocasa \
+ --env.type=robocasa \
+ --env.task=CloseFridge \
+ --eval.batch_size=1 \
+ --eval.n_episodes=20 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Multi-task evaluation
+
+Pass a comma-separated list of tasks:
+
+```bash
+lerobot-eval \
+ --policy.path=lerobot/smolvla_robocasa \
+ --env.type=robocasa \
+ --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \
+ --eval.batch_size=1 \
+ --eval.n_episodes=20 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Benchmark-group evaluation
+
+Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`):
+
+```bash
+lerobot-eval \
+ --policy.path=lerobot/smolvla_robocasa \
+ --env.type=robocasa \
+ --env.task=atomic_seen \
+ --eval.batch_size=1 \
+ --eval.n_episodes=20 \
+ --eval.use_async_envs=false \
+ --policy.device=cuda \
+ '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
+```
+
+### Recommended evaluation episodes
+
+**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results.
+
+## Policy inputs and outputs
+
+**Observations** (raw RoboCasa camera names are preserved verbatim):
+
+- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos)
+- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8
+- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8
+- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8
+
+**Actions:**
+
+- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D).
+
+## Training
+
+### Single-task example
+
+A ready-to-use single-task dataset is on the Hub:
+[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge).
+
+Fine-tune a SmolVLA base on `CloseFridge`:
+
+```bash
+lerobot-train \
+ --policy.type=smolvla \
+ --policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \
+ --policy.load_vlm_weights=true \
+ --policy.push_to_hub=true \
+ --dataset.repo_id=pepijn223/robocasa_CloseFridge \
+ --env.type=robocasa \
+ --env.task=CloseFridge \
+ --output_dir=./outputs/smolvla_robocasa_CloseFridge \
+ --steps=100000 \
+ --batch_size=4 \
+ --eval_freq=5000 \
+ --eval.batch_size=1 \
+ --eval.n_episodes=5 \
+ --save_freq=10000
+```
+
+Evaluate the resulting checkpoint:
+
+```bash
+lerobot-eval \
+ --policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \
+ --env.type=robocasa \
+ --env.task=CloseFridge \
+ --eval.batch_size=1 \
+ --eval.n_episodes=20
+```
+
+## Reproducing published results
+
+The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack.
diff --git a/pyproject.toml b/pyproject.toml
index 6e4993c85..10789b0f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -212,6 +212,11 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
+# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins
+# `lerobot==0.3.3` in install_requires, which cyclically shadows our own
+# workspace `lerobot` and makes the graph unsolvable under any resolver
+# (uv, pip). Install it manually alongside robosuite — see
+# docs/source/robocasa.mdx for the recipe.
# All
all = [
diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py
index 9035bacb0..c9216e02d 100644
--- a/scripts/ci/extract_task_descriptions.py
+++ b/scripts/ci/extract_task_descriptions.py
@@ -75,6 +75,23 @@ def _robotwin_descriptions(task_names: str) -> dict[str, str]:
return out
+def _robocasa_descriptions(task_spec: str) -> dict[str, str]:
+ """For each task in the comma-separated list, emit a cleaned-name label.
+
+ RoboCasa episodes carry their language instruction in the env's
+ `ep_meta['lang']`, populated per reset. Pulling it requires spinning
+    up the full kitchen env per task (~seconds each); instead we key on
+    the task name (with a `_0` suffix) and let the eval's episode info
+    carry the actual instruction.
+ """
+ out: dict[str, str] = {}
+ for task in (t.strip() for t in task_spec.split(",") if t.strip()):
+ # Split CamelCase into words: "CloseFridge" → "close fridge".
+ label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
+ out[f"{task}_0"] = label or task
+ return out
+
+
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
@@ -90,6 +107,8 @@ def main() -> int:
descriptions = _metaworld_descriptions(args.task)
elif args.env == "robotwin":
descriptions = _robotwin_descriptions(args.task)
+ elif args.env == "robocasa":
+ descriptions = _robocasa_descriptions(args.task)
else:
print(
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 108a88f1d..4f93a26a7 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -496,6 +496,81 @@ class MetaworldEnv(EnvConfig):
)
+@EnvConfig.register_subclass("robocasa")
+@dataclass
+class RoboCasaEnv(EnvConfig):
+ task: str = "CloseFridge"
+ fps: int = 20
+ episode_length: int = 1000
+ obs_type: str = "pixels_agent_pos"
+ render_mode: str = "rgb_array"
+ camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
+ observation_height: int = 256
+ observation_width: int = 256
+ visualization_height: int = 512
+ visualization_width: int = 512
+ split: str | None = None
+ # Object-mesh registries to sample from. Upstream default is
+ # ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image
+ # only ships the lightwheel pack. Override to include objaverse once
+ # you've run `python -m robocasa.scripts.download_kitchen_assets
+    # --type objs_objaverse` locally.
+ obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
+ features: dict[str, PolicyFeature] = field(
+ default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
+ )
+ features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE})
+
+ def __post_init__(self):
+ if self.obs_type not in ("pixels", "pixels_agent_pos"):
+ raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+ # Preserve raw RoboCasa camera names end-to-end (e.g.
+ # `observation.images.robot0_agentview_left`). This matches the
+ # naming convention used by the RoboCasa datasets on the Hub, so
+ # trained policies don't need a `--rename_map` at eval time.
+ cams = [c.strip() for c in self.camera_name.split(",") if c.strip()]
+ for cam in cams:
+ self.features[f"pixels/{cam}"] = PolicyFeature(
+ type=FeatureType.VISUAL,
+ shape=(self.observation_height, self.observation_width, 3),
+ )
+ self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}"
+
+ if self.obs_type == "pixels_agent_pos":
+ self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
+
+ @property
+ def gym_kwargs(self) -> dict:
+ kwargs: dict[str, Any] = {
+ "obs_type": self.obs_type,
+ "render_mode": self.render_mode,
+ "observation_height": self.observation_height,
+ "observation_width": self.observation_width,
+ "visualization_height": self.visualization_height,
+ "visualization_width": self.visualization_width,
+ }
+ if self.split is not None:
+ kwargs["split"] = self.split
+ return kwargs
+
+ def create_envs(self, n_envs: int, use_async_envs: bool = False):
+ from .robocasa import create_robocasa_envs
+
+ if self.task is None:
+ raise ValueError("RoboCasaEnv requires a task to be specified")
+ env_cls = _make_vec_env_cls(use_async_envs, n_envs)
+ return create_robocasa_envs(
+ task=self.task,
+ n_envs=n_envs,
+ camera_name=self.camera_name,
+ gym_kwargs=self.gym_kwargs,
+ env_cls=env_cls,
+ episode_length=self.episode_length,
+ obj_registries=tuple(self.obj_registries),
+ )
+
+
@EnvConfig.register_subclass("isaaclab_arena")
@dataclass
class IsaaclabArenaEnv(HubEnvConfig):
diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py
index f357d4eef..c9aba71bb 100644
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -31,20 +31,7 @@ from libero.libero.envs import OffScreenRenderEnv
from lerobot.types import RobotObservation
-from .utils import _LazyAsyncVectorEnv
-
-
-def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
- """Normalize camera_name into a non-empty list of strings."""
- if isinstance(camera_name, str):
- cams = [c.strip() for c in camera_name.split(",") if c.strip()]
- elif isinstance(camera_name, (list | tuple)):
- cams = [str(c).strip() for c in camera_name if str(c).strip()]
- else:
- raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
- if not cams:
- raise ValueError("camera_name resolved to an empty list.")
- return cams
+from .utils import _LazyAsyncVectorEnv, parse_camera_names
def _get_suite(name: str) -> benchmark.Benchmark:
@@ -128,7 +115,7 @@ class LiberoEnv(gym.Env):
self.visualization_width = visualization_width
self.visualization_height = visualization_height
self.init_states = init_states
- self.camera_name = _parse_camera_names(
+ self.camera_name = parse_camera_names(
camera_name
) # agentview_image (main) or robot0_eye_in_hand_image (wrist)
@@ -437,7 +424,7 @@ def create_libero_envs(
gym_kwargs = dict(gym_kwargs or {})
task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks
- camera_names = _parse_camera_names(camera_name)
+ camera_names = parse_camera_names(camera_name)
suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
if not suite_names:
raise ValueError("`task` must contain at least one LIBERO suite name.")
diff --git a/src/lerobot/envs/robocasa.py b/src/lerobot/envs/robocasa.py
new file mode 100644
index 000000000..a84a7c766
--- /dev/null
+++ b/src/lerobot/envs/robocasa.py
@@ -0,0 +1,425 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+from lerobot.types import RobotObservation
+
+from .utils import _LazyAsyncVectorEnv, parse_camera_names
+
+logger = logging.getLogger(__name__)
+
+# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
+# These correspond to the PandaOmron robot in RoboCasa365.
+OBS_STATE_DIM = 16 # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
+ACTION_DIM = 12 # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+ACTION_LOW = -1.0
+ACTION_HIGH = 1.0
+
+# Default PandaOmron cameras. We surface these raw names directly as
+# `observation.images.` so the LeRobot dataset/policy keys match
+# RoboCasa's native convention (no implicit renaming).
+DEFAULT_CAMERAS = [
+ "robot0_agentview_left",
+ "robot0_eye_in_hand",
+ "robot0_agentview_right",
+]
+
+# Object-mesh registries to sample from. RoboCasa's upstream default is
+# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
+# most users — including our CI image — only download the lightwheel pack
+# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object
+# category has zero candidates in every registry, robocasa crashes with
+# `ValueError: Probabilities contain NaN` (0/0 divide in the probability
+# normalization). Restricting to registries that are actually on disk
+# avoids the NaN and matches what the asset download provides.
+DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",)
+
+# Task-group shortcuts accepted as `--env.task`. When the user passes one of
+# these names, we expand it to the upstream RoboCasa task list and auto-set
+# the dataset split. Individual task names (optionally comma-separated) still
+# take precedence; this only triggers on an exact group-name match.
+_TASK_GROUP_SPLITS = {
+ "atomic_seen": "target",
+ "composite_seen": "target",
+ "composite_unseen": "target",
+ "pretrain50": "pretrain",
+ "pretrain100": "pretrain",
+ "pretrain200": "pretrain",
+ "pretrain300": "pretrain",
+}
+
+
+def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
+ """Resolve a `--env.task` value to (task_names, split_override).
+
+ If `task` is a known task-group name (e.g. `atomic_seen`, `pretrain100`),
+ expand it via `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS`
+ and return the matching split. Otherwise treat `task` as a single task or
+ comma-separated list and leave the split untouched (None).
+ """
+ key = task.strip()
+ if key in _TASK_GROUP_SPLITS:
+ from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS
+
+ combined = {**TARGET_TASKS, **PRETRAINING_TASKS}
+ if key not in combined:
+ raise ValueError(
+ f"Task group '{key}' is not available in this version of robocasa. "
+ f"Known groups: {sorted(combined.keys())}."
+ )
+ return list(combined[key]), _TASK_GROUP_SPLITS[key]
+
+ names = [t.strip() for t in task.split(",") if t.strip()]
+ if not names:
+ raise ValueError("`task` must contain at least one RoboCasa task name.")
+ return names, None
+
+
+def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
+ """Split a flat (12,) action vector into a RoboCasa action dict.
+
+ Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+ """
+ return {
+ "action.base_motion": flat_action[0:4],
+ "action.control_mode": flat_action[4:5],
+ "action.end_effector_position": flat_action[5:8],
+ "action.end_effector_rotation": flat_action[8:11],
+ "action.gripper_close": flat_action[11:12],
+ }
+
+
+class RoboCasaEnv(gym.Env):
+ """LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
+
+ Wraps RoboCasaGymEnv from the robocasa package and converts its
+ dict-based observations and actions into the flat arrays LeRobot expects.
+ Raw RoboCasa camera names are preserved verbatim under `pixels/`.
+ """
+
+ metadata = {"render_modes": ["rgb_array"], "render_fps": 20}
+
+ def __init__(
+ self,
+ task: str,
+ camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
+ obs_type: str = "pixels_agent_pos",
+ render_mode: str = "rgb_array",
+ observation_width: int = 256,
+ observation_height: int = 256,
+ visualization_width: int = 512,
+ visualization_height: int = 512,
+ split: str | None = None,
+ episode_length: int | None = None,
+ obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
+ episode_index: int = 0,
+ ):
+ super().__init__()
+ self.task = task
+ self.obs_type = obs_type
+ self.render_mode = render_mode
+ self.observation_width = observation_width
+ self.observation_height = observation_height
+ self.visualization_width = visualization_width
+ self.visualization_height = visualization_height
+ self.split = split
+ self.obj_registries = tuple(obj_registries)
+ # Per-worker index (0..n_envs-1) used to spread the user-provided
+ # seed across factories so each sub-env explores a distinct layout
+ # even when the same seed is passed to `reset()`.
+ self.episode_index = int(episode_index)
+
+ self.camera_name = parse_camera_names(camera_name)
+
+ self._max_episode_steps = episode_length if episode_length is not None else 1000
+
+ # Deferred — created on first reset() inside the worker subprocess
+ # to avoid inheriting stale GPU/EGL contexts across fork().
+ self._env: Any = None
+ self.task_description = ""
+
+ images = {
+ cam: spaces.Box(
+ low=0,
+ high=255,
+ shape=(self.observation_height, self.observation_width, 3),
+ dtype=np.uint8,
+ )
+ for cam in self.camera_name
+ }
+
+ if self.obs_type == "pixels":
+ self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
+ elif self.obs_type == "pixels_agent_pos":
+ self.observation_space = spaces.Dict(
+ {
+ "pixels": spaces.Dict(images),
+ "agent_pos": spaces.Box(
+ low=-np.inf,
+ high=np.inf,
+ shape=(OBS_STATE_DIM,),
+ dtype=np.float32,
+ ),
+ }
+ )
+ else:
+ raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
+
+ self.action_space = spaces.Box(
+ low=ACTION_LOW,
+ high=ACTION_HIGH,
+ shape=(ACTION_DIM,),
+ dtype=np.float32,
+ )
+
+ def _ensure_env(self) -> None:
+ """Create the underlying RoboCasaGymEnv on first use.
+
+ Called inside the worker subprocess after fork(), so each worker gets
+ its own clean rendering context rather than inheriting a stale one from
+ the parent process (which causes crashes with AsyncVectorEnv).
+ """
+ if self._env is not None:
+ return
+ from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
+
+ # RoboCasaGymEnv defaults split="test", which create_env rejects
+ # (only None/"all"/"pretrain"/"target" are valid). Always pass a
+ # valid value so we don't hit that default. Extra kwargs are
+ # forwarded to the underlying kitchen env via create_env/robosuite.make.
+ self._env = RoboCasaGymEnv(
+ env_name=self.task,
+ camera_widths=self.observation_width,
+ camera_heights=self.observation_height,
+ split=self.split if self.split is not None else "all",
+ obj_registries=self.obj_registries,
+ )
+
+ ep_meta = self._env.env.get_ep_meta()
+ self.task_description = ep_meta.get("lang", self.task)
+
+ def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
+ """Convert RoboCasaGymEnv observation dict to LeRobot format."""
+ # RoboCasaGymEnv emits camera frames under "video.".
+ images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs}
+
+ if self.obs_type == "pixels":
+ return {"pixels": images}
+
+ # `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
+ agent_pos = np.concatenate(
+ [
+ raw_obs.get("state.base_position", np.zeros(3)),
+ raw_obs.get("state.base_rotation", np.zeros(4)),
+ raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
+ raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
+ raw_obs.get("state.gripper_qpos", np.zeros(2)),
+ ],
+ axis=-1,
+ ).astype(np.float32)
+
+ return {"pixels": images, "agent_pos": agent_pos}
+
+ def render(self) -> np.ndarray:
+ self._ensure_env()
+ assert self._env is not None
+ return self._env.render()
+
+ def reset(self, seed=None, **kwargs):
+ self._ensure_env()
+ assert self._env is not None
+ super().reset(seed=seed)
+ # Spread the seed across workers so n_envs factories don't all
+ # roll the same scene. With an explicit user seed we shift it by
+ # episode_index; with no seed we fall back to episode_index so
+ # each worker is still distinct rather than inheriting the same
+ # global RNG state.
+ worker_seed = seed + self.episode_index if seed is not None else self.episode_index
+ raw_obs, info = self._env.reset(seed=worker_seed)
+
+ ep_meta = self._env.env.get_ep_meta()
+ self.task_description = ep_meta.get("lang", self.task)
+
+ observation = self._format_raw_obs(raw_obs)
+ info = {"is_success": False}
+ return observation, info
+
    def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
        """Run one control step in the wrapped RoboCasa env.

        Args:
            action: 1-D action vector; converted to the wrapper's dict
                format by ``convert_action`` before being forwarded.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` in the
            Gymnasium step convention. ``terminated`` is true when the env
            reports ``done`` or ``info["success"]``.

        Raises:
            ValueError: if ``action`` is not 1-D.
        """
        self._ensure_env()
        assert self._env is not None
        if action.ndim != 1:
            raise ValueError(
                f"Expected action to be 1-D (shape (action_dim,)), "
                f"but got shape {action.shape} with ndim={action.ndim}"
            )

        action_dict = convert_action(action)
        raw_obs, reward, done, truncated, info = self._env.step(action_dict)

        # Success is read from the wrapper's info dict and treated as
        # terminal even when the env has not flagged `done` yet.
        is_success = bool(info.get("success", False))
        terminated = done or is_success
        info.update({"task": self.task, "done": done, "is_success": is_success})

        observation = self._format_raw_obs(raw_obs)
        if terminated:
            # Record the episode outcome under "final_info", then reset in
            # place. NOTE(review): the observation returned below is the
            # terminal one captured *before* this reset — the fresh
            # post-reset observation is discarded. Presumably the outer
            # vector wrapper relies on this autoreset-inside-step behavior;
            # confirm, since plain Gymnasium envs normally reset only via
            # an explicit reset() call.
            info["final_info"] = {
                "task": self.task,
                "done": bool(done),
                "is_success": bool(is_success),
            }
            self.reset()

        return observation, reward, terminated, truncated, info
+
+ def close(self):
+ if self._env is not None:
+ self._env.close()
+
+
def _make_env_fns(
    *,
    task: str,
    n_envs: int,
    camera_names: list[str],
    obs_type: str,
    render_mode: str,
    observation_width: int,
    observation_height: int,
    visualization_width: int,
    visualization_height: int,
    split: str | None,
    episode_length: int | None,
    obj_registries: Sequence[str],
) -> list[Callable[[], RoboCasaEnv]]:
    """Build ``n_envs`` factory callables for a single task.

    Each factory is bound to a distinct ``episode_index`` in
    ``0..n_envs-1`` so that ``RoboCasaEnv.reset()`` can derive a
    per-worker seed series from the user-provided seed.
    """

    def _factory(idx: int) -> RoboCasaEnv:
        return RoboCasaEnv(
            task=task,
            camera_name=camera_names,
            obs_type=obs_type,
            render_mode=render_mode,
            observation_width=observation_width,
            observation_height=observation_height,
            visualization_width=visualization_width,
            visualization_height=visualization_height,
            split=split,
            episode_length=episode_length,
            obj_registries=obj_registries,
            episode_index=idx,
        )

    # partial() binds idx eagerly, avoiding the late-binding pitfall a
    # bare lambda in a loop would have.
    return [partial(_factory, idx) for idx in range(n_envs)]
+
+
def create_robocasa_envs(
    task: str,
    n_envs: int,
    gym_kwargs: dict[str, Any] | None = None,
    camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
    episode_length: int | None = None,
    obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
) -> dict[str, dict[int, Any]]:
    """Build vectorized RoboCasa365 environments, one vec env per task.

    ``task`` accepts a single task name (e.g. ``CloseFridge``), a
    comma-separated list (``CloseFridge,PickPlaceCoffee``), or a
    benchmark-group shortcut (``atomic_seen``, ``composite_seen``,
    ``composite_unseen``, ``pretrain50``, ``pretrain100``, ``pretrain200``,
    ``pretrain300``) that expands to the upstream task list and
    auto-selects the dataset ``split`` ("target" or "pretrain").

    Returns:
        dict[task_name][task_id] -> vec_env, where each vec env wraps
        exactly ``n_envs`` factories via ``env_cls([...])``.

    Raises:
        ValueError: if ``env_cls`` is missing/not callable, or ``n_envs``
            is not a positive int.
    """
    if env_cls is None or not callable(env_cls):
        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
    if not isinstance(n_envs, int) or n_envs <= 0:
        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")

    # Consume the known rendering/observation options; remaining entries
    # of gym_kwargs are not forwarded anywhere.
    kwargs = dict(gym_kwargs or {})
    obs_type = kwargs.pop("obs_type", "pixels_agent_pos")
    render_mode = kwargs.pop("render_mode", "rgb_array")
    observation_width = kwargs.pop("observation_width", 256)
    observation_height = kwargs.pop("observation_height", 256)
    visualization_width = kwargs.pop("visualization_width", 512)
    visualization_height = kwargs.pop("visualization_height", 512)
    split = kwargs.pop("split", None)

    camera_names = parse_camera_names(camera_name)
    task_names, group_split = _resolve_tasks(str(task))
    # A group shortcut implies a split; apply it only when the caller did
    # not request one explicitly.
    if split is None and group_split is not None:
        split = group_split

    logger.info(
        "Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d",
        task_names,
        split,
        n_envs,
    )

    is_async = env_cls is gym.vector.AsyncVectorEnv

    # Spaces/metadata captured from the first lazy vec env are handed to
    # the later ones (async path only).
    cached_obs_space: spaces.Space | None = None
    cached_act_space: spaces.Space | None = None
    cached_metadata: dict[str, Any] | None = None
    out: dict[str, dict[int, Any]] = defaultdict(dict)

    for task_name in task_names:
        factory_fns = _make_env_fns(
            task=task_name,
            n_envs=n_envs,
            camera_names=camera_names,
            obs_type=obs_type,
            render_mode=render_mode,
            observation_width=observation_width,
            observation_height=observation_height,
            visualization_width=visualization_width,
            visualization_height=visualization_height,
            split=split,
            episode_length=episode_length,
            obj_registries=obj_registries,
        )

        if not is_async:
            out[task_name][0] = env_cls(factory_fns)
        else:
            lazy = _LazyAsyncVectorEnv(factory_fns, cached_obs_space, cached_act_space, cached_metadata)
            if cached_obs_space is None:
                cached_obs_space = lazy.observation_space
                cached_act_space = lazy.action_space
                cached_metadata = lazy.metadata
            out[task_name][0] = lazy
        logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)

    return {name: dict(task_map) for name, task_map in out.items()}
diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py
index 9b915713d..6e6f352e9 100644
--- a/src/lerobot/envs/utils.py
+++ b/src/lerobot/envs/utils.py
@@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape
from .configs import EnvConfig
+def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
+ """Normalize ``camera_name`` into a non-empty list of strings.
+
+ Accepts a comma-separated string (``"cam_a,cam_b"``) or a sequence of
+ strings (tuples/lists). Whitespace is stripped; empty entries are
+ dropped. Raises ``TypeError`` for unsupported input types and
+ ``ValueError`` when the normalized list is empty.
+ """
+ if isinstance(camera_name, str):
+ cams = [c.strip() for c in camera_name.split(",") if c.strip()]
+ elif isinstance(camera_name, (list | tuple)):
+ cams = [str(c).strip() for c in camera_name if str(c).strip()]
+ else:
+ raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
+ if not cams:
+ raise ValueError("camera_name resolved to an empty list.")
+ return cams
+
+
def _convert_nested_dict(d):
result = {}
for k, v in d.items():