diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 70258c7f6..00806f990 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -425,3 +425,103 @@ jobs: name: robotwin-metrics path: /tmp/robotwin-artifacts/metrics.json if-no-files-found: warn + + # ── ROBOCASA365 ────────────────────────────────────────────────────────── + # Isolated image: robocasa + robosuite installed manually as editable + # clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins + # `lerobot==0.3.3`, which would shadow this repo's lerobot). + robocasa-integration-test: + name: RoboCasa365 — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboCasa365 benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robocasa + push: false + load: true + tags: lerobot-benchmark-robocasa:ci + + - name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robocasa-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e MUJOCO_GL=egl \ + lerobot-benchmark-robocasa:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env robocasa \ + --task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboCasa365 artifacts from container + if: always() + run: | + mkdir -p /tmp/robocasa-artifacts + docker cp robocasa-eval:/tmp/eval-artifacts/. 
/tmp/robocasa-artifacts/ 2>/dev/null || true + docker rm -f robocasa-eval || true + + - name: Parse RoboCasa365 eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robocasa-artifacts \ + --env robocasa \ + --task atomic_smoke_10 \ + --policy lerobot/smolvla_robocasa + + - name: Upload RoboCasa365 rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocasa-rollout-video + path: /tmp/robocasa-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboCasa365 eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocasa-metrics + path: /tmp/robocasa-artifacts/metrics.json + if-no-files-found: warn diff --git a/docker/Dockerfile.benchmark.robocasa b/docker/Dockerfile.benchmark.robocasa new file mode 100644 index 000000000..9de1612cb --- /dev/null +++ b/docker/Dockerfile.benchmark.robocasa @@ -0,0 +1,71 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboCasa365 integration tests. +# Extends the nightly GPU image (which already has all extras installed) +# with the PR's source code and RoboCasa-specific asset setup. +# +# Build: docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa . +# Run: docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ... 
+ +FROM huggingface/lerobot-gpu:latest + +# Install robocasa + robosuite as editable clones. pip-installing from git +# omits data files like robocasa/models/assets/box_links/box_links_assets.json +# (not declared in package_data), which download_kitchen_assets needs at import. +# +# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3` +# in install_requires, which would shadow the editable lerobot baked into +# this image. We install robocasa's actual runtime deps explicitly instead. +# Pinned SHAs for reproducible benchmark runs. Bump when you need an +# upstream fix; don't rely on `main`/`master` drift. +ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681 +ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab +RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \ + git -C ~/robocasa checkout ${ROBOCASA_SHA} && \ + git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \ + git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \ + uv pip install --no-cache -e ~/robocasa --no-deps && \ + uv pip install --no-cache -e ~/robosuite && \ + uv pip install --no-cache \ + "numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \ + "pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \ + "pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \ + "imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \ + "tianshou==0.4.10" "gymnasium==1.2.3" + +# Set up robocasa macros and download kitchen assets. We need: +# - tex : base environment textures +# - tex_generative : AI-generated textures; kitchen fixture XMLs embed +# refs to generative_textures/wall/tex*.png +# unconditionally, so MjModel.from_xml_string fails +# at reset time without them (even if the env is +# constructed with generative_textures=None). +# - fixtures_lw : lightwheel kitchen fixtures (fridge, counters...) 
+# - objs_lw : lightwheel object meshes (stools, misc props) +# We skip the objaverse/aigen object packs (~30GB combined) by pairing +# this with --env.obj_registries=["lightwheel"] on the lerobot side. +# The download script prompts interactively, so pipe 'y' to auto-accept. +RUN python -m robocasa.scripts.setup_macros && \ + yes y | python -m robocasa.scripts.download_kitchen_assets \ + --type tex tex_generative fixtures_lw objs_lw + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +# Re-install lerobot editably so the new source (with RoboCasaEnv registration) +# replaces the stale package baked into the nightly image. +RUN uv pip install --no-cache --no-deps -e . + +CMD ["/bin/bash"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 13405decf..bb0dad1bf 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -81,6 +81,8 @@ title: Meta-World - local: robotwin title: RoboTwin 2.0 + - local: robocasa + title: RoboCasa365 - local: envhub_isaaclab_arena title: NVIDIA IsaacLab Arena Environments title: "Benchmarks" diff --git a/docs/source/robocasa.mdx b/docs/source/robocasa.mdx new file mode 100644 index 000000000..f6a784e72 --- /dev/null +++ b/docs/source/robocasa.mdx @@ -0,0 +1,188 @@ +# RoboCasa365 + +[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base). 
+ +- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523) +- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa) +- Project website: [robocasa.ai](https://robocasa.ai) +- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) +- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge) + +RoboCasa365 benchmark overview + +## Available tasks + +RoboCasa365 organizes its 365 tasks into two families and three upstream benchmark groups that LeRobot exposes as first-class `--env.task` shortcuts: + +| Family | Tasks | Description | +| --------- | ----- | ------------------------------------------------------------------------------- | +| Atomic | ~65 | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control | +| Composite | ~300 | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc. | + +**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`. + +**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more. + +`--env.task` accepts three forms: + +- a single task name (`CloseFridge`) +- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`) +- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`). + +## Installation + +RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. 
LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its shadowed `lerobot` pin): + +```bash +# After following the standard LeRobot installation instructions. + +git clone https://github.com/robocasa/robocasa.git ~/robocasa +git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite +pip install -e ~/robocasa --no-deps +pip install -e ~/robosuite + +# Robocasa's runtime deps (the ones its setup.py would have pulled, minus +# the bad lerobot pin). +pip install numpy numba scipy mujoco pygame Pillow opencv-python \ + pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \ + tianshou gymnasium + +python -m robocasa.scripts.setup_macros +# Lightweight assets (lightwheel object meshes + textures). Enough for +# the default env out of the box. +python -m robocasa.scripts.download_kitchen_assets \ + --type tex tex_generative fixtures_lw objs_lw +# Optional: full objaverse/aigen registries (~30GB) for richer object +# variety. Enable at eval time via --env.obj_registries (see below). +# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse +``` + + +RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +### Object registries + +By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime: + +```bash +--env.obj_registries='[objaverse,lightwheel]' +``` + +## Evaluation + +All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). 
The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on. + +### Single-task evaluation (recommended for quick iteration) + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}' +``` + +### Multi-task evaluation + +Pass a comma-separated list of tasks: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}' +``` + +### Benchmark-group evaluation + +Run an entire upstream group (e.g. 
all 18 `atomic_seen` tasks with `split=target`): + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=atomic_seen \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}' +``` + +### Recommended evaluation episodes + +**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results. + +## Policy inputs and outputs + +**Observations** (raw RoboCasa camera names are preserved verbatim): + +- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos) +- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8 +- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8 +- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D). + +## Training + +### Single-task example + +A ready-to-use single-task dataset is on the Hub: +[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge). 
+ +Fine-tune a SmolVLA base on `CloseFridge`: + +```bash +lerobot-train \ + --policy.type=smolvla \ + --policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \ + --policy.load_vlm_weights=true \ + --policy.push_to_hub=true \ + --dataset.repo_id=pepijn223/robocasa_CloseFridge \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --output_dir=./outputs/smolvla_robocasa_CloseFridge \ + --steps=100000 \ + --batch_size=4 \ + --eval_freq=5000 \ + --eval.batch_size=1 \ + --eval.n_episodes=5 \ + --save_freq=10000 +``` + +Evaluate the resulting checkpoint: + +```bash +lerobot-eval \ + --policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --eval.batch_size=1 \ + --eval.n_episodes=20 +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack. diff --git a/pyproject.toml b/pyproject.toml index 6e4993c85..10789b0f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -212,6 +212,11 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"] pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"] metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"] +# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins +# `lerobot==0.3.3` in install_requires, which cyclically shadows our own +# workspace `lerobot` and makes the graph unsolvable under any resolver +# (uv, pip). 
Install it manually alongside robosuite — see +# docs/source/robocasa.mdx for the recipe. # All all = [ diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py index 9035bacb0..c9216e02d 100644 --- a/scripts/ci/extract_task_descriptions.py +++ b/scripts/ci/extract_task_descriptions.py @@ -75,6 +75,23 @@ def _robotwin_descriptions(task_names: str) -> dict[str, str]: return out +def _robocasa_descriptions(task_spec: str) -> dict[str, str]: + """For each task in the comma-separated list, emit a cleaned-name label. + + RoboCasa episodes carry their language instruction in the env's + `ep_meta['lang']`, populated per reset. Pulling it requires spinning + up the full kitchen env per task (~seconds each); we use the task + name as the key here and let the eval's episode info carry the + actual instruction. + """ + out: dict[str, str] = {} + for task in (t.strip() for t in task_spec.split(",") if t.strip()): + # Split CamelCase into words: "CloseFridge" → "close fridge". 
+ label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
+ out[f"{task}_0"] = label or task
+ return out
+
+
 def main() -> int:
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
@@ -90,6 +107,8 @@ def main() -> int:
 descriptions = _metaworld_descriptions(args.task)
 elif args.env == "robotwin":
 descriptions = _robotwin_descriptions(args.task)
+ elif args.env == "robocasa":
+ descriptions = _robocasa_descriptions(args.task)
 else:
 print(
 f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 108a88f1d..4f93a26a7 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -496,6 +496,81 @@ class MetaworldEnv(EnvConfig):
 )
+
+@EnvConfig.register_subclass("robocasa")
+@dataclass
+class RoboCasaEnv(EnvConfig):
+ task: str = "CloseFridge"
+ fps: int = 20
+ episode_length: int = 1000
+ obs_type: str = "pixels_agent_pos"
+ render_mode: str = "rgb_array"
+ camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
+ observation_height: int = 256
+ observation_width: int = 256
+ visualization_height: int = 512
+ visualization_width: int = 512
+ split: str | None = None
+ # Object-mesh registries to sample from. Upstream default is
+ # ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image
+ # only ships the lightwheel pack. Override to include objaverse once
+ # you've run `python -m robocasa.scripts.download_kitchen_assets
+ # --type objs_objaverse` locally. 
+ obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"]) + features: dict[str, PolicyFeature] = field( + default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))} + ) + features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE}) + + def __post_init__(self): + if self.obs_type not in ("pixels", "pixels_agent_pos"): + raise ValueError(f"Unsupported obs_type: {self.obs_type}") + + # Preserve raw RoboCasa camera names end-to-end (e.g. + # `observation.images.robot0_agentview_left`). This matches the + # naming convention used by the RoboCasa datasets on the Hub, so + # trained policies don't need a `--rename_map` at eval time. + cams = [c.strip() for c in self.camera_name.split(",") if c.strip()] + for cam in cams: + self.features[f"pixels/{cam}"] = PolicyFeature( + type=FeatureType.VISUAL, + shape=(self.observation_height, self.observation_width, 3), + ) + self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}" + + if self.obs_type == "pixels_agent_pos": + self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,)) + + @property + def gym_kwargs(self) -> dict: + kwargs: dict[str, Any] = { + "obs_type": self.obs_type, + "render_mode": self.render_mode, + "observation_height": self.observation_height, + "observation_width": self.observation_width, + "visualization_height": self.visualization_height, + "visualization_width": self.visualization_width, + } + if self.split is not None: + kwargs["split"] = self.split + return kwargs + + def create_envs(self, n_envs: int, use_async_envs: bool = False): + from .robocasa import create_robocasa_envs + + if self.task is None: + raise ValueError("RoboCasaEnv requires a task to be specified") + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + return create_robocasa_envs( + task=self.task, + n_envs=n_envs, + camera_name=self.camera_name, + gym_kwargs=self.gym_kwargs, + env_cls=env_cls, + 
episode_length=self.episode_length, + obj_registries=tuple(self.obj_registries), + ) + + @EnvConfig.register_subclass("isaaclab_arena") @dataclass class IsaaclabArenaEnv(HubEnvConfig): diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index f357d4eef..c9aba71bb 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -31,20 +31,7 @@ from libero.libero.envs import OffScreenRenderEnv from lerobot.types import RobotObservation -from .utils import _LazyAsyncVectorEnv - - -def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: - """Normalize camera_name into a non-empty list of strings.""" - if isinstance(camera_name, str): - cams = [c.strip() for c in camera_name.split(",") if c.strip()] - elif isinstance(camera_name, (list | tuple)): - cams = [str(c).strip() for c in camera_name if str(c).strip()] - else: - raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}") - if not cams: - raise ValueError("camera_name resolved to an empty list.") - return cams +from .utils import _LazyAsyncVectorEnv, parse_camera_names def _get_suite(name: str) -> benchmark.Benchmark: @@ -128,7 +115,7 @@ class LiberoEnv(gym.Env): self.visualization_width = visualization_width self.visualization_height = visualization_height self.init_states = init_states - self.camera_name = _parse_camera_names( + self.camera_name = parse_camera_names( camera_name ) # agentview_image (main) or robot0_eye_in_hand_image (wrist) @@ -437,7 +424,7 @@ def create_libero_envs( gym_kwargs = dict(gym_kwargs or {}) task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks - camera_names = _parse_camera_names(camera_name) + camera_names = parse_camera_names(camera_name) suite_names = [s.strip() for s in str(task).split(",") if s.strip()] if not suite_names: raise ValueError("`task` must contain at least one LIBERO suite name.") diff --git a/src/lerobot/envs/robocasa.py b/src/lerobot/envs/robocasa.py 
new file mode 100644 index 000000000..a84a7c766 --- /dev/null +++ b/src/lerobot/envs/robocasa.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import logging +from collections import defaultdict +from collections.abc import Callable, Sequence +from functools import partial +from typing import Any + +import gymnasium as gym +import numpy as np +from gymnasium import spaces + +from lerobot.types import RobotObservation + +from .utils import _LazyAsyncVectorEnv, parse_camera_names + +logger = logging.getLogger(__name__) + +# Dimensions for the flat action/state vectors used by the LeRobot wrapper. +# These correspond to the PandaOmron robot in RoboCasa365. +OBS_STATE_DIM = 16 # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2) +ACTION_DIM = 12 # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1) +ACTION_LOW = -1.0 +ACTION_HIGH = 1.0 + +# Default PandaOmron cameras. We surface these raw names directly as +# `observation.images.` so the LeRobot dataset/policy keys match +# RoboCasa's native convention (no implicit renaming). +DEFAULT_CAMERAS = [ + "robot0_agentview_left", + "robot0_eye_in_hand", + "robot0_agentview_right", +] + +# Object-mesh registries to sample from. 
RoboCasa's upstream default is +# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and +# most users — including our CI image — only download the lightwheel pack +# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object +# category has zero candidates in every registry, robocasa crashes with +# `ValueError: Probabilities contain NaN` (0/0 divide in the probability +# normalization). Restricting to registries that are actually on disk +# avoids the NaN and matches what the asset download provides. +DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",) + +# Task-group shortcuts accepted as `--env.task`. When the user passes one of +# these names, we expand it to the upstream RoboCasa task list and auto-set +# the dataset split. Individual task names (optionally comma-separated) still +# take precedence; this only triggers on an exact group-name match. +_TASK_GROUP_SPLITS = { + "atomic_seen": "target", + "composite_seen": "target", + "composite_unseen": "target", + "pretrain50": "pretrain", + "pretrain100": "pretrain", + "pretrain200": "pretrain", + "pretrain300": "pretrain", +} + + +def _resolve_tasks(task: str) -> tuple[list[str], str | None]: + """Resolve a `--env.task` value to (task_names, split_override). + + If `task` is a known task-group name (e.g. `atomic_seen`, `pretrain100`), + expand it via `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS` + and return the matching split. Otherwise treat `task` as a single task or + comma-separated list and leave the split untouched (None). + """ + key = task.strip() + if key in _TASK_GROUP_SPLITS: + from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS + + combined = {**TARGET_TASKS, **PRETRAINING_TASKS} + if key not in combined: + raise ValueError( + f"Task group '{key}' is not available in this version of robocasa. " + f"Known groups: {sorted(combined.keys())}." 
+ ) + return list(combined[key]), _TASK_GROUP_SPLITS[key] + + names = [t.strip() for t in task.split(",") if t.strip()] + if not names: + raise ValueError("`task` must contain at least one RoboCasa task name.") + return names, None + + +def convert_action(flat_action: np.ndarray) -> dict[str, Any]: + """Split a flat (12,) action vector into a RoboCasa action dict. + + Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1) + """ + return { + "action.base_motion": flat_action[0:4], + "action.control_mode": flat_action[4:5], + "action.end_effector_position": flat_action[5:8], + "action.end_effector_rotation": flat_action[8:11], + "action.gripper_close": flat_action[11:12], + } + + +class RoboCasaEnv(gym.Env): + """LeRobot gym.Env wrapper for RoboCasa365 kitchen environments. + + Wraps RoboCasaGymEnv from the robocasa package and converts its + dict-based observations and actions into the flat arrays LeRobot expects. + Raw RoboCasa camera names are preserved verbatim under `pixels/`. 
+ """ + + metadata = {"render_modes": ["rgb_array"], "render_fps": 20} + + def __init__( + self, + task: str, + camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS), + obs_type: str = "pixels_agent_pos", + render_mode: str = "rgb_array", + observation_width: int = 256, + observation_height: int = 256, + visualization_width: int = 512, + visualization_height: int = 512, + split: str | None = None, + episode_length: int | None = None, + obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES, + episode_index: int = 0, + ): + super().__init__() + self.task = task + self.obs_type = obs_type + self.render_mode = render_mode + self.observation_width = observation_width + self.observation_height = observation_height + self.visualization_width = visualization_width + self.visualization_height = visualization_height + self.split = split + self.obj_registries = tuple(obj_registries) + # Per-worker index (0..n_envs-1) used to spread the user-provided + # seed across factories so each sub-env explores a distinct layout + # even when the same seed is passed to `reset()`. + self.episode_index = int(episode_index) + + self.camera_name = parse_camera_names(camera_name) + + self._max_episode_steps = episode_length if episode_length is not None else 1000 + + # Deferred — created on first reset() inside the worker subprocess + # to avoid inheriting stale GPU/EGL contexts across fork(). 
+ self._env: Any = None + self.task_description = "" + + images = { + cam: spaces.Box( + low=0, + high=255, + shape=(self.observation_height, self.observation_width, 3), + dtype=np.uint8, + ) + for cam in self.camera_name + } + + if self.obs_type == "pixels": + self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)}) + elif self.obs_type == "pixels_agent_pos": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(images), + "agent_pos": spaces.Box( + low=-np.inf, + high=np.inf, + shape=(OBS_STATE_DIM,), + dtype=np.float32, + ), + } + ) + else: + raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.") + + self.action_space = spaces.Box( + low=ACTION_LOW, + high=ACTION_HIGH, + shape=(ACTION_DIM,), + dtype=np.float32, + ) + + def _ensure_env(self) -> None: + """Create the underlying RoboCasaGymEnv on first use. + + Called inside the worker subprocess after fork(), so each worker gets + its own clean rendering context rather than inheriting a stale one from + the parent process (which causes crashes with AsyncVectorEnv). + """ + if self._env is not None: + return + from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv + + # RoboCasaGymEnv defaults split="test", which create_env rejects + # (only None/"all"/"pretrain"/"target" are valid). Always pass a + # valid value so we don't hit that default. Extra kwargs are + # forwarded to the underlying kitchen env via create_env/robosuite.make. + self._env = RoboCasaGymEnv( + env_name=self.task, + camera_widths=self.observation_width, + camera_heights=self.observation_height, + split=self.split if self.split is not None else "all", + obj_registries=self.obj_registries, + ) + + ep_meta = self._env.env.get_ep_meta() + self.task_description = ep_meta.get("lang", self.task) + + def _format_raw_obs(self, raw_obs: dict) -> RobotObservation: + """Convert RoboCasaGymEnv observation dict to LeRobot format.""" + # RoboCasaGymEnv emits camera frames under "video.". 
+ images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs} + + if self.obs_type == "pixels": + return {"pixels": images} + + # `state.*` keys come from PandaOmronKeyConverter inside the wrapper. + agent_pos = np.concatenate( + [ + raw_obs.get("state.base_position", np.zeros(3)), + raw_obs.get("state.base_rotation", np.zeros(4)), + raw_obs.get("state.end_effector_position_relative", np.zeros(3)), + raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)), + raw_obs.get("state.gripper_qpos", np.zeros(2)), + ], + axis=-1, + ).astype(np.float32) + + return {"pixels": images, "agent_pos": agent_pos} + + def render(self) -> np.ndarray: + self._ensure_env() + assert self._env is not None + return self._env.render() + + def reset(self, seed=None, **kwargs): + self._ensure_env() + assert self._env is not None + super().reset(seed=seed) + # Spread the seed across workers so n_envs factories don't all + # roll the same scene. With an explicit user seed we shift it by + # episode_index; with no seed we fall back to episode_index so + # each worker is still distinct rather than inheriting the same + # global RNG state. 
+ worker_seed = seed + self.episode_index if seed is not None else self.episode_index + raw_obs, info = self._env.reset(seed=worker_seed) + + ep_meta = self._env.env.get_ep_meta() + self.task_description = ep_meta.get("lang", self.task) + + observation = self._format_raw_obs(raw_obs) + info = {"is_success": False} + return observation, info + + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: + self._ensure_env() + assert self._env is not None + if action.ndim != 1: + raise ValueError( + f"Expected action to be 1-D (shape (action_dim,)), " + f"but got shape {action.shape} with ndim={action.ndim}" + ) + + action_dict = convert_action(action) + raw_obs, reward, done, truncated, info = self._env.step(action_dict) + + is_success = bool(info.get("success", False)) + terminated = done or is_success + info.update({"task": self.task, "done": done, "is_success": is_success}) + + observation = self._format_raw_obs(raw_obs) + if terminated: + info["final_info"] = { + "task": self.task, + "done": bool(done), + "is_success": bool(is_success), + } + self.reset() + + return observation, reward, terminated, truncated, info + + def close(self): + if self._env is not None: + self._env.close() + + +def _make_env_fns( + *, + task: str, + n_envs: int, + camera_names: list[str], + obs_type: str, + render_mode: str, + observation_width: int, + observation_height: int, + visualization_width: int, + visualization_height: int, + split: str | None, + episode_length: int | None, + obj_registries: Sequence[str], +) -> list[Callable[[], RoboCasaEnv]]: + """Build n_envs factory callables for a single task. + + Each factory carries a distinct ``episode_index`` (``0..n_envs-1``) so + ``RoboCasaEnv.reset()`` can derive a per-worker seed series from the + user-provided seed. 
+ """ + + def _make_env(episode_index: int) -> RoboCasaEnv: + return RoboCasaEnv( + task=task, + camera_name=camera_names, + obs_type=obs_type, + render_mode=render_mode, + observation_width=observation_width, + observation_height=observation_height, + visualization_width=visualization_width, + visualization_height=visualization_height, + split=split, + episode_length=episode_length, + obj_registries=obj_registries, + episode_index=episode_index, + ) + + return [partial(_make_env, i) for i in range(n_envs)] + + +def create_robocasa_envs( + task: str, + n_envs: int, + gym_kwargs: dict[str, Any] | None = None, + camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS), + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, + episode_length: int | None = None, + obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES, +) -> dict[str, dict[int, Any]]: + """Create vectorized RoboCasa365 environments with a consistent return shape. + + Returns: + dict[task_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories) + + `task` can be: + - a single task name (e.g. `CloseFridge`) + - a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`) + - a benchmark-group shortcut (`atomic_seen`, `composite_seen`, + `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, + `pretrain300`), which auto-expands to the upstream task list and + auto-sets the dataset `split` ("target" or "pretrain"). 
+ """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + gym_kwargs = dict(gym_kwargs or {}) + obs_type = gym_kwargs.pop("obs_type", "pixels_agent_pos") + render_mode = gym_kwargs.pop("render_mode", "rgb_array") + observation_width = gym_kwargs.pop("observation_width", 256) + observation_height = gym_kwargs.pop("observation_height", 256) + visualization_width = gym_kwargs.pop("visualization_width", 512) + visualization_height = gym_kwargs.pop("visualization_height", 512) + split = gym_kwargs.pop("split", None) + + camera_names = parse_camera_names(camera_name) + task_names, group_split = _resolve_tasks(str(task)) + if group_split is not None and split is None: + split = group_split + + logger.info( + "Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d", + task_names, + split, + n_envs, + ) + + is_async = env_cls is gym.vector.AsyncVectorEnv + + cached_obs_space: spaces.Space | None = None + cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None + out: dict[str, dict[int, Any]] = defaultdict(dict) + + for task_name in task_names: + fns = _make_env_fns( + task=task_name, + n_envs=n_envs, + camera_names=camera_names, + obs_type=obs_type, + render_mode=render_mode, + observation_width=observation_width, + observation_height=observation_height, + visualization_width=visualization_width, + visualization_height=visualization_height, + split=split, + episode_length=episode_length, + obj_registries=obj_registries, + ) + + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + out[task_name][0] = lazy + else: + 
out[task_name][0] = env_cls(fns) + logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs) + + return {name: dict(task_map) for name, task_map in out.items()} diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index 9b915713d..6e6f352e9 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape from .configs import EnvConfig +def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: + """Normalize ``camera_name`` into a non-empty list of strings. + + Accepts a comma-separated string (``"cam_a,cam_b"``) or a sequence of + strings (tuples/lists). Whitespace is stripped; empty entries are + dropped. Raises ``TypeError`` for unsupported input types and + ``ValueError`` when the normalized list is empty. + """ + if isinstance(camera_name, str): + cams = [c.strip() for c in camera_name.split(",") if c.strip()] + elif isinstance(camera_name, (list | tuple)): + cams = [str(c).strip() for c in camera_name if str(c).strip()] + else: + raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}") + if not cams: + raise ValueError("camera_name resolved to an empty list.") + return cams + + def _convert_nested_dict(d): result = {} for k, v in d.items():