diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 74234ed5b..8ac040f67 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -366,7 +366,7 @@ jobs: --eval.n_episodes=1 \ --eval.use_async_envs=false \ --policy.device=cuda \ - '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.front_camera\": \"observation.images.camera2\", \"observation.images.left_wrist\": \"observation.images.camera3\"}' \ + '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \ --output_dir=/tmp/eval-artifacts " diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index 17edeb903..9f27a49da 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -594,11 +594,14 @@ class RoboTwinEnvConfig(EnvConfig): episode_length: int = 300 obs_type: str = "pixels_agent_pos" render_mode: str = "rgb_array" - # Comma-separated list of cameras to include in observations. - # Available: head_camera, front_camera, left_wrist, right_wrist - camera_names: str = "head_camera,front_camera,left_wrist,right_wrist" - observation_height: int = 480 - observation_width: int = 640 + # Available cameras from RoboTwin's aloha-agilex embodiment: head_camera + # (torso-mounted) + left_camera / right_camera (wrists). + camera_names: str = "head_camera,left_camera,right_camera" + # Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml). + # Gym's vector-env concatenate pre-allocates buffers of this shape, so it + # must equal what SAPIEN actually renders. + observation_height: int = 240 + observation_width: int = 320 features: dict[str, PolicyFeature] = field( default_factory=lambda: { ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)), @@ -608,9 +611,8 @@ class RoboTwinEnvConfig(EnvConfig): default_factory=lambda: { ACTION: ACTION, "pixels/head_camera": f"{OBS_IMAGES}.head_camera", - "pixels/front_camera": f"{OBS_IMAGES}.front_camera", - "pixels/left_wrist": f"{OBS_IMAGES}.left_wrist", - "pixels/right_wrist": f"{OBS_IMAGES}.right_wrist", + "pixels/left_camera": f"{OBS_IMAGES}.left_camera", + "pixels/right_camera": f"{OBS_IMAGES}.right_camera", "agent_pos": OBS_STATE, } ) diff --git a/src/lerobot/envs/robotwin.py b/src/lerobot/envs/robotwin.py index e18aea7dd..6f370d1d4 100644 --- a/src/lerobot/envs/robotwin.py +++ b/src/lerobot/envs/robotwin.py @@ -228,8 +228,8 @@ class RoboTwinEnv(gym.Env): episode_index: int = 0, n_envs: int = 1, camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES, - observation_height: int = 480, - observation_width: int = 640, + observation_height: int | None = None, + observation_width: int | None = None, episode_length: int = DEFAULT_EPISODE_LENGTH, render_mode: str = "rgb_array", ): @@ -240,20 +240,24 @@ class RoboTwinEnv(gym.Env): self.episode_index = episode_index self._reset_stride = n_envs self.camera_names = list(camera_names) - self.observation_height = observation_height - self.observation_width = observation_width + # Match the declared observation_space to the camera resolution in + # RoboTwin's YAML — otherwise gym.vector's concatenate pre-allocates + # a buffer of the wrong shape and np.stack raises ValueError. + setup_kwargs = _load_robotwin_setup_kwargs(task_name) + self.observation_height = observation_height or setup_kwargs["head_camera_h"] + self.observation_width = observation_width or setup_kwargs["head_camera_w"] self.episode_length = episode_length self.render_mode = render_mode self._env: Any | None = None # deferred — created on first reset() inside worker self._step_count: int = 0 - self._black_frame = np.zeros((observation_height, observation_width, 3), dtype=np.uint8) + self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8) image_spaces = { cam: spaces.Box( low=0, high=255, - shape=(observation_height, observation_width, 3), + shape=(self.observation_height, self.observation_width, 3), dtype=np.uint8, ) for cam in self.camera_names