From f40b30202b206f1f21a6e01529d3d95f378661e7 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 14 Apr 2026 18:25:08 +0200 Subject: [PATCH] fix(robotwin): read nested get_obs() output and use aloha-agilex camera names RoboTwin's base_task.get_obs() returns a nested dict: {"observation": {cam: {"rgb": ..., "intrinsic_matrix": ...}}, "joint_action": {"left_arm": ..., "left_gripper": ..., "right_arm": ..., "right_gripper": ..., "vector": np.ndarray}, "endpose": {...}} Our _get_obs was reading raw["{cam}_rgb"] / raw["{cam}"] and raw["joint_action"] as if they were flat, so np.asarray(raw["joint_action"], dtype=np.float64) tripped on a dict and raised TypeError: float() argument must be a string or a real number, not 'dict' Fix: - Pull images from raw["observation"][cam]["rgb"] - Pull joint state from raw["joint_action"]["vector"] (the flat array) - Update the default camera tuple to (head_camera, left_camera, right_camera) to match RoboTwin's actual wrist-camera names (envs/camera/camera.py:135-151) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/lerobot/envs/robotwin.py | 42 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/lerobot/envs/robotwin.py b/src/lerobot/envs/robotwin.py index dc8bc657c..07f9f6202 100644 --- a/src/lerobot/envs/robotwin.py +++ b/src/lerobot/envs/robotwin.py @@ -32,9 +32,8 @@ from lerobot.types import RobotObservation # up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb"). ROBOTWIN_CAMERA_NAMES: tuple[str, ...]
= ( "head_camera", - "front_camera", - "left_wrist", - "right_wrist", + "left_camera", + "right_camera", ) ACTION_DIM = 14 # 7 DOF × 2 arms @@ -278,38 +277,35 @@ class RoboTwinEnv(gym.Env): assert self._env is not None, "_get_obs called before _ensure_env()" raw = self._env.get_obs() + # RoboTwin's get_obs returns a nested dict: + # {"observation": {cam_name: {"rgb": img, "intrinsic_matrix": ...}, ...}, + # "joint_action": {"left_arm": [...], "left_gripper": ..., "vector": np.ndarray}, + # "endpose": {...}} + cameras_raw = raw.get("observation", {}) if isinstance(raw, dict) else {} images: dict[str, np.ndarray] = {} for cam in self.camera_names: - # RoboTwin 2.0 camera keys follow the "_rgb" convention. - # Fall back to the bare name if the suffixed key is absent. - key = f"{cam}_rgb" if f"{cam}_rgb" in raw else cam - if key in raw: - img = np.asarray(raw[key], dtype=np.uint8) - # Ensure HWC and exactly 3 channels + cam_data = cameras_raw.get(cam) + img = cam_data.get("rgb") if isinstance(cam_data, dict) else None + if img is not None: + img = np.asarray(img, dtype=np.uint8) if img.ndim == 2: img = np.stack([img, img, img], axis=-1) elif img.shape[-1] != 3: img = img[..., :3] - if img.shape[0] != self.observation_height or img.shape[1] != self.observation_width: - # Resize is best done outside hot-loop; return as-is and let - # the processor handle it if needed. - pass images[cam] = img else: # Camera not exposed by this task — return a black frame so the # observation shape stays consistent across all tasks. images[cam] = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8) - # Joint state: try common key names used by RoboTwin base_task. 
- joint_state: np.ndarray | None = None - for key in ("joint_action", "qpos", "robot_qpos", "joint_state", "obs"): - if key in raw: - arr = np.asarray(raw[key], dtype=np.float64).flatten() - if arr.size >= ACTION_DIM: - joint_state = arr[:ACTION_DIM] - break - if joint_state is None: - joint_state = np.zeros(ACTION_DIM, dtype=np.float64) + # Joint state: RoboTwin exposes it as raw["joint_action"]["vector"] + # (flat array of left_arm+left_gripper+right_arm+right_gripper). + joint_state = np.zeros(ACTION_DIM, dtype=np.float64) + ja = raw.get("joint_action") if isinstance(raw, dict) else None + if isinstance(ja, dict) and "vector" in ja: + arr = np.asarray(ja["vector"], dtype=np.float64).flatten() + if arr.size >= ACTION_DIM: + joint_state = arr[:ACTION_DIM] return {"pixels": images, "agent_pos": joint_state}