From aca02ff24c84485e4cea91b9720026dff2625cf8 Mon Sep 17 00:00:00 2001
From: pepijn223 <pepijn223@hf.co>
Date: Fri, 5 Jun 2026 13:47:43 +0200
Subject: [PATCH] fix(robocasa): align env state/action order to
 openpi/robocasa convention

LeRobot's RoboCasaEnv used a divergent flat state/action layout vs the
robocasa package (robocasa.utils.env_utils.convert_action) and the openpi
robocasa pipeline. This scrambles I/O when using openpi-convention checkpoints
(e.g. the JAX->PyTorch->LeRobot converted pi05 robocasa model: CloseFridge
20% -> 60% once both orders match openpi).

- convert_action: ee_pos(3)+ee_rot(3)+gripper(1)+base_motion(4)+control_mode(1)
- observation.state: ee_pos_rel(3)+ee_rot_rel(4)+base_pos(3)+base_rot(4)+gripper(2)

Matches openpi examples/robocasa/main.py + RobocasaInputs ordering.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/lerobot/envs/robocasa.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/lerobot/envs/robocasa.py b/src/lerobot/envs/robocasa.py
index a84a7c766..5d8932f03 100644
--- a/src/lerobot/envs/robocasa.py
+++ b/src/lerobot/envs/robocasa.py
@@ -33,8 +33,8 @@ logger = logging.getLogger(__name__)
 
 # Dimensions for the flat action/state vectors used by the LeRobot wrapper.
 # These correspond to the PandaOmron robot in RoboCasa365.
-OBS_STATE_DIM = 16  # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
-ACTION_DIM = 12  # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+OBS_STATE_DIM = 16  # ee_pos_rel(3) + ee_quat_rel(4) + base_pos(3) + base_quat(4) + gripper_qpos(2)
+ACTION_DIM = 12  # ee_pos(3) + ee_rot(3) + gripper(1) + base_motion(4) + control_mode(1)
 ACTION_LOW = -1.0
 ACTION_HIGH = 1.0
 
@@ -101,14 +101,15 @@ def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
 def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
     """Split a flat (12,) action vector into a RoboCasa action dict.
 
-    Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
+    Layout (openpi / robocasa.utils.env_utils.convert_action order):
+        ee_pos(3) + ee_rot(3) + gripper(1) + base_motion(4) + control_mode(1)
     """
     return {
-        "action.base_motion": flat_action[0:4],
-        "action.control_mode": flat_action[4:5],
-        "action.end_effector_position": flat_action[5:8],
-        "action.end_effector_rotation": flat_action[8:11],
-        "action.gripper_close": flat_action[11:12],
+        "action.end_effector_position": flat_action[0:3],
+        "action.end_effector_rotation": flat_action[3:6],
+        "action.gripper_close": flat_action[6:7],
+        "action.base_motion": flat_action[7:11],
+        "action.control_mode": flat_action[11:12],
     }
 
 
@@ -230,12 +231,14 @@ class RoboCasaEnv(gym.Env):
             return {"pixels": images}
 
         # `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
+        # openpi state order: ee first, then base, then gripper (matches the
+        # openpi robocasa pipeline / examples/robocasa/main.py state layout).
         agent_pos = np.concatenate(
             [
-                raw_obs.get("state.base_position", np.zeros(3)),
-                raw_obs.get("state.base_rotation", np.zeros(4)),
                 raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
                 raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
+                raw_obs.get("state.base_position", np.zeros(3)),
+                raw_obs.get("state.base_rotation", np.zeros(4)),
                 raw_obs.get("state.gripper_qpos", np.zeros(2)),
             ],
             axis=-1,