From cc46497f4c77d525a0c5989abe7c5d984dfd0da9 Mon Sep 17 00:00:00 2001
From: "Jade Choghari (jchoghar)" <chogharijade@gmail.com>
Date: Wed, 20 Aug 2025 06:55:05 -0400
Subject: [PATCH] fix renaming issues with cams

---
 examples/test.sh                |  5 +--
 src/lerobot/configs/policies.py |  8 +++-
 src/lerobot/constants.py        |  4 --
 src/lerobot/envs/configs.py     |  4 +-
 src/lerobot/envs/libero.py      | 12 ++++--
 src/lerobot/envs/utils.py       | 66 +++++++++++++++++----------------
 src/lerobot/policies/factory.py |  1 -
 src/lerobot/scripts/eval.py     |  6 +--
 src/lerobot/scripts/train.py    |  2 +
 9 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/examples/test.sh b/examples/test.sh
index 5dfcd581e..d6f29f35e 100644
--- a/examples/test.sh
+++ b/examples/test.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-# Example evaluation script for LeRobot policies
 unset LEROBOT_HOME
 unset HF_LEROBOT_HOME
 # === CONFIGURATION ===
@@ -12,11 +11,11 @@ N_EPISODES=1
 USE_AMP=false
 DEVICE=cuda
 
-# === RUN EVALUATION ===
+# RUN EVALUATION
 python src/lerobot/scripts/eval.py \
     --policy.path="$POLICY_PATH" \
     --env.type="$ENV_TYPE" \
     --eval.batch_size="$BATCH_SIZE" \
     --eval.n_episodes="$N_EPISODES" \
-    --env.multitask_eval=False \
+    --env.multitask_eval=True \
     --env.task=$TASK \
diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py
index f5fa727cf..8ae69fc62 100644
--- a/src/lerobot/configs/policies.py
+++ b/src/lerobot/configs/policies.py
@@ -124,7 +124,13 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
             if ft.type is FeatureType.STATE and ft_name == OBS_STATE:
                 return ft
         return None
-
+        
+    @property
+    def robot_state_feature_key(self) -> PolicyFeature | None:
+        for key, ft in self.input_features.items():
+            if ft.type is FeatureType.STATE:
+                return key
+        return None
     @property
     def env_state_feature(self) -> PolicyFeature | None:
         for _, ft in self.input_features.items():
diff --git a/src/lerobot/constants.py b/src/lerobot/constants.py
index 3d6f9edc3..bc5b2013c 100644
--- a/src/lerobot/constants.py
+++ b/src/lerobot/constants.py
@@ -21,10 +21,6 @@ OBS_ENV_STATE = "observation.environment_state"
 OBS_STATE = "observation.state"
 OBS_IMAGE = "observation.image"
 OBS_IMAGE_2 = "observation.image2"
-OBS_IMAGE = "image"
-OBS_IMAGE_2 = "image2"
-# OBS_IMAGE = "image"
-# OBS_IMAGE_2 = "wrist_image"
 OBS_IMAGES = "observation.images"
 ACTION = "action"
 REWARD = "next.reward"
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 47a23eb04..da3e2d5bf 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -295,8 +295,8 @@ class LiberoEnv(EnvConfig):
         default_factory=lambda: {
             "action": ACTION,
             "agent_pos": OBS_STATE,
-            "pixels/agentview_image": f"observation.images.{OBS_IMAGE}",
-            "pixels/robot0_eye_in_hand_image": f"observation.images.{OBS_IMAGE_2}",
+            "pixels/agentview_image": f"{OBS_IMAGE}",
+            "pixels/robot0_eye_in_hand_image": f"{OBS_IMAGE_2}",
         }
     )
 
diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py
index 10bc4aa9f..b83958d6c 100644
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -180,8 +180,8 @@ class LiberoEnv(gym.Env):
         )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)
         # TODO: jadechoghari, check mapping
         self.camera_name_mapping = {
-            "agentview_image": OBS_IMAGE,
-            "robot0_eye_in_hand_image": OBS_IMAGE_2,
+            "agentview_image": "image",
+            "robot0_eye_in_hand_image": "image2",
         }
 
         self.num_steps_wait = (
@@ -234,10 +234,16 @@ class LiberoEnv(gym.Env):
 
         self.action_space = spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32)
 
-    def render(self):
+    def render1(self):
         raw_obs = self._env.env._get_observations()
         image = self._format_raw_obs(raw_obs)["pixels"][OBS_IMAGE]
         return image
+    def render(self):
+        raw_obs = self._env.env._get_observations()
+        formatted = self._format_raw_obs(raw_obs)
+        # grab the "main" camera
+        return formatted["pixels"]["image"]
+
 
     def _make_envs_task(self, task_suite, task_id: int = 0):
         task = task_suite.get_task(task_id)
diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py
index 2cf9efcfe..cd76230d1 100644
--- a/src/lerobot/envs/utils.py
+++ b/src/lerobot/envs/utils.py
@@ -26,61 +26,63 @@ from lerobot.configs.types import FeatureType, PolicyFeature
 from lerobot.envs.configs import EnvConfig
 from lerobot.utils.utils import get_channel_first_image_shape
 
-
-def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Tensor]:
-    # TODO(aliberts, rcadene): refactor this to use features from the environment (no hardcoding)
+def preprocess_observation(
+    observations: dict[str, np.ndarray], cfg: dict[str, Any] = None
+) -> dict[str, Tensor]:
     """Convert environment observation to LeRobot format observation.
     Args:
-        observation: Dictionary of observation batches from a Gym vector environment.
+        observations: Dictionary of observation batches from a Gym vector environment.
+        cfg: Policy config containing expected feature keys.
     Returns:
-        Dictionary of observation batches with keys renamed to LeRobot format and values as tensors.
+        Dictionary of observation batches with keys renamed to match policy expectations.
     """
-    # map to expected inputs for the policy
     return_observations = {}
+
+    # expected keys from policy
+    policy_img_keys = list(cfg.image_features.keys()) if cfg else ["observation.image"]
+    state_key = cfg.robot_state_feature_key if cfg else "observation.state"
+
+    # handle images
     if "pixels" in observations:
         if isinstance(observations["pixels"], dict):
-            imgs = {f"observation.images.{key}": img for key, img in observations["pixels"].items()}
+            env_img_keys = list(observations["pixels"].keys())
+            imgs = observations["pixels"]
         else:
-            imgs = {"observation.image": observations["pixels"]}
+            env_img_keys = ["pixels"]
+            imgs = {"pixels": observations["pixels"]}
+
+        # build rename map env_key -> policy_key
+        rename_map = dict(zip(env_img_keys, policy_img_keys))
 
         for imgkey, img in imgs.items():
-            # TODO(aliberts, rcadene): use transforms.ToTensor()?
+            target_key = rename_map.get(imgkey, imgkey)
+
             img = torch.from_numpy(img)
 
-            # When preprocessing observations in a non-vectorized environment, we need to add a batch dimension.
-            # This is the case for human-in-the-loop RL where there is only one environment.
-            if img.ndim == 3:
-                img = img.unsqueeze(0)
-            # sanity check that images are channel last
+            # sanity checks
             _, h, w, c = img.shape
-            assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}"
+            assert c < h and c < w, f"expect channel last images, got {img.shape=}"
+            assert img.dtype == torch.uint8, f"expect torch.uint8, got {img.dtype=}"
 
-            # sanity check that images are uint8
-            assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}"
-
-            # convert to channel first of type float32 in range [0,1]
+            # channel last → channel first, normalize
             img = einops.rearrange(img, "b h w c -> b c h w").contiguous()
-            img = img.type(torch.float32)
-            img /= 255
+            img = img.float() / 255.0
 
-            return_observations[imgkey] = img
+            return_observations[target_key] = img
 
+    # handle state
     if "environment_state" in observations:
-        env_state = torch.from_numpy(observations["environment_state"]).float()
-        if env_state.dim() == 1:
-            env_state = env_state.unsqueeze(0)
+        return_observations["observation.environment_state"] = torch.from_numpy(
+            observations["environment_state"]
+        ).float()
 
-        return_observations["observation.environment_state"] = env_state
+    return_observations[state_key] = torch.from_numpy(observations["agent_pos"]).float()
 
-    # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing
-    agent_pos = torch.from_numpy(observations["agent_pos"]).float()
-    if agent_pos.dim() == 1:
-        agent_pos = agent_pos.unsqueeze(0)
-    return_observations["observation.state"] = agent_pos
+    if "task" in observations:
+        return_observations["task"] = observations["task"]
 
     return return_observations
 
-
 def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
     # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is
     # (need to also refactor preprocess_observation and externalize normalization from policies)
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index cab752112..4b8eeffd1 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -168,7 +168,6 @@ def make_policy(
     else:
         # Make a fresh policy.
         policy = policy_cls(**kwargs)
-
     policy.to(cfg.device)
     assert isinstance(policy, nn.Module)
 
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 7e086821f..37ef6348f 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -146,7 +146,8 @@ def rollout(
     check_env_attributes_and_types(env)
     while not np.all(done) and step < max_steps:
         # Numpy array to tensor and changing dictionary keys to LeRobot policy format.
-        observation = preprocess_observation(observation)
+        # observation = preprocess_observation(observation)
+        observation = preprocess_observation(observation, cfg=policy.config)
         if return_observations:
             all_observations.append(deepcopy(observation))
 
@@ -159,7 +160,6 @@ def rollout(
         observation = add_envs_task(env, observation)
         with torch.inference_mode():
             action = policy.select_action(observation)
-        observation["observation.images.image"]
         # Convert to CPU / numpy.
         action = action.to("cpu").numpy()
         assert action.ndim == 2, "Action dimensions should be (batch, action_dim)"
@@ -198,7 +198,7 @@ def rollout(
 
     # Track the final observation.
     if return_observations:
-        observation = preprocess_observation(observation)
+        observation = preprocess_observation(observation, cfg=policy.config)
         all_observations.append(deepcopy(observation))
 
     # Stack the sequence along the first dimension so that we have (batch, sequence, *) tensors.
diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py
index 6c07a1ae8..82f78e595 100644
--- a/src/lerobot/scripts/train.py
+++ b/src/lerobot/scripts/train.py
@@ -269,6 +269,7 @@ def train(cfg: TrainPipelineConfig):
                             continue  # Skip the overall stats since we already printed it
                         print(f"\nAggregated Metrics for {task_group}:")
                         print(task_group_info["aggregated"])
+                        breakpoint()
                 else:
                     print("START EVAL")
                     eval_info = eval_policy(
@@ -279,6 +280,7 @@ def train(cfg: TrainPipelineConfig):
                         max_episodes_rendered=4,
                         start_seed=cfg.seed,
                     )
+                    breakpoint()
                     aggregated = eval_info["aggregated"]
                     print("END EVAL")