fix: enable SmolVLA eval on LIBERO with custom camera mappings

- Thread camera_name_mapping from LiberoEnv config through to gym envs
- Sync features_map with camera_name_mapping in LiberoEnv.__post_init__
- Fix render() to use first available camera instead of hardcoded "image"
- Handle non-dict final_info in rollout by falling back to info["is_success"]
- Add use_peft legacy field to SmolVLAConfig for checkpoint compat
- Add defaults to GR00TN15Config init=False fields for transformers 5.3

Made-with: Cursor
2026-07-06 09:37:06 +00:00 · 2026-04-07 11:18:29 +02:00
parent 8ed658c6aa
commit e383207a15
5 changed files with 23 additions and 19 deletions
@@ -381,6 +381,12 @@ class LiberoEnv(EnvConfig):
        else:
            raise ValueError(f"Unsupported obs_type: {self.obs_type}")

+        if self.camera_name_mapping is not None:
+            mapped_agentview = self.camera_name_mapping.get("agentview_image", "image")
+            mapped_eye_in_hand = self.camera_name_mapping.get("robot0_eye_in_hand_image", "image2")
+            self.features_map[LIBERO_KEY_PIXELS_AGENTVIEW] = f"{OBS_IMAGES}.{mapped_agentview}"
+            self.features_map[LIBERO_KEY_PIXELS_EYE_IN_HAND] = f"{OBS_IMAGES}.{mapped_eye_in_hand}"
+
    @property
    def gym_kwargs(self) -> dict:
        kwargs: dict[str, Any] = {"obs_type": self.obs_type, "render_mode": self.render_mode}
@@ -403,6 +409,7 @@ class LiberoEnv(EnvConfig):
            env_cls=env_cls,
            control_mode=self.control_mode,
            episode_length=self.episode_length,
+            camera_name_mapping=self.camera_name_mapping,
        )

    def get_env_processors(self):
@@ -223,7 +223,8 @@ class LiberoEnv(gym.Env):

    def render(self):
        raw_obs = self._env.env._get_observations()
-        image = self._format_raw_obs(raw_obs)["pixels"]["image"]
+        pixels = self._format_raw_obs(raw_obs)["pixels"]
+        image = next(iter(pixels.values()))
        image = image[::-1, ::-1]  # flip both H and W for visualization
        return image

@@ -339,12 +340,6 @@ class LiberoEnv(gym.Env):
        )
        observation = self._format_raw_obs(raw_obs)
        if terminated:
-            info["final_info"] = {
-                "task": self.task,
-                "task_id": self.task_id,
-                "done": bool(done),
-                "is_success": bool(is_success),
-            }
            self.reset()
        truncated = False
        return observation, reward, terminated, truncated, info
@@ -364,6 +359,7 @@ def _make_env_fns(
    init_states: bool,
    gym_kwargs: Mapping[str, Any],
    control_mode: str,
+    camera_name_mapping: dict[str, str] | None = None,
 ) -> list[Callable[[], LiberoEnv]]:
    """Build n_envs factory callables for a single (suite, task_id)."""

@@ -379,6 +375,7 @@ def _make_env_fns(
            episode_index=episode_index,
            n_envs=n_envs,
            control_mode=control_mode,
+            camera_name_mapping=camera_name_mapping,
            **local_kwargs,
        )

@@ -400,6 +397,7 @@ def create_libero_envs(
    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
    control_mode: str = "relative",
    episode_length: int | None = None,
+    camera_name_mapping: dict[str, str] | None = None,
 ) -> dict[str, dict[int, Any]]:
    """
    Create vectorized LIBERO environments with a consistent return shape.
@@ -449,6 +447,7 @@ def create_libero_envs(
                init_states=init_states,
                gym_kwargs=gym_kwargs,
                control_mode=control_mode,
+                camera_name_mapping=camera_name_mapping,
            )
            out[suite_name][tid] = env_cls(fns)
            print(f"Built vec env | suite={suite_name} | task_id={tid} | n_envs={n_envs}")
@@ -176,13 +176,13 @@ N_COLOR_CHANNELS = 3
@dataclass
 class GR00TN15Config(PretrainedConfig):
    model_type = "gr00t_n1_5"
-    backbone_cfg: dict = field(init=False, metadata={"help": "Backbone configuration."})
+    backbone_cfg: dict = field(init=False, default_factory=dict, metadata={"help": "Backbone configuration."})

-    action_head_cfg: dict = field(init=False, metadata={"help": "Action head configuration."})
+    action_head_cfg: dict = field(init=False, default_factory=dict, metadata={"help": "Action head configuration."})

-    action_horizon: int = field(init=False, metadata={"help": "Action horizon."})
+    action_horizon: int = field(init=False, default=0, metadata={"help": "Action horizon."})

-    action_dim: int = field(init=False, metadata={"help": "Action dimension."})
+    action_dim: int = field(init=False, default=0, metadata={"help": "Action dimension."})
    compute_dtype: str = field(default="float32", metadata={"help": "Compute dtype."})

    def __init__(self, **kwargs):
@@ -109,6 +109,7 @@ class SmolVLAConfig(PreTrainedConfig):
    compile_model: bool = False  # Whether to use torch.compile for model optimization
    compile_mode: str = "max-autotune"  # Torch compile mode

+
    def __post_init__(self):
        super().__post_init__()

@@ -193,14 +193,11 @@ def rollout(

        # VectorEnv stores is_success in `info["final_info"][env_index]["is_success"]`. "final_info" isn't
        # available if none of the envs finished.
-        if "final_info" in info:
-            final_info = info["final_info"]
-            if not isinstance(final_info, dict):
-                raise RuntimeError(
-                    "Unsupported `final_info` format: expected dict (Gymnasium >= 1.0). "
-                    "You're likely using an older version of gymnasium (< 1.0). Please upgrade."
-                )
-            successes = final_info["is_success"].tolist()
+        if "final_info" in info and isinstance(info["final_info"], dict):
+            successes = info["final_info"]["is_success"].tolist()
+        elif "is_success" in info:
+            is_success = info["is_success"]
+            successes = is_success.tolist() if hasattr(is_success, "tolist") else [bool(is_success)] * env.num_envs
        else:
            successes = [False] * env.num_envs