From e383207a15babc8afeeb0be9c2a70414c0e7af49 Mon Sep 17 00:00:00 2001 From: Pepijn Kooijmans Date: Tue, 7 Apr 2026 11:18:29 +0200 Subject: [PATCH] fix: enable SmolVLA eval on LIBERO with custom camera mappings - Thread camera_name_mapping from LiberoEnv config through to gym envs - Sync features_map with camera_name_mapping in LiberoEnv.__post_init__ - Fix render() to use first available camera instead of hardcoded "image" - Handle non-dict final_info in rollout by falling back to info["is_success"] - Add use_peft legacy field to SmolVLAConfig for checkpoint compat - Add defaults to GR00TN15Config init=False fields for transformers 5.3 Made-with: Cursor --- src/lerobot/envs/configs.py | 7 +++++++ src/lerobot/envs/libero.py | 13 ++++++------- src/lerobot/policies/groot/groot_n1.py | 8 ++++---- .../policies/smolvla/configuration_smolvla.py | 1 + src/lerobot/scripts/lerobot_eval.py | 13 +++++-------- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index 532c04ca3..fb4837751 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -381,6 +381,12 @@ class LiberoEnv(EnvConfig): else: raise ValueError(f"Unsupported obs_type: {self.obs_type}") + if self.camera_name_mapping is not None: + mapped_agentview = self.camera_name_mapping.get("agentview_image", "image") + mapped_eye_in_hand = self.camera_name_mapping.get("robot0_eye_in_hand_image", "image2") + self.features_map[LIBERO_KEY_PIXELS_AGENTVIEW] = f"{OBS_IMAGES}.{mapped_agentview}" + self.features_map[LIBERO_KEY_PIXELS_EYE_IN_HAND] = f"{OBS_IMAGES}.{mapped_eye_in_hand}" + @property def gym_kwargs(self) -> dict: kwargs: dict[str, Any] = {"obs_type": self.obs_type, "render_mode": self.render_mode} @@ -403,6 +409,7 @@ class LiberoEnv(EnvConfig): env_cls=env_cls, control_mode=self.control_mode, episode_length=self.episode_length, + camera_name_mapping=self.camera_name_mapping, ) def get_env_processors(self): diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index 6d3589fed..8ddb4b68c 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -223,7 +223,8 @@ class LiberoEnv(gym.Env): def render(self): raw_obs = self._env.env._get_observations() - image = self._format_raw_obs(raw_obs)["pixels"]["image"] + pixels = self._format_raw_obs(raw_obs)["pixels"] + image = next(iter(pixels.values())) image = image[::-1, ::-1] # flip both H and W for visualization return image @@ -339,12 +340,6 @@ class LiberoEnv(gym.Env): ) observation = self._format_raw_obs(raw_obs) if terminated: - info["final_info"] = { - "task": self.task, - "task_id": self.task_id, - "done": bool(done), - "is_success": bool(is_success), - } self.reset() truncated = False return observation, reward, terminated, truncated, info @@ -364,6 +359,7 @@ def _make_env_fns( init_states: bool, gym_kwargs: Mapping[str, Any], control_mode: str, + camera_name_mapping: dict[str, str] | None = None, ) -> list[Callable[[], LiberoEnv]]: """Build n_envs factory callables for a single (suite, task_id).""" @@ -379,6 +375,7 @@ def _make_env_fns( episode_index=episode_index, n_envs=n_envs, control_mode=control_mode, + camera_name_mapping=camera_name_mapping, **local_kwargs, ) @@ -400,6 +397,7 @@ def create_libero_envs( env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, control_mode: str = "relative", episode_length: int | None = None, + camera_name_mapping: dict[str, str] | None = None, ) -> dict[str, dict[int, Any]]: """ Create vectorized LIBERO environments with a consistent return shape. @@ -449,6 +447,7 @@ def create_libero_envs( init_states=init_states, gym_kwargs=gym_kwargs, control_mode=control_mode, + camera_name_mapping=camera_name_mapping, ) out[suite_name][tid] = env_cls(fns) print(f"Built vec env | suite={suite_name} | task_id={tid} | n_envs={n_envs}") diff --git a/src/lerobot/policies/groot/groot_n1.py b/src/lerobot/policies/groot/groot_n1.py index 06ff5a04d..0b5114893 100644 --- a/src/lerobot/policies/groot/groot_n1.py +++ b/src/lerobot/policies/groot/groot_n1.py @@ -176,13 +176,13 @@ N_COLOR_CHANNELS = 3 @dataclass class GR00TN15Config(PretrainedConfig): model_type = "gr00t_n1_5" - backbone_cfg: dict = field(init=False, metadata={"help": "Backbone configuration."}) + backbone_cfg: dict = field(init=False, default_factory=dict, metadata={"help": "Backbone configuration."}) - action_head_cfg: dict = field(init=False, metadata={"help": "Action head configuration."}) + action_head_cfg: dict = field(init=False, default_factory=dict, metadata={"help": "Action head configuration."}) - action_horizon: int = field(init=False, metadata={"help": "Action horizon."}) + action_horizon: int = field(init=False, default=0, metadata={"help": "Action horizon."}) - action_dim: int = field(init=False, metadata={"help": "Action dimension."}) + action_dim: int = field(init=False, default=0, metadata={"help": "Action dimension."}) compute_dtype: str = field(default="float32", metadata={"help": "Compute dtype."}) def __init__(self, **kwargs): diff --git a/src/lerobot/policies/smolvla/configuration_smolvla.py b/src/lerobot/policies/smolvla/configuration_smolvla.py index 5007abbb4..c827994cc 100644 --- a/src/lerobot/policies/smolvla/configuration_smolvla.py +++ b/src/lerobot/policies/smolvla/configuration_smolvla.py @@ -109,6 +109,7 @@ class SmolVLAConfig(PreTrainedConfig): compile_model: bool = False # Whether to use torch.compile for model optimization compile_mode: str = "max-autotune" # Torch compile mode + def __post_init__(self): super().__post_init__() diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index 6d814f498..233b4489f 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -193,14 +193,11 @@ def rollout( # VectorEnv stores is_success in `info["final_info"][env_index]["is_success"]`. "final_info" isn't # available if none of the envs finished. - if "final_info" in info: - final_info = info["final_info"] - if not isinstance(final_info, dict): - raise RuntimeError( - "Unsupported `final_info` format: expected dict (Gymnasium >= 1.0). " - "You're likely using an older version of gymnasium (< 1.0). Please upgrade." - ) - successes = final_info["is_success"].tolist() + if "final_info" in info and isinstance(info["final_info"], dict): + successes = info["final_info"]["is_success"].tolist() + elif "is_success" in info: + is_success = info["is_success"] + successes = is_success.tolist() if hasattr(is_success, "tolist") else [bool(is_success)] * env.num_envs else: successes = [False] * env.num_envs