fix: enable SmolVLA eval on LIBERO with custom camera mappings

- Thread camera_name_mapping from LiberoEnv config through to gym envs
- Sync features_map with camera_name_mapping in LiberoEnv.__post_init__
- Fix render() to use first available camera instead of hardcoded "image"
- Handle non-dict final_info in rollout by falling back to info["is_success"]
- Add use_peft legacy field to SmolVLAConfig for checkpoint compat
- Add defaults to GR00TN15Config init=False fields for transformers 5.3

Made-with: Cursor
This commit is contained in:
Pepijn Kooijmans
2026-04-07 11:18:29 +02:00
parent 8ed658c6aa
commit e383207a15
5 changed files with 23 additions and 19 deletions
+7
View File
@@ -381,6 +381,12 @@ class LiberoEnv(EnvConfig):
else:
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
if self.camera_name_mapping is not None:
mapped_agentview = self.camera_name_mapping.get("agentview_image", "image")
mapped_eye_in_hand = self.camera_name_mapping.get("robot0_eye_in_hand_image", "image2")
self.features_map[LIBERO_KEY_PIXELS_AGENTVIEW] = f"{OBS_IMAGES}.{mapped_agentview}"
self.features_map[LIBERO_KEY_PIXELS_EYE_IN_HAND] = f"{OBS_IMAGES}.{mapped_eye_in_hand}"
@property
def gym_kwargs(self) -> dict:
kwargs: dict[str, Any] = {"obs_type": self.obs_type, "render_mode": self.render_mode}
@@ -403,6 +409,7 @@ class LiberoEnv(EnvConfig):
env_cls=env_cls,
control_mode=self.control_mode,
episode_length=self.episode_length,
camera_name_mapping=self.camera_name_mapping,
)
def get_env_processors(self):
+6 -7
View File
@@ -223,7 +223,8 @@ class LiberoEnv(gym.Env):
def render(self):
raw_obs = self._env.env._get_observations()
image = self._format_raw_obs(raw_obs)["pixels"]["image"]
pixels = self._format_raw_obs(raw_obs)["pixels"]
image = next(iter(pixels.values()))
image = image[::-1, ::-1] # flip both H and W for visualization
return image
@@ -339,12 +340,6 @@ class LiberoEnv(gym.Env):
)
observation = self._format_raw_obs(raw_obs)
if terminated:
info["final_info"] = {
"task": self.task,
"task_id": self.task_id,
"done": bool(done),
"is_success": bool(is_success),
}
self.reset()
truncated = False
return observation, reward, terminated, truncated, info
@@ -364,6 +359,7 @@ def _make_env_fns(
init_states: bool,
gym_kwargs: Mapping[str, Any],
control_mode: str,
camera_name_mapping: dict[str, str] | None = None,
) -> list[Callable[[], LiberoEnv]]:
"""Build n_envs factory callables for a single (suite, task_id)."""
@@ -379,6 +375,7 @@ def _make_env_fns(
episode_index=episode_index,
n_envs=n_envs,
control_mode=control_mode,
camera_name_mapping=camera_name_mapping,
**local_kwargs,
)
@@ -400,6 +397,7 @@ def create_libero_envs(
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
control_mode: str = "relative",
episode_length: int | None = None,
camera_name_mapping: dict[str, str] | None = None,
) -> dict[str, dict[int, Any]]:
"""
Create vectorized LIBERO environments with a consistent return shape.
@@ -449,6 +447,7 @@ def create_libero_envs(
init_states=init_states,
gym_kwargs=gym_kwargs,
control_mode=control_mode,
camera_name_mapping=camera_name_mapping,
)
out[suite_name][tid] = env_cls(fns)
print(f"Built vec env | suite={suite_name} | task_id={tid} | n_envs={n_envs}")
+4 -4
View File
@@ -176,13 +176,13 @@ N_COLOR_CHANNELS = 3
@dataclass
class GR00TN15Config(PretrainedConfig):
model_type = "gr00t_n1_5"
backbone_cfg: dict = field(init=False, metadata={"help": "Backbone configuration."})
backbone_cfg: dict = field(init=False, default_factory=dict, metadata={"help": "Backbone configuration."})
action_head_cfg: dict = field(init=False, metadata={"help": "Action head configuration."})
action_head_cfg: dict = field(init=False, default_factory=dict, metadata={"help": "Action head configuration."})
action_horizon: int = field(init=False, metadata={"help": "Action horizon."})
action_horizon: int = field(init=False, default=0, metadata={"help": "Action horizon."})
action_dim: int = field(init=False, metadata={"help": "Action dimension."})
action_dim: int = field(init=False, default=0, metadata={"help": "Action dimension."})
compute_dtype: str = field(default="float32", metadata={"help": "Compute dtype."})
def __init__(self, **kwargs):
@@ -109,6 +109,7 @@ class SmolVLAConfig(PreTrainedConfig):
compile_model: bool = False # Whether to use torch.compile for model optimization
compile_mode: str = "max-autotune" # Torch compile mode
def __post_init__(self):
super().__post_init__()
+5 -8
View File
@@ -193,14 +193,11 @@ def rollout(
# VectorEnv stores is_success in `info["final_info"][env_index]["is_success"]`. "final_info" isn't
# available if none of the envs finished.
if "final_info" in info:
final_info = info["final_info"]
if not isinstance(final_info, dict):
raise RuntimeError(
"Unsupported `final_info` format: expected dict (Gymnasium >= 1.0). "
"You're likely using an older version of gymnasium (< 1.0). Please upgrade."
)
successes = final_info["is_success"].tolist()
if "final_info" in info and isinstance(info["final_info"], dict):
successes = info["final_info"]["is_success"].tolist()
elif "is_success" in info:
is_success = info["is_success"]
successes = is_success.tolist() if hasattr(is_success, "tolist") else [bool(is_success)] * env.num_envs
else:
successes = [False] * env.num_envs