fix: integrate PR #3315 review feedback

- envs(robotwin): default `observation_height/width` in `create_robotwin_envs` to `DEFAULT_CAMERA_H/W` (240/320) so they match the D435 dims baked into `task_config/demo_clean.yml`. - envs(robotwin): resolve `task_config/demo_clean.yml` via `CONFIGS_PATH` instead of a cwd-relative path; works regardless of where `lerobot-eval` is invoked. - envs(robotwin): replace `print()` calls in `create_robotwin_envs` with `logger.info(...)` (module-level `logger = logging.getLogger`). - envs(robotwin): use `_LazyAsyncVectorEnv` for the async path so async workers start lazily (matches LIBERO / RoboCasa / VLABench). - envs(robotwin): cast `agent_pos` space + joint-state output to float32 end-to-end (was mixed float64/float32). - envs(configs): use the existing `_make_vec_env_cls(use_async, n_envs)` helper in `RoboTwinEnvConfig.create_envs`; drop the `get_env_processors` override so RoboTwin uses the identity processor inherited from `EnvConfig`. - processor: delete `RoboTwinProcessorStep` — the float32 cast now happens in the wrapper itself, so the processor is redundant. - tests: drop the `TestRoboTwinProcessorStep` suite; update the mock obs fixture to use float32 `joint_action.vector`. - ci: hoist `ROBOTWIN_POLICY` and `ROBOTWIN_TASKS` to job-level env vars so the task list and policy aren't duplicated across eval / extract / parse steps. - docker: pin RoboTwin + CuRobo upstream clones to commit SHAs (`RoboTwin@0aeea2d6`, `curobo@ca941586`) for reproducibility. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 12:40:08 +00:00 · 2026-04-20 15:18:41 +02:00
parent 0fed8b45c2
commit 1157fb11e6
6 changed files with 52 additions and 87 deletions
@@ -328,6 +328,8 @@ jobs:
      group: aws-g6-4xlarge-plus
    env:
      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      ROBOTWIN_POLICY: lerobot/smolvla_robotwin
+      ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
@@ -371,13 +373,15 @@ jobs:
            --shm-size=4g \
            -e HF_HOME=/tmp/hf \
            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
+            -e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
            lerobot-benchmark-robotwin:ci \
            bash -c "
              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
              cd /opt/robotwin && lerobot-eval \
-                --policy.path=lerobot/smolvla_robotwin \
+                --policy.path=\"\$ROBOTWIN_POLICY\" \
                --env.type=robotwin \
-                --env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch \
+                --env.task=\"\$ROBOTWIN_TASKS\" \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -386,7 +390,7 @@ jobs:
                --output_dir=/tmp/eval-artifacts
              python /lerobot/scripts/ci/extract_task_descriptions.py \
                --env robotwin \
-                --task beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch \
+                --task \"\$ROBOTWIN_TASKS\" \
                --output /tmp/eval-artifacts/task_descriptions.json
            "

@@ -403,8 +407,8 @@ jobs:
          python3 scripts/ci/parse_eval_metrics.py \
            --artifacts-dir /tmp/robotwin-artifacts \
            --env robotwin \
-            --task beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch \
-            --policy lerobot/smolvla_robotwin
+            --task "${ROBOTWIN_TASKS}" \
+            --policy "${ROBOTWIN_POLICY}"

      - name: Upload RoboTwin rollout video
        if: always()
@@ -30,6 +30,9 @@ ENV NVIDIA_DRIVER_CAPABILITIES=all \
 # The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
 # `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
 USER root
+# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
+# an upstream fix; don't rely on `main` drift.
+ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
 RUN apt-get update \
    && apt-get install -y --no-install-recommends \
         cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
@@ -37,7 +40,8 @@ RUN apt-get update \
    && mkdir -p /usr/share/vulkan/icd.d \
    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
       > /usr/share/vulkan/icd.d/nvidia_icd.json \
-    && git clone --depth=1 https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
    && chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
 USER user_lerobot
@@ -52,9 +56,11 @@ RUN uv pip install --no-cache --no-build-isolation \
        "git+https://github.com/facebookresearch/pytorch3d.git@stable"

 # CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
-# build aborts on an empty arch list.
+# build aborts on an empty arch list. Pinned SHA for reproducibility.
+ARG CUROBO_SHA=ca941586c33b8482ed9c0e74d60f23efd64b516a
 RUN cd ${ROBOTWIN_ROOT}/envs \
-    && git clone --depth=1 https://github.com/NVlabs/curobo.git \
+    && git clone https://github.com/NVlabs/curobo.git \
+    && git -C curobo checkout ${CUROBO_SHA} \
    && cd curobo \
    && TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
       uv pip install -e . --no-build-isolation --no-cache
@@ -650,7 +650,7 @@ class RoboTwinEnvConfig(EnvConfig):
        if not self.task:
            raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")

-        env_cls = gym.vector.AsyncVectorEnv if (use_async_envs and n_envs > 1) else gym.vector.SyncVectorEnv
+        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
        cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
        return create_robotwin_envs(
            task=self.task,
@@ -661,12 +661,3 @@ class RoboTwinEnvConfig(EnvConfig):
            observation_width=self.observation_width,
            episode_length=self.episode_length,
        )
-
-    def get_env_processors(self):
-        from lerobot.processor.env_processor import RoboTwinProcessorStep
-        from lerobot.processor.pipeline import PolicyProcessorPipeline
-
-        return (
-            PolicyProcessorPipeline(steps=[RoboTwinProcessorStep()]),
-            PolicyProcessorPipeline(steps=[]),
-        )
@@ -16,6 +16,7 @@
 from __future__ import annotations

 import importlib
+import logging
 from collections import defaultdict
 from collections.abc import Callable, Sequence
 from functools import partial
@@ -28,6 +29,10 @@ from gymnasium import spaces

 from lerobot.types import RobotObservation

+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
 # Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking
 # up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb").
 ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
@@ -126,7 +131,7 @@ def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
    from envs import CONFIGS_PATH  # type: ignore[import-not-found]

    task_config = "demo_clean"
-    with open(f"./task_config/{task_config}.yml", encoding="utf-8") as f:
+    with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f:
        args = yaml.safe_load(f)

    # Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
@@ -262,7 +267,7 @@ class RoboTwinEnv(gym.Env):
        self.observation_space = spaces.Dict(
            {
                "pixels": spaces.Dict(image_spaces),
-                "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float64),
+                "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
            }
        )
        self.action_space = spaces.Box(
@@ -303,12 +308,12 @@ class RoboTwinEnv(gym.Env):
        ja = raw.get("joint_action") or {}
        vec = ja.get("vector")
        if vec is not None:
-            arr = np.asarray(vec, dtype=np.float64).ravel()
+            arr = np.asarray(vec, dtype=np.float32).ravel()
            joint_state = (
-                arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float64)
+                arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
            )
        else:
-            joint_state = np.zeros(ACTION_DIM, dtype=np.float64)
+            joint_state = np.zeros(ACTION_DIM, dtype=np.float32)

        return {"pixels": images, "agent_pos": joint_state}

@@ -415,8 +420,8 @@ def create_robotwin_envs(
    n_envs: int,
    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
    camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
-    observation_height: int = 480,
-    observation_width: int = 640,
+    observation_height: int = DEFAULT_CAMERA_H,
+    observation_width: int = DEFAULT_CAMERA_W,
    episode_length: int = DEFAULT_EPISODE_LENGTH,
 ) -> dict[str, dict[int, Any]]:
    """Create vectorized RoboTwin 2.0 environments.
@@ -448,7 +453,16 @@ def create_robotwin_envs(
    if unknown:
        raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")

-    print(f"Creating RoboTwin envs | tasks={task_names} | n_envs(per task)={n_envs}")
+    logger.info(
+        "Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d",
+        task_names,
+        n_envs,
+    )
+
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None

    out: dict[str, dict[int, Any]] = defaultdict(dict)
    for task_name in task_names:
@@ -460,7 +474,15 @@ def create_robotwin_envs(
            observation_width=observation_width,
            episode_length=episode_length,
        )
-        out[task_name][0] = env_cls(fns)
-        print(f"Built vec env | task={task_name} | n_envs={n_envs}")
+        if is_async:
+            lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+            if cached_obs_space is None:
+                cached_obs_space = lazy.observation_space
+                cached_act_space = lazy.action_space
+                cached_metadata = lazy.metadata
+            out[task_name][0] = lazy
+        else:
+            out[task_name][0] = env_cls(fns)
+        logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)

    return {k: dict(v) for k, v in out.items()}
@@ -226,29 +226,3 @@ class IsaaclabArenaProcessorStep(ObservationProcessorStep):

    def observation(self, observation):
        return self._process_observation(observation)
-
-
-@dataclass
-@ProcessorStepRegistry.register(name="robotwin_processor")
-class RoboTwinProcessorStep(ObservationProcessorStep):
-    """Passthrough step for RoboTwin observations, casting state to float32.
-
-    RoboTwin observations already arrive in LeRobot format (observation.images.*
-    and observation.state), so this step mainly ensures state dtype is float32.
-    """
-
-    def _process_observation(self, observation):
-        processed_obs = dict(observation)
-        if OBS_STATE in processed_obs:
-            state = processed_obs[OBS_STATE]
-            if hasattr(state, "dtype") and state.dtype != torch.float32:
-                processed_obs[OBS_STATE] = state.float()
-        return processed_obs
-
-    def transform_features(
-        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
-    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
-        return features
-
-    def observation(self, observation):
-        return self._process_observation(observation)
@@ -28,7 +28,6 @@ from unittest.mock import MagicMock, patch
 import gymnasium as gym
 import numpy as np
 import pytest
-import torch

 from lerobot.envs.robotwin import (
    ACTION_DIM,
@@ -56,7 +55,7 @@ def _make_mock_task_env(
    """
    obs_dict = {
        "observation": {cam: {"rgb": np.zeros((height, width, 3), dtype=np.uint8)} for cam in cameras},
-        "joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float64)},
+        "joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float32)},
        "endpose": {},
    }

@@ -281,34 +280,3 @@ def test_all_tasks_are_strings():

 def test_no_duplicate_tasks():
    assert len(ROBOTWIN_TASKS) == len(set(ROBOTWIN_TASKS))
-
-
-# ---------------------------------------------------------------------------
-# RoboTwinProcessorStep
-# ---------------------------------------------------------------------------
-
-
-class TestRoboTwinProcessorStep:
-    def test_passes_through_images_and_state(self):
-        from lerobot.processor.env_processor import RoboTwinProcessorStep
-        from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
-
-        step = RoboTwinProcessorStep()
-        obs = {
-            f"{OBS_IMAGES}.head_camera": torch.zeros(1, 3, 4, 4),
-            f"{OBS_IMAGES}.left_camera": torch.zeros(1, 3, 4, 4),
-            OBS_STATE: torch.zeros(1, 14),
-        }
-        result = step.observation(obs)
-        assert f"{OBS_IMAGES}.head_camera" in result
-        assert f"{OBS_IMAGES}.left_camera" in result
-        assert result[OBS_STATE].dtype == torch.float32
-
-    def test_state_cast_to_float32(self):
-        from lerobot.processor.env_processor import RoboTwinProcessorStep
-        from lerobot.utils.constants import OBS_STATE
-
-        step = RoboTwinProcessorStep()
-        obs = {OBS_STATE: torch.zeros(1, 14, dtype=torch.float64)}
-        result = step.observation(obs)
-        assert result[OBS_STATE].dtype == torch.float32