From 1157fb11e6ed24ed4f98130e8a4083668fe2cb33 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Mon, 20 Apr 2026 15:18:41 +0200
Subject: [PATCH] fix: integrate PR #3315 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- envs(robotwin): default `observation_height/width` in
  `create_robotwin_envs` to `DEFAULT_CAMERA_H/W` (240/320) so they
  match the D435 dims baked into `task_config/demo_clean.yml`.
- envs(robotwin): resolve `task_config/demo_clean.yml` via
  `CONFIGS_PATH` instead of a cwd-relative path; works regardless
  of where `lerobot-eval` is invoked.
- envs(robotwin): replace `print()` calls in `create_robotwin_envs`
  with `logger.info(...)` on a module-level
  `logger = logging.getLogger(__name__)`.
- envs(robotwin): use `_LazyAsyncVectorEnv` for the async path so
  async workers start lazily (matches LIBERO / RoboCasa / VLABench).
- envs(robotwin): cast `agent_pos` space + joint-state output to
  float32 end-to-end (was mixed float64/float32).
- envs(configs): use the existing `_make_vec_env_cls(use_async,
  n_envs)` helper in `RoboTwinEnvConfig.create_envs`; drop the
  `get_env_processors` override so RoboTwin uses the identity
  processor inherited from `EnvConfig`.
- processor: delete `RoboTwinProcessorStep` — the float32 cast now
  happens in the wrapper itself, so the processor is redundant.
- tests: drop the `TestRoboTwinProcessorStep` suite; update the
  mock obs fixture to use float32 `joint_action.vector`.
- ci: hoist `ROBOTWIN_POLICY` and `ROBOTWIN_TASKS` to job-level
  env vars so the task list and policy aren't duplicated across
  eval / extract / parse steps.
- docker: pin RoboTwin + CuRobo upstream clones to commit SHAs
  (`RoboTwin@0aeea2d6`, `curobo@ca941586`) for reproducibility;
  the clones drop `--depth=1` and `git checkout` the pinned SHA.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark_tests.yml  | 14 ++++++---
 docker/Dockerfile.benchmark.robotwin   | 12 ++++++--
 src/lerobot/envs/configs.py            | 11 +------
 src/lerobot/envs/robotwin.py           | 42 ++++++++++++++++++++------
 src/lerobot/processor/env_processor.py | 26 ----------------
 tests/envs/test_robotwin.py            | 34 +--------------------
 6 files changed, 52 insertions(+), 87 deletions(-)

diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
index 9b11c8c19..70258c7f6 100644
--- a/.github/workflows/benchmark_tests.yml
+++ b/.github/workflows/benchmark_tests.yml
@@ -328,6 +328,8 @@ jobs:
       group: aws-g6-4xlarge-plus
     env:
       HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+      ROBOTWIN_POLICY: lerobot/smolvla_robotwin
+      ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch
 
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
@@ -371,13 +373,15 @@ jobs:
             --shm-size=4g \
             -e HF_HOME=/tmp/hf \
             -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
+            -e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
             lerobot-benchmark-robotwin:ci \
             bash -c "
               hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
               cd /opt/robotwin && lerobot-eval \
-                --policy.path=lerobot/smolvla_robotwin \
+                --policy.path=\"\$ROBOTWIN_POLICY\" \
                 --env.type=robotwin \
-                --env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch \
+                --env.task=\"\$ROBOTWIN_TASKS\" \
                 --eval.batch_size=1 \
                 --eval.n_episodes=1 \
                 --eval.use_async_envs=false \
@@ -386,7 +390,7 @@ jobs:
                 --output_dir=/tmp/eval-artifacts
               python /lerobot/scripts/ci/extract_task_descriptions.py \
                 --env robotwin \
-                --task beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch \
+                --task \"\$ROBOTWIN_TASKS\" \
                 --output /tmp/eval-artifacts/task_descriptions.json
             "
 
@@ -403,8 +407,8 @@ jobs:
           python3 scripts/ci/parse_eval_metrics.py \
             --artifacts-dir /tmp/robotwin-artifacts \
             --env robotwin \
-            --task beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch \
-            --policy lerobot/smolvla_robotwin
+            --task "${ROBOTWIN_TASKS}" \
+            --policy "${ROBOTWIN_POLICY}"
 
       - name: Upload RoboTwin rollout video
         if: always()
diff --git a/docker/Dockerfile.benchmark.robotwin b/docker/Dockerfile.benchmark.robotwin
index 7a24071eb..423854c31 100644
--- a/docker/Dockerfile.benchmark.robotwin
+++ b/docker/Dockerfile.benchmark.robotwin
@@ -30,6 +30,9 @@ ENV NVIDIA_DRIVER_CAPABILITIES=all \
 # The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
 # `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
 USER root
+# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
+# an upstream fix; don't rely on `main` drift.
+ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
 RUN apt-get update \
     && apt-get install -y --no-install-recommends \
          cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
@@ -37,7 +40,8 @@ RUN apt-get update \
     && mkdir -p /usr/share/vulkan/icd.d \
     && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
        > /usr/share/vulkan/icd.d/nvidia_icd.json \
-    && git clone --depth=1 https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
     && chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 USER user_lerobot
@@ -52,9 +56,11 @@ RUN uv pip install --no-cache --no-build-isolation \
         "git+https://github.com/facebookresearch/pytorch3d.git@stable"
 
 # CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
-# build aborts on an empty arch list.
+# build aborts on an empty arch list. Pinned SHA for reproducibility.
+ARG CUROBO_SHA=ca941586c33b8482ed9c0e74d60f23efd64b516a
 RUN cd ${ROBOTWIN_ROOT}/envs \
-    && git clone --depth=1 https://github.com/NVlabs/curobo.git \
+    && git clone https://github.com/NVlabs/curobo.git \
+    && git -C curobo checkout ${CUROBO_SHA} \
     && cd curobo \
     && TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
        uv pip install -e . --no-build-isolation --no-cache
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index a85a9a2fe..108a88f1d 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -650,7 +650,7 @@ class RoboTwinEnvConfig(EnvConfig):
         if not self.task:
             raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
 
-        env_cls = gym.vector.AsyncVectorEnv if (use_async_envs and n_envs > 1) else gym.vector.SyncVectorEnv
+        env_cls = _make_vec_env_cls(use_async_envs, n_envs)
         cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
         return create_robotwin_envs(
             task=self.task,
@@ -661,12 +661,3 @@ class RoboTwinEnvConfig(EnvConfig):
             observation_width=self.observation_width,
             episode_length=self.episode_length,
         )
-
-    def get_env_processors(self):
-        from lerobot.processor.env_processor import RoboTwinProcessorStep
-        from lerobot.processor.pipeline import PolicyProcessorPipeline
-
-        return (
-            PolicyProcessorPipeline(steps=[RoboTwinProcessorStep()]),
-            PolicyProcessorPipeline(steps=[]),
-        )
diff --git a/src/lerobot/envs/robotwin.py b/src/lerobot/envs/robotwin.py
index 5959b9738..823f14fa0 100644
--- a/src/lerobot/envs/robotwin.py
+++ b/src/lerobot/envs/robotwin.py
@@ -16,6 +16,7 @@
 from __future__ import annotations
 
 import importlib
+import logging
 from collections import defaultdict
 from collections.abc import Callable, Sequence
 from functools import partial
@@ -28,6 +29,10 @@ from gymnasium import spaces
 
 from lerobot.types import RobotObservation
 
+from .utils import _LazyAsyncVectorEnv
+
+logger = logging.getLogger(__name__)
+
 # Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking
 # up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb").
 ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
@@ -126,7 +131,7 @@ def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
     from envs import CONFIGS_PATH  # type: ignore[import-not-found]
 
     task_config = "demo_clean"
-    with open(f"./task_config/{task_config}.yml", encoding="utf-8") as f:
+    with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f:
         args = yaml.safe_load(f)
 
     # Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
@@ -262,7 +267,7 @@ class RoboTwinEnv(gym.Env):
         self.observation_space = spaces.Dict(
             {
                 "pixels": spaces.Dict(image_spaces),
-                "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float64),
+                "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
             }
         )
         self.action_space = spaces.Box(
@@ -303,12 +308,12 @@ class RoboTwinEnv(gym.Env):
         ja = raw.get("joint_action") or {}
         vec = ja.get("vector")
         if vec is not None:
-            arr = np.asarray(vec, dtype=np.float64).ravel()
+            arr = np.asarray(vec, dtype=np.float32).ravel()
             joint_state = (
-                arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float64)
+                arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
             )
         else:
-            joint_state = np.zeros(ACTION_DIM, dtype=np.float64)
+            joint_state = np.zeros(ACTION_DIM, dtype=np.float32)
 
         return {"pixels": images, "agent_pos": joint_state}
 
@@ -415,8 +420,8 @@ def create_robotwin_envs(
     n_envs: int,
     env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
     camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
-    observation_height: int = 480,
-    observation_width: int = 640,
+    observation_height: int = DEFAULT_CAMERA_H,
+    observation_width: int = DEFAULT_CAMERA_W,
     episode_length: int = DEFAULT_EPISODE_LENGTH,
 ) -> dict[str, dict[int, Any]]:
     """Create vectorized RoboTwin 2.0 environments.
@@ -448,7 +453,16 @@ def create_robotwin_envs(
     if unknown:
         raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")
 
-    print(f"Creating RoboTwin envs | tasks={task_names} | n_envs(per task)={n_envs}")
+    logger.info(
+        "Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d",
+        task_names,
+        n_envs,
+    )
+
+    is_async = env_cls is gym.vector.AsyncVectorEnv
+    cached_obs_space: spaces.Space | None = None
+    cached_act_space: spaces.Space | None = None
+    cached_metadata: dict[str, Any] | None = None
 
     out: dict[str, dict[int, Any]] = defaultdict(dict)
     for task_name in task_names:
@@ -460,7 +474,15 @@ def create_robotwin_envs(
             observation_width=observation_width,
             episode_length=episode_length,
         )
-        out[task_name][0] = env_cls(fns)
-        print(f"Built vec env | task={task_name} | n_envs={n_envs}")
+        if is_async:
+            lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
+            if cached_obs_space is None:
+                cached_obs_space = lazy.observation_space
+                cached_act_space = lazy.action_space
+                cached_metadata = lazy.metadata
+            out[task_name][0] = lazy
+        else:
+            out[task_name][0] = env_cls(fns)
+        logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
 
     return {k: dict(v) for k, v in out.items()}
diff --git a/src/lerobot/processor/env_processor.py b/src/lerobot/processor/env_processor.py
index c826a4c70..75cbb79de 100644
--- a/src/lerobot/processor/env_processor.py
+++ b/src/lerobot/processor/env_processor.py
@@ -226,29 +226,3 @@ class IsaaclabArenaProcessorStep(ObservationProcessorStep):
 
     def observation(self, observation):
         return self._process_observation(observation)
-
-
-@dataclass
-@ProcessorStepRegistry.register(name="robotwin_processor")
-class RoboTwinProcessorStep(ObservationProcessorStep):
-    """Passthrough step for RoboTwin observations, casting state to float32.
-
-    RoboTwin observations already arrive in LeRobot format (observation.images.*
-    and observation.state), so this step mainly ensures state dtype is float32.
-    """
-
-    def _process_observation(self, observation):
-        processed_obs = dict(observation)
-        if OBS_STATE in processed_obs:
-            state = processed_obs[OBS_STATE]
-            if hasattr(state, "dtype") and state.dtype != torch.float32:
-                processed_obs[OBS_STATE] = state.float()
-        return processed_obs
-
-    def transform_features(
-        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
-    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
-        return features
-
-    def observation(self, observation):
-        return self._process_observation(observation)
diff --git a/tests/envs/test_robotwin.py b/tests/envs/test_robotwin.py
index b31f63ce6..fcd45adbf 100644
--- a/tests/envs/test_robotwin.py
+++ b/tests/envs/test_robotwin.py
@@ -28,7 +28,6 @@ from unittest.mock import MagicMock, patch
 import gymnasium as gym
 import numpy as np
 import pytest
-import torch
 
 from lerobot.envs.robotwin import (
     ACTION_DIM,
@@ -56,7 +55,7 @@ def _make_mock_task_env(
     """
     obs_dict = {
         "observation": {cam: {"rgb": np.zeros((height, width, 3), dtype=np.uint8)} for cam in cameras},
-        "joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float64)},
+        "joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float32)},
         "endpose": {},
     }
 
@@ -281,34 +280,3 @@ def test_all_tasks_are_strings():
 
 def test_no_duplicate_tasks():
     assert len(ROBOTWIN_TASKS) == len(set(ROBOTWIN_TASKS))
-
-
-# ---------------------------------------------------------------------------
-# RoboTwinProcessorStep
-# ---------------------------------------------------------------------------
-
-
-class TestRoboTwinProcessorStep:
-    def test_passes_through_images_and_state(self):
-        from lerobot.processor.env_processor import RoboTwinProcessorStep
-        from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
-
-        step = RoboTwinProcessorStep()
-        obs = {
-            f"{OBS_IMAGES}.head_camera": torch.zeros(1, 3, 4, 4),
-            f"{OBS_IMAGES}.left_camera": torch.zeros(1, 3, 4, 4),
-            OBS_STATE: torch.zeros(1, 14),
-        }
-        result = step.observation(obs)
-        assert f"{OBS_IMAGES}.head_camera" in result
-        assert f"{OBS_IMAGES}.left_camera" in result
-        assert result[OBS_STATE].dtype == torch.float32
-
-    def test_state_cast_to_float32(self):
-        from lerobot.processor.env_processor import RoboTwinProcessorStep
-        from lerobot.utils.constants import OBS_STATE
-
-        step = RoboTwinProcessorStep()
-        obs = {OBS_STATE: torch.zeros(1, 14, dtype=torch.float64)}
-        result = step.observation(obs)
-        assert result[OBS_STATE].dtype == torch.float32