fix(ci): run VLABench tasks 5-at-a-time in parallel

The eval script already supports running multiple tasks concurrently via a ThreadPoolExecutor (env.max_parallel_tasks). Set it to 5 so the 10 VLABench tasks finish in ~2 waves instead of running sequentially.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
fix(ci): cap VLABench smoke eval at 50 steps per task
2026-05-15 00:29:52 +00:00 · 2026-05-07 11:57:03 +02:00 · 2026-05-07 11:16:43 +02:00 · 2026-05-07 11:11:12 +02:00
4 changed files with 24 additions and 112 deletions
@@ -900,6 +900,8 @@ jobs:
                --policy.path=lerobot/smolvla_vlabench \
                --env.type=vlabench \
                --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
+                --env.episode_length=50 \
+                --env.max_parallel_tasks=5 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -35,7 +35,7 @@ USER root
 ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
 RUN apt-get update \
    && apt-get install -y --no-install-recommends \
-         cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
+         cuda-nvcc-12-6 cuda-cudart-dev-12-6 \
         libvulkan1 vulkan-tools \
    && mkdir -p /usr/share/vulkan/icd.d \
    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
@@ -43,7 +43,6 @@ from .tables import (
    CAN_CMD_SET_ZERO,
    DEFAULT_BAUDRATE,
    DEFAULT_TIMEOUT_MS,
-    HANDSHAKE_TIMEOUT_S,
    MODEL_RESOLUTION,
    MOTOR_LIMIT_PARAMS,
    NORMALIZED_DATA,
@@ -216,16 +215,14 @@ class RobstrideMotorsBus(MotorsBusBase):
            self._is_connected = False
            raise ConnectionError(f"Failed to connect to CAN bus: {e}") from e

-    def _query_status_via_clear_fault(
-        self, motor: NameOrID, timeout: float = RUNNING_TIMEOUT
-    ) -> tuple[bool, can.Message | None]:
+    def _query_status_via_clear_fault(self, motor: NameOrID) -> tuple[bool, can.Message | None]:
        motor_name = self._get_motor_name(motor)
        motor_id = self._get_motor_id(motor_name)
        recv_id = self._get_motor_recv_id(motor_name)
        data = [0xFF] * 7 + [CAN_CMD_CLEAR_FAULT]
        msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
        self._bus().send(msg)
-        return self._recv_status_via_clear_fault(expected_recv_id=recv_id, timeout=timeout)
+        return self._recv_status_via_clear_fault(expected_recv_id=recv_id)

    def _recv_status_via_clear_fault(
        self, expected_recv_id: int | None = None, timeout: float = RUNNING_TIMEOUT
@@ -283,7 +280,7 @@ class RobstrideMotorsBus(MotorsBusBase):
        faulted_motors = []

        for motor_name in self.motors:
-            has_fault, msg = self._query_status_via_clear_fault(motor_name, timeout=HANDSHAKE_TIMEOUT_S)
+            has_fault, msg = self._query_status_via_clear_fault(motor_name)
            if msg is None:
                missing_motors.append(motor_name)
            elif has_fault:
@@ -508,92 +505,6 @@ class RobstrideMotorsBus(MotorsBusBase):

        return responses

-    def _recv_all_messages_until_quiet(
-        self,
-        *,
-        timeout: float = RUNNING_TIMEOUT,
-        max_messages: int = 4096,
-    ) -> list[can.Message]:
-        """
-        Receive frames until the bus goes quiet.
-
-        Args:
-            timeout: Poll timeout used for each recv() call. Collection stops
-                when one recv() times out (quiet gap).
-            max_messages: Safety cap to prevent unbounded loops.
-        """
-        out: list[can.Message] = []
-        max_messages = max(1, max_messages)
-        timeout = max(0.0, timeout)
-
-        try:
-            while len(out) < max_messages:
-                msg = self._bus().recv(timeout=timeout)
-                if msg is None:
-                    break
-                out.append(msg)
-        except (can.CanError, OSError) as e:
-            logger.debug(f"Error draining CAN RX queue on {self.port}: {e}")
-
-        return out
-
-    def _process_feedback_messages(self, messages: list[can.Message]) -> set[int]:
-        """
-        Decode all received feedback frames and update cached motor states.
-
-        Returns:
-            Set of payload recv_ids that were successfully mapped to motors.
-        """
-        processed_recv_ids: set[int] = set()
-        for msg in messages:
-            if len(msg.data) < 1:
-                logger.debug(
-                    "Dropping short CAN frame on %s (arb=0x%02X, data=%s)",
-                    self.port,
-                    int(msg.arbitration_id),
-                    bytes(msg.data).hex(),
-                )
-                continue
-
-            recv_id = int(msg.data[0])
-            motor_name = self._recv_id_to_motor.get(recv_id)
-            if motor_name is None:
-                logger.debug(
-                    "Unmapped CAN frame on %s (arb=0x%02X, recv_id=0x%02X, data=%s)",
-                    self.port,
-                    int(msg.arbitration_id),
-                    recv_id,
-                    bytes(msg.data).hex(),
-                )
-                continue
-
-            self._process_response(motor_name, msg)
-            processed_recv_ids.add(recv_id)
-
-        return processed_recv_ids
-
-    def flush_rx_queue(self, poll_timeout_s: float = 0.0005, max_messages: int = 4096) -> int:
-        """
-        Drain pending RX frames from the CAN interface.
-
-        This is used by higher-level controllers to drop stale feedback before issuing
-        a fresh read cycle, so subsequent state reads are based on most recent replies.
-        It should also be called once when a controller instance is created/connected,
-        to clear residual frames left on the interface from previous sessions.
-        """
-        drained = 0
-        poll_timeout_s = max(0.0, poll_timeout_s)
-        max_messages = max(1, max_messages)
-        try:
-            while drained < max_messages:
-                msg = self._bus().recv(timeout=poll_timeout_s)
-                if msg is None:
-                    break
-                drained += 1
-        except Exception as e:
-            logger.debug("Failed to flush CAN RX queue on %s: %s", self.port, e)
-        return drained
-
    def _speed_control(
        self,
        motor: NameOrID,
@@ -733,14 +644,11 @@ class RobstrideMotorsBus(MotorsBusBase):
            msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
            self._bus().send(msg)
            recv_id_to_motor[self._get_motor_recv_id(motor)] = motor_name
-        # Read every feedback frame until RX goes quiet, then decode all of them.
-        # This avoids dropping useful frames when responses from different motors interleave.
-        messages = self._recv_all_messages_until_quiet()
-        processed_recv_ids = self._process_feedback_messages(messages)

+        responses = self._recv_all_responses(list(recv_id_to_motor.keys()), timeout=RUNNING_TIMEOUT)
        for recv_id, motor_name in recv_id_to_motor.items():
-            if recv_id not in processed_recv_ids:
-                logger.warning("Packet drop: %s (ID: 0x%02X). Using last known state.", motor_name, recv_id)
+            if msg := responses.get(recv_id):
+                self._process_response(motor_name, msg)

    def _float_to_uint(self, x: float, x_min: float, x_max: float, bits: int) -> int:
        """Convert float to unsigned integer for CAN transmission."""
@@ -803,13 +711,7 @@ class RobstrideMotorsBus(MotorsBusBase):
        try:
            self._decode_motor_state(msg.data)
        except Exception as e:
-            logger.warning(
-                "Failed to decode response from %s (arb=0x%02X, data=%s): %s",
-                motor,
-                int(msg.arbitration_id),
-                bytes(msg.data).hex(),
-                e,
-            )
+            logger.warning(f"Failed to decode response from {motor}: {e}")

    def _get_cached_value(self, motor: str, data_name: str) -> Value:
        """Retrieve a specific value from the state cache."""
@@ -944,14 +846,23 @@ class RobstrideMotorsBus(MotorsBusBase):
            data = [0xFF] * 7 + [CAN_CMD_CLEAR_FAULT]
            msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
            self._bus().send(msg)
+            updated_motors.append(motor)

-        messages = self._recv_all_messages_until_quiet()
-        processed_recv_ids = self._process_feedback_messages(messages)
+        expected_recv_ids = [self._get_motor_recv_id(motor) for motor in updated_motors]
+        responses = self._recv_all_responses(expected_recv_ids, timeout=RUNNING_TIMEOUT)
+
+        for response in responses.values():
+            payload_motor_name = self._recv_id_to_motor.get(response.data[0])
+            if payload_motor_name is not None:
+                self._process_response(payload_motor_name, response)
+            else:
+                # Fallback: still attempt to decode based on payload byte0 mapping.
+                self._decode_motor_state(response.data)

        for motor in updated_motors:
            recv_id = self._get_motor_recv_id(motor)
-            if recv_id not in processed_recv_ids:
-                logger.warning("Packet drop: %s (ID: 0x%02X). Using last known state.", motor, recv_id)
+            if recv_id not in responses:
+                logger.warning(f"Packet drop: {motor} (ID: 0x{recv_id:02X}). Using last known state.")

    def read_calibration(self) -> dict[str, MotorCalibration]:
        """Read calibration data from motors."""
@@ -114,8 +114,7 @@ CAN_CMD_SAVE_PARAM = 0xAA
 CAN_PARAM_ID = 0x7FF


-RUNNING_TIMEOUT = 0.003
-HANDSHAKE_TIMEOUT_S = 0.05
+RUNNING_TIMEOUT = 0.001
 PARAM_TIMEOUT = 0.01

 STATE_CACHE_TTL_S = 0.02