robstride: remove redundant timeout and max_messages casts

fix(motors): flush stale robstride RX and harden feedback drain
change import order
2026-05-20 11:09:59 +00:00 · 2026-05-11 09:45:13 +02:00 · 2026-05-07 11:28:56 +02:00 · 2026-05-07 10:37:31 +02:00 · 2026-05-07 10:34:09 +02:00 · 2026-05-07 10:31:14 +02:00
13 changed files with 521 additions and 487 deletions
@@ -382,7 +382,6 @@ jobs:
                --policy.path=\"\$ROBOTWIN_POLICY\" \
                --env.type=robotwin \
                --env.task=\"\$ROBOTWIN_TASKS\" \
-                --env.max_parallel_tasks=5 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -483,7 +482,6 @@ jobs:
                --policy.path=lerobot/smolvla_robocasa \
                --env.type=robocasa \
                --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
-                --env.max_parallel_tasks=5 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -695,7 +693,6 @@ jobs:
                --env.task=\"\$ROBOMME_TASKS\" \
                --env.dataset_split=test \
                --env.task_ids=[0] \
-                --env.max_parallel_tasks=5 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -803,7 +800,6 @@ jobs:
                --env.type=libero_plus \
                --env.task=\"\$LIBERO_PLUS_SUITE\" \
                --env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \
-                --env.max_parallel_tasks=5 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -904,8 +900,6 @@ jobs:
                --policy.path=lerobot/smolvla_vlabench \
                --env.type=vlabench \
                --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
-                --env.episode_length=50 \
-                --env.max_parallel_tasks=5 \
                --eval.batch_size=1 \
                --eval.n_episodes=1 \
                --eval.use_async_envs=false \
@@ -35,7 +35,7 @@ USER root
 ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
 RUN apt-get update \
    && apt-get install -y --no-install-recommends \
-         cuda-nvcc-12-6 cuda-cudart-dev-12-6 \
+         cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
         libvulkan1 vulkan-tools \
    && mkdir -p /usr/share/vulkan/icd.d \
    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
@@ -256,9 +256,7 @@ class TrainPipelineConfig(HubMixin):
                ) from e

        cli_args = kwargs.pop("cli_args", [])
-        # Legacy RA-BC migration only applies to framework-saved checkpoints (always JSON).
-        # Hand-written YAML/TOML configs are expected to use the current sample_weighting schema.
-        if config_file is not None and config_file.endswith(".json"):
+        if config_file is not None:
            with open(config_file) as f:
                config = json.load(f)
            migrated_config = _migrate_legacy_rabc_fields(config)
@@ -43,6 +43,7 @@ from .tables import (
    CAN_CMD_SET_ZERO,
    DEFAULT_BAUDRATE,
    DEFAULT_TIMEOUT_MS,
+    HANDSHAKE_TIMEOUT_S,
    MODEL_RESOLUTION,
    MOTOR_LIMIT_PARAMS,
    NORMALIZED_DATA,
@@ -215,14 +216,16 @@ class RobstrideMotorsBus(MotorsBusBase):
            self._is_connected = False
            raise ConnectionError(f"Failed to connect to CAN bus: {e}") from e

-    def _query_status_via_clear_fault(self, motor: NameOrID) -> tuple[bool, can.Message | None]:
+    def _query_status_via_clear_fault(
+        self, motor: NameOrID, timeout: float = RUNNING_TIMEOUT
+    ) -> tuple[bool, can.Message | None]:
        motor_name = self._get_motor_name(motor)
        motor_id = self._get_motor_id(motor_name)
        recv_id = self._get_motor_recv_id(motor_name)
        data = [0xFF] * 7 + [CAN_CMD_CLEAR_FAULT]
        msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
        self._bus().send(msg)
-        return self._recv_status_via_clear_fault(expected_recv_id=recv_id)
+        return self._recv_status_via_clear_fault(expected_recv_id=recv_id, timeout=timeout)

    def _recv_status_via_clear_fault(
        self, expected_recv_id: int | None = None, timeout: float = RUNNING_TIMEOUT
@@ -280,7 +283,7 @@ class RobstrideMotorsBus(MotorsBusBase):
        faulted_motors = []

        for motor_name in self.motors:
-            has_fault, msg = self._query_status_via_clear_fault(motor_name)
+            has_fault, msg = self._query_status_via_clear_fault(motor_name, timeout=HANDSHAKE_TIMEOUT_S)
            if msg is None:
                missing_motors.append(motor_name)
            elif has_fault:
@@ -505,6 +508,92 @@ class RobstrideMotorsBus(MotorsBusBase):

        return responses

+    def _recv_all_messages_until_quiet(
+        self,
+        *,
+        timeout: float = RUNNING_TIMEOUT,
+        max_messages: int = 4096,
+    ) -> list[can.Message]:
+        """
+        Receive frames until the bus goes quiet.
+
+        Args:
+            timeout: Poll timeout used for each recv() call. Collection stops
+                when one recv() times out (quiet gap).
+            max_messages: Safety cap to prevent unbounded loops.
+        """
+        out: list[can.Message] = []
+        max_messages = max(1, max_messages)
+        timeout = max(0.0, timeout)
+
+        try:
+            while len(out) < max_messages:
+                msg = self._bus().recv(timeout=timeout)
+                if msg is None:
+                    break
+                out.append(msg)
+        except (can.CanError, OSError) as e:
+            logger.debug(f"Error draining CAN RX queue on {self.port}: {e}")
+
+        return out
+
+    def _process_feedback_messages(self, messages: list[can.Message]) -> set[int]:
+        """
+        Decode all received feedback frames and update cached motor states.
+
+        Returns:
+            Set of payload recv_ids that were successfully mapped to motors.
+        """
+        processed_recv_ids: set[int] = set()
+        for msg in messages:
+            if len(msg.data) < 1:
+                logger.debug(
+                    "Dropping short CAN frame on %s (arb=0x%02X, data=%s)",
+                    self.port,
+                    int(msg.arbitration_id),
+                    bytes(msg.data).hex(),
+                )
+                continue
+
+            recv_id = int(msg.data[0])
+            motor_name = self._recv_id_to_motor.get(recv_id)
+            if motor_name is None:
+                logger.debug(
+                    "Unmapped CAN frame on %s (arb=0x%02X, recv_id=0x%02X, data=%s)",
+                    self.port,
+                    int(msg.arbitration_id),
+                    recv_id,
+                    bytes(msg.data).hex(),
+                )
+                continue
+
+            self._process_response(motor_name, msg)
+            processed_recv_ids.add(recv_id)
+
+        return processed_recv_ids
+
+    def flush_rx_queue(self, poll_timeout_s: float = 0.0005, max_messages: int = 4096) -> int:
+        """
+        Drain pending RX frames from the CAN interface.
+
+        This is used by higher-level controllers to drop stale feedback before issuing
+        a fresh read cycle, so subsequent state reads are based on most recent replies.
+        It should also be called once when a controller instance is created/connected,
+        to clear residual frames left on the interface from previous sessions.
+        """
+        drained = 0
+        poll_timeout_s = max(0.0, poll_timeout_s)
+        max_messages = max(1, max_messages)
+        try:
+            while drained < max_messages:
+                msg = self._bus().recv(timeout=poll_timeout_s)
+                if msg is None:
+                    break
+                drained += 1
+        except Exception as e:
+            logger.debug("Failed to flush CAN RX queue on %s: %s", self.port, e)
+        return drained
+
    def _speed_control(
        self,
        motor: NameOrID,
@@ -644,11 +733,14 @@ class RobstrideMotorsBus(MotorsBusBase):
            msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
            self._bus().send(msg)
            recv_id_to_motor[self._get_motor_recv_id(motor)] = motor_name
+        # Read every feedback frame until RX goes quiet, then decode all of them.
+        # This avoids dropping useful frames when responses from different motors interleave.
+        messages = self._recv_all_messages_until_quiet()
+        processed_recv_ids = self._process_feedback_messages(messages)

-        responses = self._recv_all_responses(list(recv_id_to_motor.keys()), timeout=RUNNING_TIMEOUT)
        for recv_id, motor_name in recv_id_to_motor.items():
-            if msg := responses.get(recv_id):
-                self._process_response(motor_name, msg)
+            if recv_id not in processed_recv_ids:
+                logger.warning("Packet drop: %s (ID: 0x%02X). Using last known state.", motor_name, recv_id)

    def _float_to_uint(self, x: float, x_min: float, x_max: float, bits: int) -> int:
        """Convert float to unsigned integer for CAN transmission."""
@@ -711,7 +803,13 @@ class RobstrideMotorsBus(MotorsBusBase):
        try:
            self._decode_motor_state(msg.data)
        except Exception as e:
-            logger.warning(f"Failed to decode response from {motor}: {e}")
+            logger.warning(
+                "Failed to decode response from %s (arb=0x%02X, data=%s): %s",
+                motor,
+                int(msg.arbitration_id),
+                bytes(msg.data).hex(),
+                e,
+            )

    def _get_cached_value(self, motor: str, data_name: str) -> Value:
        """Retrieve a specific value from the state cache."""
@@ -846,23 +944,14 @@ class RobstrideMotorsBus(MotorsBusBase):
            data = [0xFF] * 7 + [CAN_CMD_CLEAR_FAULT]
            msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
            self._bus().send(msg)
-            updated_motors.append(motor)

-        expected_recv_ids = [self._get_motor_recv_id(motor) for motor in updated_motors]
-        responses = self._recv_all_responses(expected_recv_ids, timeout=RUNNING_TIMEOUT)
-
-        for response in responses.values():
-            payload_motor_name = self._recv_id_to_motor.get(response.data[0])
-            if payload_motor_name is not None:
-                self._process_response(payload_motor_name, response)
-            else:
-                # Fallback: still attempt to decode based on payload byte0 mapping.
-                self._decode_motor_state(response.data)
+        messages = self._recv_all_messages_until_quiet()
+        processed_recv_ids = self._process_feedback_messages(messages)

        for motor in updated_motors:
            recv_id = self._get_motor_recv_id(motor)
-            if recv_id not in responses:
-                logger.warning(f"Packet drop: {motor} (ID: 0x{recv_id:02X}). Using last known state.")
+            if recv_id not in processed_recv_ids:
+                logger.warning("Packet drop: %s (ID: 0x%02X). Using last known state.", motor, recv_id)

    def read_calibration(self) -> dict[str, MotorCalibration]:
        """Read calibration data from motors."""
@@ -114,7 +114,8 @@ CAN_CMD_SAVE_PARAM = 0xAA
 CAN_PARAM_ID = 0x7FF


-RUNNING_TIMEOUT = 0.001
+RUNNING_TIMEOUT = 0.003
+HANDSHAKE_TIMEOUT_S = 0.05
 PARAM_TIMEOUT = 0.01

 STATE_CACHE_TTL_S = 0.02
@@ -100,8 +100,8 @@ class DiffusionConfig(PreTrainedConfig):

    # Inputs / output structure.
    n_obs_steps: int = 2
-    horizon: int = 64
-    n_action_steps: int = 32
+    horizon: int = 16
+    n_action_steps: int = 8

    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
@@ -122,10 +122,10 @@ class DiffusionConfig(PreTrainedConfig):
    crop_ratio: float = 1.0
    crop_shape: tuple[int, int] | None = None
    crop_is_random: bool = True
-    pretrained_backbone_weights: str | None = "ResNet18_Weights.IMAGENET1K_V1"
-    use_group_norm: bool = False
+    pretrained_backbone_weights: str | None = None
+    use_group_norm: bool = True
    spatial_softmax_num_keypoints: int = 32
-    use_separate_rgb_encoder_per_camera: bool = True
+    use_separate_rgb_encoder_per_camera: bool = False
    # Unet.
    down_dims: tuple[int, ...] = (512, 1024, 2048)
    kernel_size: int = 5
@@ -97,8 +97,8 @@ class VQBeTConfig(PreTrainedConfig):
    vision_backbone: str = "resnet18"
    crop_shape: tuple[int, int] | None = (84, 84)
    crop_is_random: bool = True
-    pretrained_backbone_weights: str | None = "ResNet18_Weights.IMAGENET1K_V1"
-    use_group_norm: bool = False
+    pretrained_backbone_weights: str | None = None
+    use_group_norm: bool = True
    spatial_softmax_num_keypoints: int = 32
    # VQ-VAE
    n_vqvae_training_steps: int = 20000
@@ -46,7 +46,7 @@ class LeKiwiConfig(RobotConfig):
    cameras: dict[str, CameraConfig] = field(default_factory=lekiwi_cameras_config)

    # Set to `True` for backward compatibility with previous policies/dataset
-    use_degrees: bool = True
+    use_degrees: bool = False


@dataclass
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:51effd76b73e972f10d31f5084ab906386134b600c87b2668767d30232a902bd
+oid sha256:54aecbc1af72a4cd5e9261492f5e7601890517516257aacdf2a0ffb3ce281f1b
 size 992
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d4d7a16ca67f9adefac0e0620a7b2e9c822f2db42faaaced7a89fbad60e5ead4
-size 47680
+oid sha256:88a9c3775a2aa1e90a08850521970070a4fcf0f6b82aab43cd8ccc5cf77e0013
+size 47424
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:796c439ee8a64bf9901ff8325e7419bda8bd316360ee95e6304e8e1ae0f4c36c
+oid sha256:91a2635e05a75fe187a5081504c5f35ce3417378813fa2deaf9ca4e8200e1819
 size 68
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ad33a8b47c39c2e1374567ff9da43cdb95e2dbe904c1b02a35051346d3043095
-size 47680
+oid sha256:645bff922ac7bea63ad018ebf77c303c0e4cd2c1c0dc5ef3192865281bef3dc6
+size 47424
Author	SHA1	Message	Date
Virgile	6d69cfb952	robstride: remove redundant timeout and max_messages casts	2026-05-11 09:45:13 +02:00
Virgile	7953cb4b53	fix(motors): flush stale robstride RX and harden feedback drain	2026-05-07 11:28:56 +02:00
Virgile	cd86016393	change import order	2026-05-07 10:37:31 +02:00
Virgile	46482e23b7	enforce last state read when query	2026-05-07 10:34:09 +02:00
Virgile	a27773fa3e	change timeout for handshake	2026-05-07 10:31:14 +02:00