From ed49c9935acc795173d2ccef9d56542e7f4d6906 Mon Sep 17 00:00:00 2001 From: Robin Glauser Date: Fri, 17 Oct 2025 15:15:03 +0200 Subject: [PATCH 01/62] Adding magnitude encoding bits for feetech motors according to https://github.com/Kotakku/FT_SCServo_Debug_Qt/blob/master/servo/sms_sts.h and https://gitee.com/ftservo/FTServo_Python/blob/main/scservo_sdk/sms_sts.py (#2223) --- src/lerobot/motors/feetech/tables.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lerobot/motors/feetech/tables.py b/src/lerobot/motors/feetech/tables.py index 48814957f..91e844a72 100644 --- a/src/lerobot/motors/feetech/tables.py +++ b/src/lerobot/motors/feetech/tables.py @@ -206,8 +206,12 @@ MODEL_BAUDRATE_TABLE = { # Sign-Magnitude encoding bits STS_SMS_SERIES_ENCODINGS_TABLE = { "Homing_Offset": 11, + "Goal_Position": 15, "Goal_Velocity": 15, + "Goal_Speed": 15, + "Present_Position": 15, "Present_Velocity": 15, + "Present_Speed": 15, } MODEL_ENCODING_TABLE = { From a51682b266458eaeab136c2207cae2d7cfc8fbdc Mon Sep 17 00:00:00 2001 From: Antoine Date: Fri, 17 Oct 2025 15:18:21 +0200 Subject: [PATCH 02/62] Optimized episode cache verification (#2166) Signed-off-by: Antoine Co-authored-by: Michel Aractingi --- src/lerobot/datasets/lerobot_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index ae142c1e8..ec419d557 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -842,7 +842,7 @@ class LeRobotDataset(torch.utils.data.Dataset): # Get available episode indices from cached dataset available_episodes = { ep_idx.item() if isinstance(ep_idx, torch.Tensor) else ep_idx - for ep_idx in self.hf_dataset["episode_index"] + for ep_idx in self.hf_dataset.unique("episode_index") } # Determine requested episodes From 44bf2837012ea912f748b6b1342125dffd140c1d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 15:33:37 +0200 Subject: [PATCH 03/62] chore(deps): bump pypa/gh-action-pypi-publish (#1870) Bumps the github_actions group with 1 update in the /.github/workflows directory: [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish). Updates `pypa/gh-action-pypi-publish` from 1.12.4 to 1.13.0 - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.12.4...v1.13.0) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-version: 1.13.0 dependency-type: direct:production dependency-group: github_actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 67aa5186b..e05b4816a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -103,7 +103,7 @@ jobs: - name: Publish to TestPyPI for pre-releases # True for tags like 'v0.2.0-rc1' if: startsWith(github.ref, 'refs/tags/v') && contains(github.ref, '-') - uses: pypa/gh-action-pypi-publish@v1.12.4 # zizmor: ignore[unpinned-uses, use-trusted-publishing] + uses: pypa/gh-action-pypi-publish@v1.13.0 # zizmor: ignore[unpinned-uses, use-trusted-publishing] with: repository-url: https://test.pypi.org/legacy/ verbose: true @@ -111,7 +111,7 @@ jobs: - name: Publish to PyPI if: startsWith(github.ref, 'refs/tags/v') && !contains(github.ref, '-') - uses: pypa/gh-action-pypi-publish@v1.12.4 # zizmor: ignore[unpinned-uses, use-trusted-publishing] + uses: pypa/gh-action-pypi-publish@v1.13.0 # zizmor: ignore[unpinned-uses, use-trusted-publishing] with: verbose: true print-hash: true From 2c18210d027f30dca1cb2594745413978c25cf8b Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Fri, 17 Oct 2025 15:36:19 +0200 Subject: [PATCH 04/62] chore(robots): deprecate strech, vipex and widowx robots (#2205) --- pyproject.toml | 5 - src/lerobot/robots/stretch3/README.md | 173 ------------- src/lerobot/robots/stretch3/__init__.py | 18 -- .../robots/stretch3/configuration_stretch3.py | 51 ---- src/lerobot/robots/stretch3/robot_stretch3.py | 180 -------------- src/lerobot/robots/utils.py | 8 - src/lerobot/robots/viperx/README.md | 196 --------------- src/lerobot/robots/viperx/__init__.py | 18 -- src/lerobot/robots/viperx/config_viperx.py | 45 ---- src/lerobot/robots/viperx/viperx.py | 233 ------------------ .../stretch3_gamepad/__init__.py | 18 -- .../configuration_stretch3.py | 25 -- .../stretch3_gamepad/stretch3_gamepad.py | 117 --------- src/lerobot/teleoperators/utils.py | 8 - src/lerobot/teleoperators/widowx/__init__.py | 18 -- .../teleoperators/widowx/config_widowx.py | 25 -- src/lerobot/teleoperators/widowx/widowx.py | 155 ------------ 17 files changed, 1293 deletions(-) delete mode 100644 src/lerobot/robots/stretch3/README.md delete mode 100644 src/lerobot/robots/stretch3/__init__.py delete mode 100644 src/lerobot/robots/stretch3/configuration_stretch3.py delete mode 100644 src/lerobot/robots/stretch3/robot_stretch3.py delete mode 100644 src/lerobot/robots/viperx/README.md delete mode 100644 src/lerobot/robots/viperx/__init__.py delete mode 100644 src/lerobot/robots/viperx/config_viperx.py delete mode 100644 src/lerobot/robots/viperx/viperx.py delete mode 100644 src/lerobot/teleoperators/stretch3_gamepad/__init__.py delete mode 100644 src/lerobot/teleoperators/stretch3_gamepad/configuration_stretch3.py delete mode 100644 src/lerobot/teleoperators/stretch3_gamepad/stretch3_gamepad.py delete mode 100644 src/lerobot/teleoperators/widowx/__init__.py delete mode 100644 src/lerobot/teleoperators/widowx/config_widowx.py delete mode 100644 src/lerobot/teleoperators/widowx/widowx.py diff --git a/pyproject.toml b/pyproject.toml index c8879ac36..876f3699b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,11 +114,6 @@ intelrealsense = [ "pyrealsense2-macosx>=2.54,<2.55.0 ; sys_platform == 'darwin'", ] phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0"] -# stretch = [ -# "hello-robot-stretch-body>=0.7.27 ; sys_platform == 'linux'", -# "pyrender @ git+https://github.com/mmatl/pyrender.git ; sys_platform == 'linux'", -# "pyrealsense2>=2.55.1.6486 ; sys_platform != 'darwin'" -# ] # TODO: Currently not supported # Policies pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi"] diff --git a/src/lerobot/robots/stretch3/README.md b/src/lerobot/robots/stretch3/README.md deleted file mode 100644 index 027f12d65..000000000 --- a/src/lerobot/robots/stretch3/README.md +++ /dev/null @@ -1,173 +0,0 @@ -This tutorial explains how to use [Stretch 3](https://hello-robot.com/stretch-3-product) with LeRobot. - -## Setup - -Familiarize yourself with Stretch by following its [tutorials](https://docs.hello-robot.com/0.3/getting_started/hello_robot/) (recommended). - -To use LeRobot on Stretch, 3 options are available: - -- [tethered setup](https://docs.hello-robot.com/0.3/getting_started/connecting_to_stretch/#tethered-setup) -- [untethered setup](https://docs.hello-robot.com/0.3/getting_started/connecting_to_stretch/#untethered-setup) -- ssh directly into Stretch (you will first need to install and configure openssh-server on stretch using one of the two above setups) - -## Install LeRobot - -On Stretch's CLI, follow these steps: - -1. [Install Miniconda](https://docs.anaconda.com/miniconda/#quick-command-line-install): - -```bash -mkdir -p ~/miniconda3 -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh -bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 -rm ~/miniconda3/miniconda.sh -~/miniconda3/bin/conda init bash -``` - -2. Comment out these lines in `~/.profile` (this can mess up paths used by conda and ~/.local/bin should already be in your PATH) - -``` -# set PATH so it includes user's private bin if it exists -if [ -d "$HOME/.local/bin" ] ; then - PATH="$HOME/.local/bin:$PATH" -fi -``` - -3. Restart shell or `source ~/.bashrc` - -4. Create and activate a fresh conda environment for lerobot - -```bash -conda create -y -n lerobot python=3.10 && conda activate lerobot -``` - -5. Clone LeRobot: - -```bash -git clone https://github.com/huggingface/lerobot.git ~/lerobot -``` - -6. When using `miniconda`, install `ffmpeg` in your environment: - -```bash -conda install ffmpeg -c conda-forge -``` - -7. Install LeRobot with stretch dependencies: - -```bash -cd ~/lerobot && pip install -e ".[stretch]" -``` - -> **Note:** If you get this message, you can ignore it: `ERROR: pip's dependency resolver does not currently take into account all the packages that are installed.` - -8. Run a [system check](https://docs.hello-robot.com/0.3/getting_started/stretch_hardware_overview/#system-check) to make sure your robot is ready: - -```bash -stretch_system_check.py -``` - -> **Note:** You may need to free the "robot process" after booting Stretch by running `stretch_free_robot_process.py`. For more info this Stretch's [doc](https://docs.hello-robot.com/0.3/getting_started/stretch_hardware_overview/#turning-off-gamepad-teleoperation). - -You should get something like this: - -```bash -For use with S T R E T C H (R) from Hello Robot Inc. ---------------------------------------------------------------------- - -Model = Stretch 3 -Tool = DexWrist 3 w/ Gripper -Serial Number = stretch-se3-3054 - ----- Checking Hardware ---- -[Pass] Comms are ready -[Pass] Actuators are ready -[Warn] Sensors not ready (IMU AZ = -10.19 out of range -10.1 to -9.5) -[Pass] Battery voltage is 13.6 V - ----- Checking Software ---- -[Pass] Ubuntu 22.04 is ready -[Pass] All APT pkgs are setup correctly -[Pass] Firmware is up-to-date -[Pass] Python pkgs are up-to-date -[Pass] ROS2 Humble is ready -``` - -## Teleoperate, record a dataset and run a policy - -**Calibrate (Optional)** -Before operating Stretch, you need to [home](https://docs.hello-robot.com/0.3/getting_started/stretch_hardware_overview/#homing) it first. Be mindful about giving Stretch some space as this procedure will move the robot's arm and gripper. Now run this command: - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=stretch \ - --control.type=calibrate -``` - -This is equivalent to running `stretch_robot_home.py` - -> **Note:** If you run any of the LeRobot scripts below and Stretch is not properly homed, it will automatically home/calibrate first. - -**Teleoperate** -Before trying teleoperation, you need to activate the gamepad controller by pressing the middle button. For more info, see Stretch's [doc](https://docs.hello-robot.com/0.3/getting_started/hello_robot/#gamepad-teleoperation). - -Now try out teleoperation (see above documentation to learn about the gamepad controls): - -> **NOTE:** To visualize the data, enable `--control.display_data=true`. This streams the data using `rerun`. - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=stretch \ - --control.type=teleoperate -``` - -This is essentially the same as running `stretch_gamepad_teleop.py` - -**Record a dataset** -Once you're familiar with the gamepad controls and after a bit of practice, you can try to record your first dataset with Stretch. - -If you want to use the Hugging Face hub features for uploading your dataset and you haven't previously done it, make sure you've logged in using a write-access token, which can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens): - -```bash -huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential -``` - -Store your Hugging Face repository name in a variable to run these commands: - -```bash -HF_USER=$(huggingface-cli whoami | head -n 1) -echo $HF_USER -``` - -Record one episode: - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=stretch \ - --control.type=record \ - --control.fps=30 \ - --control.single_task="Grasp a lego block and put it in the bin." \ - --control.repo_id=${HF_USER}/stretch_test \ - --control.tags='["tutorial"]' \ - --control.warmup_time_s=5 \ - --control.episode_time_s=30 \ - --control.reset_time_s=30 \ - --control.num_episodes=2 \ - --control.push_to_hub=true -``` - -> **Note:** If you're using ssh to connect to Stretch and run this script, you won't be able to visualize its cameras feed (though they will still be recording). To see the cameras stream, use [tethered](https://docs.hello-robot.com/0.3/getting_started/connecting_to_stretch/#tethered-setup) or [untethered setup](https://docs.hello-robot.com/0.3/getting_started/connecting_to_stretch/#untethered-setup). - -**Replay an episode** -Now try to replay this episode (make sure the robot's initial position is the same): - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=stretch \ - --control.type=replay \ - --control.fps=30 \ - --control.repo_id=${HF_USER}/stretch_test \ - --control.episode=0 -``` - -If you need help, please reach out on Discord in the channel `#stretch3-mobile-arm`. diff --git a/src/lerobot/robots/stretch3/__init__.py b/src/lerobot/robots/stretch3/__init__.py deleted file mode 100644 index b3070bbd6..000000000 --- a/src/lerobot/robots/stretch3/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .configuration_stretch3 import Stretch3RobotConfig -from .robot_stretch3 import Stretch3Robot diff --git a/src/lerobot/robots/stretch3/configuration_stretch3.py b/src/lerobot/robots/stretch3/configuration_stretch3.py deleted file mode 100644 index c1226bf90..000000000 --- a/src/lerobot/robots/stretch3/configuration_stretch3.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field - -from lerobot.cameras import CameraConfig -from lerobot.cameras.opencv import OpenCVCameraConfig -from lerobot.cameras.realsense import RealSenseCameraConfig - -from ..config import RobotConfig - - -@RobotConfig.register_subclass("stretch3") -@dataclass -class Stretch3RobotConfig(RobotConfig): - # cameras - cameras: dict[str, CameraConfig] = field( - default_factory=lambda: { - "navigation": OpenCVCameraConfig( - index_or_path="/dev/hello-nav-head-camera", - fps=10, - width=1280, - height=720, - rotation=-90, - ), - "head": RealSenseCameraConfig( - name="Intel RealSense D435I", - fps=30, - width=640, - height=480, - rotation=90, - ), - "wrist": RealSenseCameraConfig( - name="Intel RealSense D405", - fps=30, - width=640, - height=480, - ), - } - ) diff --git a/src/lerobot/robots/stretch3/robot_stretch3.py b/src/lerobot/robots/stretch3/robot_stretch3.py deleted file mode 100644 index 73df360b2..000000000 --- a/src/lerobot/robots/stretch3/robot_stretch3.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time - -import numpy as np -from stretch_body.gamepad_teleop import GamePadTeleop -from stretch_body.robot import Robot as StretchAPI -from stretch_body.robot_params import RobotParams - -from lerobot.cameras.utils import make_cameras_from_configs -from lerobot.datasets.utils import get_nested_item -from lerobot.utils.constants import OBS_IMAGES, OBS_STATE - -from ..robot import Robot -from .configuration_stretch3 import Stretch3RobotConfig - -# {lerobot_keys: stretch.api.keys} -STRETCH_MOTORS = { - "head_pan.pos": "head.head_pan.pos", - "head_tilt.pos": "head.head_tilt.pos", - "lift.pos": "lift.pos", - "arm.pos": "arm.pos", - "wrist_pitch.pos": "end_of_arm.wrist_pitch.pos", - "wrist_roll.pos": "end_of_arm.wrist_roll.pos", - "wrist_yaw.pos": "end_of_arm.wrist_yaw.pos", - "gripper.pos": "end_of_arm.stretch_gripper.pos", - "base_x.vel": "base.x_vel", - "base_y.vel": "base.y_vel", - "base_theta.vel": "base.theta_vel", -} - - -class Stretch3Robot(Robot): - """[Stretch 3](https://hello-robot.com/stretch-3-product), by Hello Robot.""" - - config_class = Stretch3RobotConfig - name = "stretch3" - - def __init__(self, config: Stretch3RobotConfig): - raise NotImplementedError - super().__init__(config) - - self.config = config - self.robot_type = self.config.type - - self.api = StretchAPI() - self.cameras = make_cameras_from_configs(config.cameras) - - self.is_connected = False - self.logs = {} - - self.teleop = None # TODO remove - - # TODO(aliberts): test this - RobotParams.set_logging_level("WARNING") - RobotParams.set_logging_formatter("brief_console_formatter") - - self.state_keys = None - self.action_keys = None - - @property - def observation_features(self) -> dict: - return { - "dtype": "float32", - "shape": (len(STRETCH_MOTORS),), - "names": {"motors": list(STRETCH_MOTORS)}, - } - - @property - def action_features(self) -> dict: - return self.observation_features - - @property - def camera_features(self) -> dict[str, dict]: - cam_ft = {} - for cam_key, cam in self.cameras.items(): - cam_ft[cam_key] = { - "shape": (cam.height, cam.width, cam.channels), - "names": ["height", "width", "channels"], - "info": None, - } - return cam_ft - - def connect(self) -> None: - self.is_connected = self.api.startup() - if not self.is_connected: - print("Another process is already using Stretch. Try running 'stretch_free_robot_process.py'") - raise ConnectionError() - - for cam in self.cameras.values(): - cam.connect() - self.is_connected = self.is_connected and cam.is_connected - - if not self.is_connected: - print("Could not connect to the cameras, check that all cameras are plugged-in.") - raise ConnectionError() - - self.calibrate() - - def calibrate(self) -> None: - if not self.api.is_homed(): - self.api.home() - - def _get_state(self) -> dict: - status = self.api.get_status() - return {k: get_nested_item(status, v, sep=".") for k, v in STRETCH_MOTORS.items()} - - def get_observation(self) -> dict[str, np.ndarray]: - obs_dict = {} - - # Read Stretch state - before_read_t = time.perf_counter() - state = self._get_state() - self.logs["read_pos_dt_s"] = time.perf_counter() - before_read_t - - if self.state_keys is None: - self.state_keys = list(state) - - state = np.asarray(list(state.values())) - obs_dict[OBS_STATE] = state - - # Capture images from cameras - for cam_key, cam in self.cameras.items(): - before_camread_t = time.perf_counter() - obs_dict[f"{OBS_IMAGES}.{cam_key}"] = cam.async_read() - self.logs[f"read_camera_{cam_key}_dt_s"] = cam.logs["delta_timestamp_s"] - self.logs[f"async_read_camera_{cam_key}_dt_s"] = time.perf_counter() - before_camread_t - - return obs_dict - - def send_action(self, action: np.ndarray) -> np.ndarray: - if not self.is_connected: - raise ConnectionError() - - if self.teleop is None: - self.teleop = GamePadTeleop(robot_instance=False) - self.teleop.startup(robot=self) - - if self.action_keys is None: - dummy_action = self.teleop.gamepad_controller.get_state() - self.action_keys = list(dummy_action.keys()) - - action_dict = dict(zip(self.action_keys, action.tolist(), strict=True)) - - before_write_t = time.perf_counter() - self.teleop.do_motion(state=action_dict, robot=self) - self.push_command() - self.logs["write_pos_dt_s"] = time.perf_counter() - before_write_t - - # TODO(aliberts): return action_sent when motion is limited - return action - - def teleop_safety_stop(self) -> None: - if self.teleop is not None: - self.teleop._safety_stop(robot=self) - - def disconnect(self) -> None: - self.api.stop() - if self.teleop is not None: - self.teleop.gamepad_controller.stop() - self.teleop.stop() - - for cam in self.cameras.values(): - cam.disconnect() - - self.is_connected = False diff --git a/src/lerobot/robots/utils.py b/src/lerobot/robots/utils.py index aca5c8716..4e8001538 100644 --- a/src/lerobot/robots/utils.py +++ b/src/lerobot/robots/utils.py @@ -40,14 +40,6 @@ def make_robot_from_config(config: RobotConfig) -> Robot: from .lekiwi import LeKiwi return LeKiwi(config) - elif config.type == "stretch3": - from .stretch3 import Stretch3Robot - - return Stretch3Robot(config) - elif config.type == "viperx": - from .viperx import ViperX - - return ViperX(config) elif config.type == "hope_jr_hand": from .hope_jr import HopeJrHand diff --git a/src/lerobot/robots/viperx/README.md b/src/lerobot/robots/viperx/README.md deleted file mode 100644 index 2e8fc7289..000000000 --- a/src/lerobot/robots/viperx/README.md +++ /dev/null @@ -1,196 +0,0 @@ -This tutorial explains how to use [Aloha and Aloha 2 stationary](https://www.trossenrobotics.com/aloha-stationary) with LeRobot. - -## Setup - -Follow the [documentation from Trossen Robotics](https://docs.trossenrobotics.com/aloha_docs/2.0/getting_started/stationary/hardware_setup.html) for setting up the hardware and plugging the 4 arms and 4 cameras to your computer. - -## Install LeRobot - -On your computer: - -1. [Install Miniconda](https://docs.anaconda.com/miniconda/#quick-command-line-install): - -```bash -mkdir -p ~/miniconda3 -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh -bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 -rm ~/miniconda3/miniconda.sh -~/miniconda3/bin/conda init bash -``` - -2. Restart shell or `source ~/.bashrc` - -3. Create and activate a fresh conda environment for lerobot - -```bash -conda create -y -n lerobot python=3.10 && conda activate lerobot -``` - -4. Clone LeRobot: - -```bash -git clone https://github.com/huggingface/lerobot.git ~/lerobot -``` - -5. When using `miniconda`, install `ffmpeg` in your environment: - -```bash -conda install ffmpeg -c conda-forge -``` - -6. Install LeRobot with dependencies for the Aloha motors (dynamixel) and cameras (intelrealsense): - -```bash -cd ~/lerobot && pip install -e ".[dynamixel, intelrealsense]" -``` - -## Teleoperate - -\*\*/!\ FOR SAFETY, READ THIS /!\*\* -Teleoperation consists in manually operating the leader arms to move the follower arms. Importantly: - -1. Make sure your leader arms are in the same position as the follower arms, so that the follower arms don't move too fast to match the leader arms, -2. Our code assumes that your robot has been assembled following Trossen Robotics instructions. This allows us to skip calibration, as we use the pre-defined calibration files in `.cache/calibration/aloha_default`. If you replace a motor, make sure you follow the exact instructions from Trossen Robotics. - -By running the following code, you can start your first **SAFE** teleoperation: - -> **NOTE:** To visualize the data, enable `--control.display_data=true`. This streams the data using `rerun`. - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=aloha \ - --robot.max_relative_target=5 \ - --control.type=teleoperate -``` - -By adding `--robot.max_relative_target=5`, we override the default value for `max_relative_target` defined in [`ViperXConfig`](./config_viperx.py). It is expected to be `5` to limit the magnitude of the movement for more safety, but the teleoperation won't be smooth. When you feel confident, you can disable this limit by adding `--robot.max_relative_target=null` to the command line: - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=aloha \ - --robot.max_relative_target=null \ - --control.type=teleoperate -``` - -## Record a dataset - -Once you're familiar with teleoperation, you can record your first dataset with Aloha. - -If you want to use the Hugging Face hub features for uploading your dataset and you haven't previously done it, make sure you've logged in using a write-access token, which can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens): - -```bash -huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential -``` - -Store your Hugging Face repository name in a variable to run these commands: - -```bash -HF_USER=$(huggingface-cli whoami | head -n 1) -echo $HF_USER -``` - -Record 2 episodes and upload your dataset to the hub: - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=aloha \ - --robot.max_relative_target=null \ - --control.type=record \ - --control.fps=30 \ - --control.single_task="Grasp a lego block and put it in the bin." \ - --control.repo_id=${HF_USER}/aloha_test \ - --control.tags='["tutorial"]' \ - --control.warmup_time_s=5 \ - --control.episode_time_s=30 \ - --control.reset_time_s=30 \ - --control.num_episodes=2 \ - --control.push_to_hub=true -``` - -## Visualize a dataset - -If you uploaded your dataset to the hub with `--control.push_to_hub=true`, you can [visualize your dataset online](https://huggingface.co/spaces/lerobot/visualize_dataset) by copy pasting your repo id given by: - -```bash -echo ${HF_USER}/aloha_test -``` - -If you didn't upload with `--control.push_to_hub=false`, you can also visualize it locally with [Rerun](https://github.com/rerun-io/rerun): - -```bash -lerobot-dataset-viz \ - --repo-id ${HF_USER}/aloha_test --episode 0 -``` - -## Replay an episode - -\*\*/!\ FOR SAFETY, READ THIS /!\*\* -Replay consists in automatically replaying the sequence of actions (i.e. goal positions for your motors) recorded in a given dataset episode. Make sure the current initial position of your robot is similar to the one in your episode, so that your follower arms don't move too fast to go to the first goal positions. For safety, you might want to add `--robot.max_relative_target=5` to your command line as explained above. - -Now try to replay the first episode on your robot: - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=aloha \ - --robot.max_relative_target=null \ - --control.type=replay \ - --control.fps=30 \ - --control.repo_id=${HF_USER}/aloha_test \ - --control.episode=0 -``` - -## Train a policy - -To train a policy to control your robot, use the [`lerobot-train`](../src/lerobot/scripts/train.py) script. A few arguments are required. Here is an example command: - -```bash -lerobot-train \ - --dataset.repo_id=${HF_USER}/aloha_test \ - --policy.type=act \ - --output_dir=outputs/train/act_aloha_test \ - --job_name=act_aloha_test \ - --policy.device=cuda \ - --wandb.enable=true -``` - -Let's explain it: - -1. We provided the dataset as argument with `--dataset.repo_id=${HF_USER}/aloha_test`. -2. We provided the policy with `policy.type=act`. This loads configurations from [`configuration_act.py`](../src/lerobot/policies/act/configuration_act.py). Importantly, this policy will automatically adapt to the number of motor states, motor actions and cameras of your robot (e.g. `laptop` and `phone`) which have been saved in your dataset. -3. We provided `policy.device=cuda` since we are training on a Nvidia GPU, but you could use `policy.device=mps` to train on Apple silicon. -4. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`. - -For more information on the `train` script see the previous tutorial: [`examples/4_train_policy_with_script.md`](../examples/4_train_policy_with_script.md) - -Training should take several hours. You will find checkpoints in `outputs/train/act_aloha_test/checkpoints`. - -## Evaluate your policy - -You can use the `record` function from [`lerobot/scripts/control_robot.py`](../src/lerobot/scripts/control_robot.py) but with a policy checkpoint as input. For instance, run this command to record 10 evaluation episodes: - -```bash -python lerobot/scripts/control_robot.py \ - --robot.type=aloha \ - --control.type=record \ - --control.fps=30 \ - --control.single_task="Grasp a lego block and put it in the bin." \ - --control.repo_id=${HF_USER}/eval_act_aloha_test \ - --control.tags='["tutorial"]' \ - --control.warmup_time_s=5 \ - --control.episode_time_s=30 \ - --control.reset_time_s=30 \ - --control.num_episodes=10 \ - --control.push_to_hub=true \ - --control.policy.path=outputs/train/act_aloha_test/checkpoints/last/pretrained_model \ - --control.num_image_writer_processes=1 -``` - -As you can see, it's almost the same command as previously used to record your training dataset. Two things changed: - -1. There is an additional `--control.policy.path` argument which indicates the path to your policy checkpoint with (e.g. `outputs/train/eval_act_aloha_test/checkpoints/last/pretrained_model`). You can also use the model repository if you uploaded a model checkpoint to the hub (e.g. `${HF_USER}/act_aloha_test`). -2. The name of dataset begins by `eval` to reflect that you are running inference (e.g. `${HF_USER}/eval_act_aloha_test`). -3. We use `--control.num_image_writer_processes=1` instead of the default value (`0`). On our computer, using a dedicated process to write images from the 4 cameras on disk allows to reach constant 30 fps during inference. Feel free to explore different values for `--control.num_image_writer_processes`. - -## More - -If you have any question or need help, please reach out on Discord in the channel `#aloha-arm`. diff --git a/src/lerobot/robots/viperx/__init__.py b/src/lerobot/robots/viperx/__init__.py deleted file mode 100644 index bfba07fc7..000000000 --- a/src/lerobot/robots/viperx/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .config_viperx import ViperXConfig -from .viperx import ViperX diff --git a/src/lerobot/robots/viperx/config_viperx.py b/src/lerobot/robots/viperx/config_viperx.py deleted file mode 100644 index ed3876a9c..000000000 --- a/src/lerobot/robots/viperx/config_viperx.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field - -from lerobot.cameras import CameraConfig - -from ..config import RobotConfig - - -@RobotConfig.register_subclass("viperx") -@dataclass -class ViperXConfig(RobotConfig): - port: str # Port to connect to the arm - - disable_torque_on_disconnect: bool = True - - # /!\ FOR SAFETY, READ THIS /!\ - # `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes. - # Set this to a positive scalar to have the same value for all motors, or a dictionary that maps motor - # names to the max_relative_target value for that motor. - # For Aloha, for every goal position request, motor rotations are capped at 5 degrees by default. - # When you feel more confident with teleoperation or running the policy, you can extend - # this safety limit and even removing it by setting it to `null`. - # Also, everything is expected to work safely out-of-the-box, but we highly advise to - # first try to teleoperate the grippers only (by commenting out the rest of the motors in this yaml), - # then to gradually add more motors (by uncommenting), until you can teleoperate both arms fully - max_relative_target: float | dict[str, float] = 5.0 - - # cameras - cameras: dict[str, CameraConfig] = field(default_factory=dict) - # Troubleshooting: If one of your IntelRealSense cameras freeze during - # data recording due to bandwidth limit, you might need to plug the camera - # on another USB hub or PCIe card. diff --git a/src/lerobot/robots/viperx/viperx.py b/src/lerobot/robots/viperx/viperx.py deleted file mode 100644 index 31e99ffdb..000000000 --- a/src/lerobot/robots/viperx/viperx.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import time -from functools import cached_property -from typing import Any - -from lerobot.cameras.utils import make_cameras_from_configs -from lerobot.motors import Motor, MotorCalibration, MotorNormMode -from lerobot.motors.dynamixel import ( - DynamixelMotorsBus, - OperatingMode, -) -from lerobot.utils.constants import OBS_STATE -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError - -from ..robot import Robot -from ..utils import ensure_safe_goal_position -from .config_viperx import ViperXConfig - -logger = logging.getLogger(__name__) - - -class ViperX(Robot): - """ - [ViperX](https://www.trossenrobotics.com/viperx-300) developed by Trossen Robotics - """ - - config_class = ViperXConfig - name = "viperx" - - def __init__( - self, - config: ViperXConfig, - ): - raise NotImplementedError - super().__init__(config) - self.config = config - self.bus = DynamixelMotorsBus( - port=self.config.port, - motors={ - "waist": Motor(1, "xm540-w270", MotorNormMode.RANGE_M100_100), - "shoulder": Motor(2, "xm540-w270", MotorNormMode.RANGE_M100_100), - "shoulder_shadow": Motor(3, "xm540-w270", MotorNormMode.RANGE_M100_100), - "elbow": Motor(4, "xm540-w270", MotorNormMode.RANGE_M100_100), - "elbow_shadow": Motor(5, "xm540-w270", MotorNormMode.RANGE_M100_100), - "forearm_roll": Motor(6, "xm540-w270", MotorNormMode.RANGE_M100_100), - "wrist_angle": Motor(7, "xm540-w270", MotorNormMode.RANGE_M100_100), - "wrist_rotate": Motor(8, "xm430-w350", MotorNormMode.RANGE_M100_100), - "gripper": Motor(9, "xm430-w350", MotorNormMode.RANGE_0_100), - }, - ) - self.cameras = make_cameras_from_configs(config.cameras) - - @property - def _motors_ft(self) -> dict[str, type]: - return {f"{motor}.pos": float for motor in self.bus.motors} - - @property - def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } - - @cached_property - def observation_features(self) -> dict[str, type | tuple]: - return {**self._motors_ft, **self._cameras_ft} - - @cached_property - def action_features(self) -> dict[str, type]: - return self._motors_ft - - @property - def is_connected(self) -> bool: - return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values()) - - def connect(self, calibrate: bool = True) -> None: - """ - We assume that at connection time, arm is in a rest position, - and torque can be safely disabled to run calibration. - """ - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") - - self.bus.connect() - if not self.is_calibrated and calibrate: - self.calibrate() - - for cam in self.cameras.values(): - cam.connect() - - self.configure() - logger.info(f"{self} connected.") - - @property - def is_calibrated(self) -> bool: - return self.bus.is_calibrated - - def calibrate(self) -> None: - raise NotImplementedError # TODO(aliberts): adapt code below (copied from koch - logger.info(f"\nRunning calibration of {self}") - self.bus.disable_torque() - for motor in self.bus.motors: - self.bus.write("Operating_Mode", motor, OperatingMode.EXTENDED_POSITION.value) - - input("Move robot to the middle of its range of motion and press ENTER....") - homing_offsets = self.bus.set_half_turn_homings() - - full_turn_motors = ["shoulder_pan", "wrist_roll"] - unknown_range_motors = [motor for motor in self.bus.motors if motor not in full_turn_motors] - print( - f"Move all joints except {full_turn_motors} sequentially through their entire " - "ranges of motion.\nRecording positions. Press ENTER to stop..." - ) - range_mins, range_maxes = self.bus.record_ranges_of_motion(unknown_range_motors) - for motor in full_turn_motors: - range_mins[motor] = 0 - range_maxes[motor] = 4095 - - self.calibration = {} - for motor, m in self.bus.motors.items(): - self.calibration[motor] = MotorCalibration( - id=m.id, - drive_mode=0, - homing_offset=homing_offsets[motor], - range_min=range_mins[motor], - range_max=range_maxes[motor], - ) - - self.bus.write_calibration(self.calibration) - self._save_calibration() - logger.info(f"Calibration saved to {self.calibration_fpath}") - - def configure(self) -> None: - with self.bus.torque_disabled(): - self.bus.configure_motors() - - # Set secondary/shadow ID for shoulder and elbow. These joints have two motors. - # As a result, if only one of them is required to move to a certain position, - # the other will follow. This is to avoid breaking the motors. - self.bus.write("Secondary_ID", "shoulder_shadow", 2) - self.bus.write("Secondary_ID", "elbow_shadow", 4) - - # Set a velocity limit of 131 as advised by Trossen Robotics - # TODO(aliberts): remove as it's actually useless in position control - self.bus.write("Velocity_Limit", 131) - - # Use 'extended position mode' for all motors except gripper, because in joint mode the servos - # can't rotate more than 360 degrees (from 0 to 4095) And some mistake can happen while assembling - # the arm, you could end up with a servo with a position 0 or 4095 at a crucial point. - # See: https://emanual.robotis.com/docs/en/dxl/x/x_series/#operating-mode11 - for motor in self.bus.motors: - if motor != "gripper": - self.bus.write("Operating_Mode", motor, OperatingMode.EXTENDED_POSITION.value) - - # Use 'position control current based' for follower gripper to be limited by the limit of the - # current. It can grasp an object without forcing too much even tho, it's goal position is a - # complete grasp (both gripper fingers are ordered to join and reach a touch). - self.bus.write("Operating_Mode", "gripper", OperatingMode.CURRENT_POSITION.value) - - def get_observation(self) -> dict[str, Any]: - """The returned observations do not have a batch dimension.""" - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - obs_dict = {} - - # Read arm position - start = time.perf_counter() - obs_dict[OBS_STATE] = self.bus.sync_read("Present_Position") - obs_dict = {f"{motor}.pos": val for motor, val in obs_dict.items()} - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read state: {dt_ms:.1f}ms") - - # Capture images from cameras - for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.async_read() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") - - return obs_dict - - def send_action(self, action: dict[str, float]) -> dict[str, float]: - """Command arm to move to a target joint configuration. - - The relative action magnitude may be clipped depending on the configuration parameter - `max_relative_target`. In this case, the action sent differs from original action. - Thus, this function always returns the action actually sent. - - Args: - action (dict[str, float]): The goal positions for the motors. - - Returns: - dict[str, float]: The action sent to the motors, potentially clipped. - """ - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")} - - # Cap goal position when too far away from present position. - # /!\ Slower fps expected due to reading from the follower. - if self.config.max_relative_target is not None: - present_pos = self.bus.sync_read("Present_Position") - goal_present_pos = {key: (g_pos, present_pos[key]) for key, g_pos in goal_pos.items()} - goal_pos = ensure_safe_goal_position(goal_present_pos, self.config.max_relative_target) - - # Send goal position to the arm - self.bus.sync_write("Goal_Position", goal_pos) - return {f"{motor}.pos": val for motor, val in goal_pos.items()} - - def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - self.bus.disconnect(self.config.disable_torque_on_disconnect) - for cam in self.cameras.values(): - cam.disconnect() - - logger.info(f"{self} disconnected.") diff --git a/src/lerobot/teleoperators/stretch3_gamepad/__init__.py b/src/lerobot/teleoperators/stretch3_gamepad/__init__.py deleted file mode 100644 index fa5a19974..000000000 --- a/src/lerobot/teleoperators/stretch3_gamepad/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .configuration_stretch3 import Stretch3GamePadConfig -from .stretch3_gamepad import Stretch3GamePad diff --git a/src/lerobot/teleoperators/stretch3_gamepad/configuration_stretch3.py b/src/lerobot/teleoperators/stretch3_gamepad/configuration_stretch3.py deleted file mode 100644 index 3af0b5be1..000000000 --- a/src/lerobot/teleoperators/stretch3_gamepad/configuration_stretch3.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass - -from ..config import TeleoperatorConfig - - -@TeleoperatorConfig.register_subclass("stretch3") -@dataclass -class Stretch3GamePadConfig(TeleoperatorConfig): - """Stretch3GamePadConfig""" diff --git a/src/lerobot/teleoperators/stretch3_gamepad/stretch3_gamepad.py b/src/lerobot/teleoperators/stretch3_gamepad/stretch3_gamepad.py deleted file mode 100644 index 09fdfadd7..000000000 --- a/src/lerobot/teleoperators/stretch3_gamepad/stretch3_gamepad.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time - -import numpy as np -from stretch_body.gamepad_teleop import GamePadTeleop -from stretch_body.robot_params import RobotParams - -from lerobot.utils.errors import DeviceAlreadyConnectedError - -from ..teleoperator import Teleoperator -from .configuration_stretch3 import Stretch3GamePadConfig - -# from stretch_body.gamepad_controller.GamePadController -GAMEPAD_BUTTONS = [ - "middle_led_ring_button_pressed", - "left_stick_x", - "left_stick_y", - "right_stick_x", - "right_stick_y", - "left_stick_button_pressed", - "right_stick_button_pressed", - "bottom_button_pressed", - "top_button_pressed", - "left_button_pressed", - "right_button_pressed", - "left_shoulder_button_pressed", - "right_shoulder_button_pressed", - "select_button_pressed", - "start_button_pressed", - "left_trigger_pulled", - "right_trigger_pulled", - "bottom_pad_pressed", - "top_pad_pressed", - "left_pad_pressed", - "right_pad_pressed", -] - - -class Stretch3GamePad(Teleoperator): - """[Stretch 3](https://hello-robot.com/stretch-3-product), by Hello Robot.""" - - config_class = Stretch3GamePadConfig - name = "stretch3" - - def __init__(self, config: Stretch3GamePadConfig): - raise NotImplementedError - super().__init__(config) - - self.config = config - self.robot_type = self.config.type - - self.api = GamePadTeleop(robot_instance=False) - - self.is_connected = False - self.logs = {} - - # TODO(aliberts): test this - RobotParams.set_logging_level("WARNING") - RobotParams.set_logging_formatter("brief_console_formatter") - - @property - def action_features(self) -> dict: - return { - "dtype": "float32", - "shape": (len(GAMEPAD_BUTTONS),), - "names": {"buttons": GAMEPAD_BUTTONS}, - } - - @property - def feedback_features(self) -> dict: - return {} - - def connect(self) -> None: - if self.is_connected: - raise DeviceAlreadyConnectedError( - "ManipulatorRobot is already connected. Do not run `robot.connect()` twice." - ) - - self.api.startup() - self.api._update_state() # Check controller can be read & written - self.api._update_modes() - self.is_connected = True - - def calibrate(self) -> None: - pass - - def get_action(self) -> np.ndarray: - # Read Stretch state - before_read_t = time.perf_counter() - action = self.api.gamepad_controller.get_state() - self.logs["read_pos_dt_s"] = time.perf_counter() - before_read_t - - action = np.asarray(list(action.values())) - - return action - - def send_feedback(self, feedback: np.ndarray) -> None: - pass - - def disconnect(self) -> None: - self.api.stop() - self.is_connected = False diff --git a/src/lerobot/teleoperators/utils.py b/src/lerobot/teleoperators/utils.py index ada7ee8a1..2103a1669 100644 --- a/src/lerobot/teleoperators/utils.py +++ b/src/lerobot/teleoperators/utils.py @@ -49,14 +49,6 @@ def make_teleoperator_from_config(config: TeleoperatorConfig) -> Teleoperator: from .so101_leader import SO101Leader return SO101Leader(config) - elif config.type == "stretch3": - from .stretch3_gamepad import Stretch3GamePad - - return Stretch3GamePad(config) - elif config.type == "widowx": - from .widowx import WidowX - - return WidowX(config) elif config.type == "mock_teleop": from tests.mocks.mock_teleop import MockTeleop diff --git a/src/lerobot/teleoperators/widowx/__init__.py b/src/lerobot/teleoperators/widowx/__init__.py deleted file mode 100644 index 42e312f49..000000000 --- a/src/lerobot/teleoperators/widowx/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .config_widowx import WidowXConfig -from .widowx import WidowX diff --git a/src/lerobot/teleoperators/widowx/config_widowx.py b/src/lerobot/teleoperators/widowx/config_widowx.py deleted file mode 100644 index 42fae12db..000000000 --- a/src/lerobot/teleoperators/widowx/config_widowx.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass - -from ..config import TeleoperatorConfig - - -@TeleoperatorConfig.register_subclass("widowx") -@dataclass -class WidowXConfig(TeleoperatorConfig): - port: str # Port to connect to the arm diff --git a/src/lerobot/teleoperators/widowx/widowx.py b/src/lerobot/teleoperators/widowx/widowx.py deleted file mode 100644 index 1a00bd4d2..000000000 --- a/src/lerobot/teleoperators/widowx/widowx.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import time - -from lerobot.motors import Motor, MotorCalibration, MotorNormMode -from lerobot.motors.dynamixel import ( - DriveMode, - DynamixelMotorsBus, - OperatingMode, -) -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError - -from ..teleoperator import Teleoperator -from .config_widowx import WidowXConfig - -logger = logging.getLogger(__name__) - - -class WidowX(Teleoperator): - """ - [WidowX](https://www.trossenrobotics.com/widowx-250) developed by Trossen Robotics - """ - - config_class = WidowXConfig - name = "widowx" - - def __init__(self, config: WidowXConfig): - raise NotImplementedError - super().__init__(config) - self.config = config - self.bus = DynamixelMotorsBus( - port=self.config.port, - motors={ - "waist": Motor(1, "xm430-w350", MotorNormMode.RANGE_M100_100), - "shoulder": Motor(2, "xm430-w350", MotorNormMode.RANGE_M100_100), - "shoulder_shadow": Motor(3, "xm430-w350", MotorNormMode.RANGE_M100_100), - "elbow": Motor(4, "xm430-w350", MotorNormMode.RANGE_M100_100), - "elbow_shadow": Motor(5, "xm430-w350", MotorNormMode.RANGE_M100_100), - "forearm_roll": Motor(6, "xm430-w350", MotorNormMode.RANGE_M100_100), - "wrist_angle": Motor(7, "xm430-w350", MotorNormMode.RANGE_M100_100), - "wrist_rotate": Motor(8, "xl430-w250", MotorNormMode.RANGE_M100_100), - "gripper": Motor(9, "xc430-w150", MotorNormMode.RANGE_0_100), - }, - ) - - @property - def action_features(self) -> dict[str, type]: - return {f"{motor}.pos": float for motor in self.bus.motors} - - @property - def feedback_features(self) -> dict[str, type]: - return {} - - @property - def is_connected(self) -> bool: - return self.bus.is_connected - - def connect(self, calibrate: bool = True): - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") - - self.bus.connect() - if not self.is_calibrated and calibrate: - self.calibrate() - - self.configure() - logger.info(f"{self} connected.") - - @property - def is_calibrated(self) -> bool: - return self.bus.is_calibrated - - def calibrate(self) -> None: - raise NotImplementedError # TODO(aliberts): adapt code below (copied from koch) - logger.info(f"\nRunning calibration of {self}") - self.bus.disable_torque() - for motor in self.bus.motors: - self.bus.write("Operating_Mode", motor, OperatingMode.EXTENDED_POSITION.value) - - self.bus.write("Drive_Mode", "elbow_flex", DriveMode.INVERTED.value) - drive_modes = {motor: 1 if motor == "elbow_flex" else 0 for motor in self.bus.motors} - - input("Move robot to the middle of its range of motion and press ENTER....") - homing_offsets = self.bus.set_half_turn_homings() - - full_turn_motors = ["shoulder_pan", "wrist_roll"] - unknown_range_motors = [motor for motor in self.bus.motors if motor not in full_turn_motors] - print( - f"Move all joints except {full_turn_motors} sequentially through their " - "entire ranges of motion.\nRecording positions. Press ENTER to stop..." - ) - range_mins, range_maxes = self.bus.record_ranges_of_motion(unknown_range_motors) - for motor in full_turn_motors: - range_mins[motor] = 0 - range_maxes[motor] = 4095 - - self.calibration = {} - for motor, m in self.bus.motors.items(): - self.calibration[motor] = MotorCalibration( - id=m.id, - drive_mode=drive_modes[motor], - homing_offset=homing_offsets[motor], - range_min=range_mins[motor], - range_max=range_maxes[motor], - ) - - self.bus.write_calibration(self.calibration) - self._save_calibration() - logger.info(f"Calibration saved to {self.calibration_fpath}") - - def configure(self) -> None: - self.bus.disable_torque() - self.bus.configure_motors() - - # Set secondary/shadow ID for shoulder and elbow. These joints have two motors. - # As a result, if only one of them is required to move to a certain position, - # the other will follow. This is to avoid breaking the motors. - self.bus.write("Secondary_ID", "shoulder_shadow", 2) - self.bus.write("Secondary_ID", "elbow_shadow", 4) - - def get_action(self) -> dict[str, float]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - start = time.perf_counter() - action = self.bus.sync_read("Present_Position") - action = {f"{motor}.pos": val for motor, val in action.items()} - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read action: {dt_ms:.1f}ms") - return action - - def send_feedback(self, feedback: dict[str, float]) -> None: - raise NotImplementedError - - def disconnect(self) -> None: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - self.bus.disconnect() - logger.info(f"{self} disconnected.") From cf2897f54513a8d5ca56e38a4fa6ec53c28bb45e Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Fri, 17 Oct 2025 16:12:01 +0200 Subject: [PATCH 05/62] Docs(fix): corrects minor mix-ups encoder/decoder (#2231) --- src/lerobot/policies/act/modeling_act.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lerobot/policies/act/modeling_act.py b/src/lerobot/policies/act/modeling_act.py index 4d2890ba6..b7cbcd061 100644 --- a/src/lerobot/policies/act/modeling_act.py +++ b/src/lerobot/policies/act/modeling_act.py @@ -626,8 +626,8 @@ class ACTDecoderLayer(nn.Module): x: (Decoder Sequence, Batch, Channel) tensor of input tokens. encoder_out: (Encoder Sequence, B, C) output features from the last layer of the encoder we are cross-attending with. - decoder_pos_embed: (ES, 1, C) positional embedding for keys (from the encoder). - encoder_pos_embed: (DS, 1, C) Positional_embedding for the queries (from the decoder). + encoder_pos_embed: (ES, 1, C) positional embedding for keys (from the encoder). + decoder_pos_embed: (DS, 1, C) positional embedding for the queries (from the decoder). Returns: (DS, B, C) tensor of decoder output features. """ From 0050d7c61ca4f17fe2498f0d041cc695fe99aac3 Mon Sep 17 00:00:00 2001 From: Edgar Riba Date: Fri, 17 Oct 2025 16:32:24 +0200 Subject: [PATCH 06/62] docs: change video file path format in conversion script (#2113) * Change video file path format in conversion script Updated video file path in the dataset conversion script. Signed-off-by: Edgar Riba * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Edgar Riba --------- Signed-off-by: Edgar Riba Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Michel Aractingi --- src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py index b8ae29ad6..74be6bfa4 100644 --- a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py +++ b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py @@ -98,7 +98,7 @@ OLD videos/chunk-000/CAMERA/episode_000000.mp4 NEW -videos/chunk-000/file_000.mp4 +videos/CAMERA/chunk-000/file_000.mp4 ------------------------- OLD episodes.jsonl From 5d4af4b0b1fc1f77c0151747f1a9a7de20a2aff8 Mon Sep 17 00:00:00 2001 From: yfynb1111 <33393037+yfynb1111@users.noreply.github.com> Date: Fri, 17 Oct 2025 22:32:56 +0800 Subject: [PATCH 07/62] Fix: debug policy load pretrained model failure problem (#2073) Co-authored-by: Steven Palma --- src/lerobot/rl/eval_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/rl/eval_policy.py b/src/lerobot/rl/eval_policy.py index 7cec66800..16bb64a73 100644 --- a/src/lerobot/rl/eval_policy.py +++ b/src/lerobot/rl/eval_policy.py @@ -65,7 +65,7 @@ def main(cfg: TrainRLServerPipelineConfig): # env_cfg=cfg.env, ds_meta=dataset_meta, ) - policy.from_pretrained(env_cfg.pretrained_policy_name_or_path) + policy = policy.from_pretrained(env_cfg.pretrained_policy_name_or_path) policy.eval() eval_policy(env, policy=policy, n_episodes=10) From 45730cc71e604669a5a0607ddc69c0711b322500 Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Fri, 17 Oct 2025 16:33:46 +0200 Subject: [PATCH 08/62] fix(docs): markdown formatting in integrate_hardware.mdx (#2232) * Fixing some markdown formatting in the Step 4 section * fix(docs): code block format --------- Co-authored-by: Doug Harris --- docs/source/integrate_hardware.mdx | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/docs/source/integrate_hardware.mdx b/docs/source/integrate_hardware.mdx index ed9dc8dd5..e1587be91 100644 --- a/docs/source/integrate_hardware.mdx +++ b/docs/source/integrate_hardware.mdx @@ -208,34 +208,36 @@ LeRobot supports saving and loading calibration data automatically. This is usef ```python -> @property -> def is_calibrated(self) -> bool: -> return True -> -> def calibrate(self) -> None: -> pass -> ``` +@property +def is_calibrated(self) -> bool: + return True + +def calibrate(self) -> None: + pass +``` + ### `is_calibrated` This should reflect whether your robot has the required calibration loaded. -``` -python + +```python @property def is_calibrated(self) -> bool: return self.bus.is_calibrated ``` + ### `calibrate()` The goal of the calibration is twofold: - - Know the physical range of motion of each motors in order to only send commands within this range. - - Normalize raw motors positions to sensible continuous values (e.g. percentages, degrees) instead of arbitrary discrete value dependant on the specific motor used that will not replicate elsewhere. + +- Know the physical range of motion of each motors in order to only send commands within this range. +- Normalize raw motors positions to sensible continuous values (e.g. percentages, degrees) instead of arbitrary discrete value dependant on the specific motor used that will not replicate elsewhere. It should implement the logic for calibration (if relevant) and update the `self.calibration` dictionary. If you are using Feetech or Dynamixel motors, our bus interfaces already include methods to help with this. - ```python def calibrate(self) -> None: From da9c2e66f403e1e7051f24eb2ec5f47c2e8ddaa9 Mon Sep 17 00:00:00 2001 From: azaracla <33293244+azaracla@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:26:34 +0200 Subject: [PATCH 09/62] fix: fix deprecated hugginface-cli whoami (#1884) Signed-off-by: azaracla <33293244+azaracla@users.noreply.github.com> Co-authored-by: Steven Palma --- docs/source/il_robots.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index 0d8fd56e5..000df2a19 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -165,7 +165,7 @@ huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential Then store your Hugging Face repository name in a variable: ```bash -HF_USER=$(huggingface-cli whoami | head -n 1) +HF_USER=$(hf auth whoami | head -n 1) echo $HF_USER ``` From 79137f58d1fb52d62fbf2a97351f308e968e9cc9 Mon Sep 17 00:00:00 2001 From: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com> Date: Fri, 17 Oct 2025 23:14:53 +0700 Subject: [PATCH 10/62] Fixed a small wrist flex calibration issue for lekiwi (#1787) wrist_flex is not full_turn_motor (it has only a 180-degree range) and should be calibrated like in so_100, only wrist_roll is a full turn motor Signed-off-by: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com> Co-authored-by: Steven Palma --- src/lerobot/robots/lekiwi/lekiwi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/robots/lekiwi/lekiwi.py b/src/lerobot/robots/lekiwi/lekiwi.py index 357109cb0..86fe017d6 100644 --- a/src/lerobot/robots/lekiwi/lekiwi.py +++ b/src/lerobot/robots/lekiwi/lekiwi.py @@ -153,7 +153,7 @@ class LeKiwi(Robot): homing_offsets.update(dict.fromkeys(self.base_motors, 0)) full_turn_motor = [ - motor for motor in motors if any(keyword in motor for keyword in ["wheel", "wrist"]) + motor for motor in motors if any(keyword in motor for keyword in ["wheel", "wrist_roll"]) ] unknown_range_motors = [motor for motor in motors if motor not in full_turn_motor] From 92b6254473b51a50271d73db7046ad6528c6c99d Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Fri, 17 Oct 2025 19:30:25 +0200 Subject: [PATCH 11/62] feat(utils): add support for Intel XPU backend (#2233) * feat: add support for Intel XPU backend in device selection * Update src/lerobot/utils/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Lim Xiang Yang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix: update is_amp_available to include xpu as a valid device * Update src/lerobot/utils/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Lim Xiang Yang * Update src/lerobot/utils/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Lim Xiang Yang * fix: remove unused return and add comments on fp64 fallback handling * fix(utils): return dtype in case xpu has fp64 --------- Signed-off-by: Lim Xiang Yang Co-authored-by: Lim, Xiang Yang Co-authored-by: Lim Xiang Yang Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jade Choghari --- src/lerobot/utils/utils.py | 27 +++++++++++++++++++++++++-- tests/async_inference/test_helpers.py | 2 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/lerobot/utils/utils.py b/src/lerobot/utils/utils.py index 4447a1fcf..46c15d925 100644 --- a/src/lerobot/utils/utils.py +++ b/src/lerobot/utils/utils.py @@ -45,6 +45,9 @@ def auto_select_torch_device() -> torch.device: elif torch.backends.mps.is_available(): logging.info("Metal backend detected, using mps.") return torch.device("mps") + elif torch.xpu.is_available(): + logging.info("Intel XPU backend detected, using xpu.") + return torch.device("xpu") else: logging.warning("No accelerated backend detected. Using default cpu, this will be slow.") return torch.device("cpu") @@ -61,6 +64,9 @@ def get_safe_torch_device(try_device: str, log: bool = False) -> torch.device: case "mps": assert torch.backends.mps.is_available() device = torch.device("mps") + case "xpu": + assert torch.xpu.is_available() + device = torch.device("xpu") case "cpu": device = torch.device("cpu") if log: @@ -81,6 +87,21 @@ def get_safe_dtype(dtype: torch.dtype, device: str | torch.device): device = device.type if device == "mps" and dtype == torch.float64: return torch.float32 + if device == "xpu" and dtype == torch.float64: + if hasattr(torch.xpu, "get_device_capability"): + device_capability = torch.xpu.get_device_capability() + # NOTE: Some Intel XPU devices do not support double precision (FP64). + # The `has_fp64` flag is returned by `torch.xpu.get_device_capability()` + # when available; if False, we fall back to float32 for compatibility. + if not device_capability.get("has_fp64", False): + logging.warning(f"Device {device} does not support float64, using float32 instead.") + return torch.float32 + else: + logging.warning( + f"Device {device} capability check failed. Assuming no support for float64, using float32 instead." + ) + return torch.float32 + return dtype else: return dtype @@ -91,14 +112,16 @@ def is_torch_device_available(try_device: str) -> bool: return torch.cuda.is_available() elif try_device == "mps": return torch.backends.mps.is_available() + elif try_device == "xpu": + return torch.xpu.is_available() elif try_device == "cpu": return True else: - raise ValueError(f"Unknown device {try_device}. Supported devices are: cuda, mps or cpu.") + raise ValueError(f"Unknown device {try_device}. Supported devices are: cuda, mps, xpu or cpu.") def is_amp_available(device: str): - if device in ["cuda", "cpu"]: + if device in ["cuda", "xpu", "cpu"]: return True elif device == "mps": return False diff --git a/tests/async_inference/test_helpers.py b/tests/async_inference/test_helpers.py index 1e2d1e311..a9e53200d 100644 --- a/tests/async_inference/test_helpers.py +++ b/tests/async_inference/test_helpers.py @@ -389,7 +389,7 @@ def test_raw_observation_to_observation_device_handling(): # Check that all expected keys produce tensors (device placement handled by preprocessor later) for key, value in observation.items(): if isinstance(value, torch.Tensor): - assert value.device.type in ["cpu", "cuda", "mps"], f"Tensor {key} on unexpected device" + assert value.device.type in ["cpu", "cuda", "mps", "xpu"], f"Tensor {key} on unexpected device" def test_raw_observation_to_observation_deterministic(): From f7c4f995451e36c76d51dbf2c01dd7b6b857a520 Mon Sep 17 00:00:00 2001 From: Maximilian Li Date: Sat, 18 Oct 2025 00:50:34 +0200 Subject: [PATCH 12/62] fix(factory): ensure output and input features are set only if not already defined (#1771) Co-authored-by: Steven Palma --- src/lerobot/policies/factory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index cfb550ab2..9c67e317a 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -360,8 +360,10 @@ def make_policy( raise ValueError("env_cfg cannot be None when ds_meta is not provided") features = env_to_policy_features(env_cfg) - cfg.output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION} - cfg.input_features = {key: ft for key, ft in features.items() if key not in cfg.output_features} + if not cfg.output_features: + cfg.output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION} + if not cfg.input_features: + cfg.input_features = {key: ft for key, ft in features.items() if key not in cfg.output_features} kwargs["config"] = cfg if cfg.pretrained_path: From 1ee8d824f57922c35e8fc03dcfe7ef7d5a334585 Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Sat, 18 Oct 2025 00:51:17 +0200 Subject: [PATCH 13/62] fix(docs): update eval example (#2236) Co-authored-by: Hemanth M --- src/lerobot/scripts/lerobot_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index 0fdec9286..0dec18be6 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -26,7 +26,7 @@ lerobot-eval \ --env.type=pusht \ --eval.batch_size=10 \ --eval.n_episodes=10 \ - --use_amp=false \ + --policy.use_amp=false \ --policy.device=cuda ``` @@ -37,7 +37,7 @@ lerobot-eval \ --env.type=pusht \ --eval.batch_size=10 \ --eval.n_episodes=10 \ - --use_amp=false \ + --policy.use_amp=false \ --policy.device=cuda ``` From 7aedbbf81a9dca4ca06eb91ce99097b31448735f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Oct 2025 01:20:45 +0200 Subject: [PATCH 14/62] [pre-commit.ci] pre-commit autoupdate (#1563) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/pre-commit/pre-commit-hooks: v5.0.0 → v6.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v5.0.0...v6.0.0) - [github.com/astral-sh/ruff-pre-commit: v0.12.4 → v0.13.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.4...v0.13.0) - [github.com/adhtruong/mirrors-typos: v1.34.0 → v1.36.2](https://github.com/adhtruong/mirrors-typos/compare/v1.34.0...v1.36.2) - [github.com/gitleaks/gitleaks: v8.27.2 → v8.28.0](https://github.com/gitleaks/gitleaks/compare/v8.27.2...v8.28.0) - [github.com/woodruffw/zizmor-pre-commit: v1.11.0 → v1.13.0](https://github.com/woodruffw/zizmor-pre-commit/compare/v1.11.0...v1.13.0) * chore: update pre-commit versions --------- Co-authored-by: Steven Palma --- .github/workflows/release.yml | 2 +- .pre-commit-config.yaml | 14 +++++++------- pyproject.toml | 1 + src/lerobot/motors/dynamixel/dynamixel.py | 2 +- tests/rl/test_learner_service.py | 4 ++-- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e05b4816a..7339d44ed 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -138,7 +138,7 @@ jobs: - name: Setup uv and Python uses: astral-sh/setup-uv@v6 # zizmor: ignore[unpinned-uses] with: - enable-cache: true + enable-cache: true # zizmor: ignore[cache-poisoning] version: ${{ env.UV_VERSION }} python-version: ${{ env.PYTHON_VERSION }} - name: Create uv virtual environment diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f5beff80..896a0c10b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ repos: ##### General Code Quality & Formatting ##### - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-added-large-files args: ['--maxkb=1024'] @@ -39,20 +39,20 @@ repos: - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.4 + rev: v0.14.1 hooks: - id: ruff-format - id: ruff args: [--fix, --exit-non-zero-on-fix] - repo: https://github.com/adhtruong/mirrors-typos - rev: v1.34.0 + rev: v1.38.1 hooks: - id: typos args: [--force-exclude] - repo: https://github.com/asottile/pyupgrade - rev: v3.20.0 + rev: v3.21.0 hooks: - id: pyupgrade args: [--py310-plus] @@ -68,12 +68,12 @@ repos: ##### Security ##### - repo: https://github.com/gitleaks/gitleaks - rev: v8.27.2 + rev: v8.28.0 hooks: - id: gitleaks - repo: https://github.com/woodruffw/zizmor-pre-commit - rev: v1.11.0 + rev: v1.15.2 hooks: - id: zizmor @@ -87,7 +87,7 @@ repos: # TODO(Steven): Uncomment when ready to use ##### Static Analysis & Typing ##### - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.16.0 + rev: v1.18.2 hooks: - id: mypy args: [--config-file=pyproject.toml] diff --git a/pyproject.toml b/pyproject.toml index 876f3699b..1d0ac6ea0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -246,6 +246,7 @@ default.extend-ignore-identifiers-re = [ "pn", "ser", "ein", + "inpt", ] # TODO: Uncomment when ready to use diff --git a/src/lerobot/motors/dynamixel/dynamixel.py b/src/lerobot/motors/dynamixel/dynamixel.py index e1d4e0963..01bfcf544 100644 --- a/src/lerobot/motors/dynamixel/dynamixel.py +++ b/src/lerobot/motors/dynamixel/dynamixel.py @@ -60,7 +60,7 @@ class OperatingMode(Enum): # This mode controls position. This mode is identical to the Multi-turn Position Control from existing # DYNAMIXEL. 512 turns are supported(-256[rev] ~ 256[rev]). This mode is ideal for multi-turn wrists or - # conveyer systems or a system that requires an additional reduction gear. Note that Max Position + # conveyor systems or a system that requires an additional reduction gear. Note that Max Position # Limit(48), Min Position Limit(52) are not used on Extended Position Control Mode. EXTENDED_POSITION = 4 diff --git a/tests/rl/test_learner_service.py b/tests/rl/test_learner_service.py index b0e61165a..e0f0292be 100644 --- a/tests/rl/test_learner_service.py +++ b/tests/rl/test_learner_service.py @@ -117,12 +117,12 @@ def test_send_interactions(): services_pb2.InteractionMessage(transfer_state=services_pb2.TransferState.TRANSFER_END, data=b"8"), ] - def mock_intercations_stream(): + def mock_interactions_stream(): yield from list_of_interaction_messages return services_pb2.Empty() - response = client.SendInteractions(mock_intercations_stream()) + response = client.SendInteractions(mock_interactions_stream()) assert response == services_pb2.Empty() close_learner_service_stub(channel, server) From d6ea3bbce0fc8c75138f02639ac4ceaf12a67829 Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Sat, 18 Oct 2025 01:34:44 +0200 Subject: [PATCH 15/62] fix(docs): update example flags for lerobot-dataset-viz (#2238) Co-authored-by: Yingjie Wei Co-authored-by: DWarez --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6409d931d..ff20e79d5 100644 --- a/README.md +++ b/README.md @@ -207,13 +207,13 @@ lerobot-dataset-viz \ --episode-index 0 ``` -or from a dataset in a local folder with the `root` option and the `--local-files-only` (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`) +or from a dataset in a local folder with the `root` option and the `--mode local` (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`) ```bash lerobot-dataset-viz \ --repo-id lerobot/pusht \ --root ./my_local_data_dir \ - --local-files-only 1 \ + --mode local \ --episode-index 0 ``` From da5d2f3e9187fa4690e6667fe8b294cae49016d6 Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Sat, 18 Oct 2025 01:35:02 +0200 Subject: [PATCH 16/62] chore(dependencies): upgrade rerun (#2237) * chore(dependencies): upgrade rerun Co-authored-by: Ben Zhang * test(utils): fix rerun scalars --------- Co-authored-by: Ben Zhang --- pyproject.toml | 2 +- src/lerobot/scripts/lerobot_dataset_viz.py | 16 ++++++++-------- src/lerobot/utils/visualization_utils.py | 12 ++++++------ tests/utils/test_visualization_utils.py | 6 +++--- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1d0ac6ea0..cc950958e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ dependencies = [ "draccus==0.10.0", # TODO: Remove == "gymnasium>=1.0.0", - "rerun-sdk>=0.21.0,<0.23.0", # TODO: Bumb dependency + "rerun-sdk>=0.24.0,<0.27.0", # Support dependencies "deepdiff>=7.0.1,<9.0.0", diff --git a/src/lerobot/scripts/lerobot_dataset_viz.py b/src/lerobot/scripts/lerobot_dataset_viz.py index 55708d9a9..12273cb1d 100644 --- a/src/lerobot/scripts/lerobot_dataset_viz.py +++ b/src/lerobot/scripts/lerobot_dataset_viz.py @@ -141,15 +141,15 @@ def visualize_dataset( gc.collect() if mode == "distant": - rr.serve(open_browser=False, web_port=web_port, ws_port=ws_port) + rr.serve_web_viewer(open_browser=False, web_port=web_port) logging.info("Logging to Rerun") for batch in tqdm.tqdm(dataloader, total=len(dataloader)): # iterate over the batch for i in range(len(batch["index"])): - rr.set_time_sequence("frame_index", batch["frame_index"][i].item()) - rr.set_time_seconds("timestamp", batch["timestamp"][i].item()) + rr.set_time("frame_index", sequence=batch["frame_index"][i].item()) + rr.set_time("timestamp", timestamp=batch["timestamp"][i].item()) # display each camera image for key in dataset.meta.camera_keys: @@ -159,21 +159,21 @@ def visualize_dataset( # display each dimension of action space (e.g. actuators command) if ACTION in batch: for dim_idx, val in enumerate(batch[ACTION][i]): - rr.log(f"{ACTION}/{dim_idx}", rr.Scalar(val.item())) + rr.log(f"{ACTION}/{dim_idx}", rr.Scalars(val.item())) # display each dimension of observed state space (e.g. agent position in joint space) if OBS_STATE in batch: for dim_idx, val in enumerate(batch[OBS_STATE][i]): - rr.log(f"state/{dim_idx}", rr.Scalar(val.item())) + rr.log(f"state/{dim_idx}", rr.Scalars(val.item())) if DONE in batch: - rr.log(DONE, rr.Scalar(batch[DONE][i].item())) + rr.log(DONE, rr.Scalars(batch[DONE][i].item())) if REWARD in batch: - rr.log(REWARD, rr.Scalar(batch[REWARD][i].item())) + rr.log(REWARD, rr.Scalars(batch[REWARD][i].item())) if "next.success" in batch: - rr.log("next.success", rr.Scalar(batch["next.success"][i].item())) + rr.log("next.success", rr.Scalars(batch["next.success"][i].item())) if mode == "local" and save: # save .rrd locally diff --git a/src/lerobot/utils/visualization_utils.py b/src/lerobot/utils/visualization_utils.py index 95fdb178a..991b10247 100644 --- a/src/lerobot/utils/visualization_utils.py +++ b/src/lerobot/utils/visualization_utils.py @@ -46,7 +46,7 @@ def log_rerun_data( This function iterates through the provided observation and action dictionaries and sends their contents to the Rerun viewer. It handles different data types appropriately: - - Scalar values (floats, ints) are logged as `rr.Scalar`. + - Scalars values (floats, ints) are logged as `rr.Scalars`. - 3D NumPy arrays that resemble images (e.g., with 1, 3, or 4 channels first) are transposed from CHW to HWC format and logged as `rr.Image`. - 1D NumPy arrays are logged as a series of individual scalars, with each element indexed. @@ -65,7 +65,7 @@ def log_rerun_data( key = k if str(k).startswith(OBS_PREFIX) else f"{OBS_STR}.{k}" if _is_scalar(v): - rr.log(key, rr.Scalar(float(v))) + rr.log(key, rr.Scalars(float(v))) elif isinstance(v, np.ndarray): arr = v # Convert CHW -> HWC when needed @@ -73,7 +73,7 @@ def log_rerun_data( arr = np.transpose(arr, (1, 2, 0)) if arr.ndim == 1: for i, vi in enumerate(arr): - rr.log(f"{key}_{i}", rr.Scalar(float(vi))) + rr.log(f"{key}_{i}", rr.Scalars(float(vi))) else: rr.log(key, rr.Image(arr), static=True) @@ -84,13 +84,13 @@ def log_rerun_data( key = k if str(k).startswith("action.") else f"action.{k}" if _is_scalar(v): - rr.log(key, rr.Scalar(float(v))) + rr.log(key, rr.Scalars(float(v))) elif isinstance(v, np.ndarray): if v.ndim == 1: for i, vi in enumerate(v): - rr.log(f"{key}_{i}", rr.Scalar(float(vi))) + rr.log(f"{key}_{i}", rr.Scalars(float(vi))) else: # Fall back to flattening higher-dimensional arrays flat = v.flatten() for i, vi in enumerate(flat): - rr.log(f"{key}_{i}", rr.Scalar(float(vi))) + rr.log(f"{key}_{i}", rr.Scalars(float(vi))) diff --git a/tests/utils/test_visualization_utils.py b/tests/utils/test_visualization_utils.py index 08a827570..f573de166 100644 --- a/tests/utils/test_visualization_utils.py +++ b/tests/utils/test_visualization_utils.py @@ -45,7 +45,7 @@ def mock_rerun(monkeypatch): calls.append((key, obj, kwargs)) dummy_rr = SimpleNamespace( - Scalar=DummyScalar, + Scalars=DummyScalar, Image=DummyImage, log=dummy_log, init=lambda *a, **k: None, @@ -109,9 +109,9 @@ def test_log_rerun_data_envtransition_scalars_and_image(mock_rerun): vu.log_rerun_data(observation=obs_data, action=action_data) # We expect: - # - observation.state.temperature -> Scalar + # - observation.state.temperature -> Scalars # - observation.camera -> Image (HWC) with static=True - # - action.throttle -> Scalar + # - action.throttle -> Scalars # - action.vector_0, action.vector_1 -> Scalars expected_keys = { f"{OBS_STATE}.temperature", From f0aeded1424c9962da851686cd31c77b17537a03 Mon Sep 17 00:00:00 2001 From: Lycoris <32864669+lycoris1129@users.noreply.github.com> Date: Sat, 18 Oct 2025 12:47:07 +0800 Subject: [PATCH 17/62] Fixes failed to delete images because the timing of gc is uncertain (#1710) * Prevents resource leak in video_utils when getting width and height Added the with statement when opening the image to ensure that the file handle is properly closed after its contents are read. Otherwise, shutil.rmtree(img_dir) will fail when called after the encode_video_frames function completes. Signed-off-by: Lycoris <32864669+lycoris1129@users.noreply.github.com> --------- Signed-off-by: Lycoris <32864669+lycoris1129@users.noreply.github.com> --- src/lerobot/datasets/video_utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py index 740cdb602..0de791919 100644 --- a/src/lerobot/datasets/video_utils.py +++ b/src/lerobot/datasets/video_utils.py @@ -342,8 +342,8 @@ def encode_video_frames( # Define video output frame size (assuming all input frames are the same size) if len(input_list) == 0: raise FileNotFoundError(f"No images found in {imgs_dir}.") - dummy_image = Image.open(input_list[0]) - width, height = dummy_image.size + with Image.open(input_list[0]) as dummy_image: + width, height = dummy_image.size # Define video codec options video_options = {} @@ -373,11 +373,12 @@ def encode_video_frames( # Loop through input frames and encode them for input_data in input_list: - input_image = Image.open(input_data).convert("RGB") - input_frame = av.VideoFrame.from_image(input_image) - packet = output_stream.encode(input_frame) - if packet: - output.mux(packet) + with Image.open(input_data) as input_image: + input_image = input_image.convert("RGB") + input_frame = av.VideoFrame.from_image(input_image) + packet = output_stream.encode(input_frame) + if packet: + output.mux(packet) # Flush the encoder packet = output_stream.encode() From 1ff8986c77db7570059e92a27bc7a72b2f080b03 Mon Sep 17 00:00:00 2001 From: Francesco Capuano <74058581+fracapuano@users.noreply.github.com> Date: Sat, 18 Oct 2025 12:06:43 +0200 Subject: [PATCH 18/62] fix: add MockMotorBus to MockRobot (#2081) --- tests/mocks/mock_robot.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/mocks/mock_robot.py b/tests/mocks/mock_robot.py index 027ee45ed..b0513fd38 100644 --- a/tests/mocks/mock_robot.py +++ b/tests/mocks/mock_robot.py @@ -20,8 +20,10 @@ from functools import cached_property from typing import Any from lerobot.cameras import CameraConfig, make_cameras_from_configs +from lerobot.motors.motors_bus import Motor, MotorNormMode from lerobot.robots import Robot, RobotConfig from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from tests.mocks.mock_motors_bus import MockMotorsBus @RobotConfig.register_subclass("mock_robot") @@ -58,9 +60,22 @@ class MockRobot(Robot): self.config = config self._is_connected = False self._is_calibrated = config.calibrated - self.motors = [f"motor_{i + 1}" for i in range(config.n_motors)] self.cameras = make_cameras_from_configs(config.cameras) + mock_motors = {} + for i in range(config.n_motors): + motor_name = f"motor_{i + 1}" + mock_motors[motor_name] = Motor( + id=i + 1, + model="model_1", # Use model_1 which exists in MockMotorsBus tables + norm_mode=MotorNormMode.RANGE_M100_100, + ) + + self.bus = MockMotorsBus("/dev/dummy-port", mock_motors) + + # NOTE(fracapuano): The .motors attribute was used from the previous interface + self.motors = [f"motor_{i + 1}" for i in range(config.n_motors)] + @property def _motors_ft(self) -> dict[str, type]: return {f"{motor}.pos": float for motor in self.motors} From 4d8f242af9298bbe8c007635c24bfd347c9f12ee Mon Sep 17 00:00:00 2001 From: Caroline Pascal Date: Sun, 19 Oct 2025 14:43:07 +0200 Subject: [PATCH 19/62] chore(pyproject): cleaning no longer existing files/folders in pyproject exclude_dirs (#2240) --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc950958e..1a7da48ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -229,9 +229,6 @@ exclude_dirs = [ "tests", "benchmarks", "src/lerobot/datasets/push_dataset_to_hub", - "src/lerobot/datasets/v2/convert_dataset_v1_to_v2", - "src/lerobot/policies/pi0/conversion_scripts", - "src/lerobot/scripts/push_dataset_to_hub.py", ] skips = ["B101", "B311", "B404", "B603", "B615"] From 98662e5f248afd4c141ce9c42113b1c1621f7eca Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Sun, 19 Oct 2025 19:19:21 +0200 Subject: [PATCH 20/62] chore(install): use miniforge instead of miniconda (#2249) Co-authored-by: Silvio Traversaro --- CONTRIBUTING.md | 2 +- README.md | 4 ++-- docs/source/installation.mdx | 11 +++++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a07596728..dcb5c03d8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -137,7 +137,7 @@ Follow these steps to start contributing: 4. for development, we advise to use a tool like `poetry` or `uv` instead of just `pip` to easily track our dependencies. Follow the instructions to [install poetry](https://python-poetry.org/docs/#installation) (use a version >=2.1.0) or to [install uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) if you don't have one of them already. - Set up a development environment with conda or miniconda: + Set up a development environment with conda: ```bash conda create -y -n lerobot-dev python=3.10 && conda activate lerobot-dev diff --git a/README.md b/README.md index ff20e79d5..56d82c0c7 100644 --- a/README.md +++ b/README.md @@ -104,14 +104,14 @@ LeRobot works with Python 3.10+ and PyTorch 2.2+. ### Environment Setup -Create a virtual environment with Python 3.10 and activate it, e.g. with [`miniconda`](https://docs.anaconda.com/free/miniconda/index.html): +Create a virtual environment with Python 3.10 and activate it, e.g. with [`miniforge`](https://conda-forge.org/download/): ```bash conda create -y -n lerobot python=3.10 conda activate lerobot ``` -When using `miniconda`, install `ffmpeg` in your environment: +When using `conda`, install `ffmpeg` in your environment: ```bash conda install ffmpeg -c conda-forge diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index f5fd09acd..f5fc41b43 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,8 +1,15 @@ # Installation +## Install [`miniforge`](https://conda-forge.org/download/) + +```bash +wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" +bash Miniforge3-$(uname)-$(uname -m).sh +``` + ## Environment Setup -Create a virtual environment with Python 3.10, using [`Miniconda`](https://docs.anaconda.com/miniconda/install/#quick-command-line-install) +Create a virtual environment with Python 3.10, using conda: ```bash conda create -y -n lerobot python=3.10 @@ -14,7 +21,7 @@ Then activate your conda environment, you have to do this each time you open a s conda activate lerobot ``` -When using `miniconda`, install `ffmpeg` in your environment: +When using `conda`, install `ffmpeg` in your environment: ```bash conda install ffmpeg -c conda-forge From a97d078d9528755b9c0cfaf1c13703d6edaa0952 Mon Sep 17 00:00:00 2001 From: Xingdong Zuo Date: Mon, 20 Oct 2025 03:19:57 +0900 Subject: [PATCH 21/62] Feat: Support CLI for Launching LeKiwiHost (#1614) * Support CLI for LeKiwiHost Signed-off-by: Xingdong Zuo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Xingdong Zuo Co-authored-by: Steven Palma --- src/lerobot/robots/lekiwi/lekiwi_host.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/lerobot/robots/lekiwi/lekiwi_host.py b/src/lerobot/robots/lekiwi/lekiwi_host.py index 1155cf71c..ae7ac58c4 100644 --- a/src/lerobot/robots/lekiwi/lekiwi_host.py +++ b/src/lerobot/robots/lekiwi/lekiwi_host.py @@ -18,14 +18,24 @@ import base64 import json import logging import time +from dataclasses import dataclass, field import cv2 +import draccus import zmq from .config_lekiwi import LeKiwiConfig, LeKiwiHostConfig from .lekiwi import LeKiwi +@dataclass +class LeKiwiServerConfig: + """Configuration for the LeKiwi host script.""" + + robot: LeKiwiConfig = field(default_factory=LeKiwiConfig) + host: LeKiwiHostConfig = field(default_factory=LeKiwiHostConfig) + + class LeKiwiHost: def __init__(self, config: LeKiwiHostConfig): self.zmq_context = zmq.Context() @@ -47,17 +57,16 @@ class LeKiwiHost: self.zmq_context.term() -def main(): +@draccus.wrap() +def main(cfg: LeKiwiServerConfig): logging.info("Configuring LeKiwi") - robot_config = LeKiwiConfig() - robot = LeKiwi(robot_config) + robot = LeKiwi(cfg.robot) logging.info("Connecting LeKiwi") robot.connect() logging.info("Starting HostAgent") - host_config = LeKiwiHostConfig() - host = LeKiwiHost(host_config) + host = LeKiwiHost(cfg.host) last_cmd_time = time.time() watchdog_active = False From a95b15ccc03a095fad729f199e8260f1705f3d1c Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Sun, 19 Oct 2025 20:50:00 +0200 Subject: [PATCH 22/62] refactor(env): introduce explicit gym ID handling in EnvConfig/factory (#2234) * refactor(env): introduce explicit gym ID handling in EnvConfig/factory This commit introduces properties for the gym package/ID associated with and environment config. They default to the current defaults (`gym_{package_name}/{task_id}`) to avoid breaking changes, but allow for easier use of external gym environments. Subclasses of `EnvConfig` can override the default properties to allow the factory to import (i.e. register) the gym env from a specific module, and also instantiate the env from any ID string. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more changes * quality * fix test --------- Co-authored-by: Ben Sprenger Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adil Zouitine --- src/lerobot/envs/configs.py | 10 +++++++++ src/lerobot/envs/factory.py | 24 +++++++++++++------- tests/envs/test_envs.py | 44 +++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index 3aa155093..dc526114d 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -37,6 +37,16 @@ class EnvConfig(draccus.ChoiceRegistry, abc.ABC): def type(self) -> str: return self.get_choice_name(self.__class__) + @property + def package_name(self) -> str: + """Package name to import if environment not found in gym registry""" + return f"gym_{self.type}" + + @property + def gym_id(self) -> str: + """ID string used in gym.make() to instantiate the environment""" + return f"{self.package_name}/{self.task}" + @property @abc.abstractmethod def gym_kwargs(self) -> dict: diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py index 059e0e11a..52c7cbb96 100644 --- a/src/lerobot/envs/factory.py +++ b/src/lerobot/envs/factory.py @@ -16,6 +16,7 @@ import importlib import gymnasium as gym +from gymnasium.envs.registration import registry as gym_registry from lerobot.envs.configs import AlohaEnv, EnvConfig, LiberoEnv, PushtEnv @@ -84,17 +85,24 @@ def make_env( gym_kwargs=cfg.gym_kwargs, env_cls=env_cls, ) - package_name = f"gym_{cfg.type}" - try: - importlib.import_module(package_name) - except ModuleNotFoundError as e: - print(f"{package_name} is not installed. Please install it with `pip install 'lerobot[{cfg.type}]'`") - raise e - gym_handle = f"{package_name}/{cfg.task}" + if cfg.gym_id not in gym_registry: + print(f"gym id '{cfg.gym_id}' not found, attempting to import '{cfg.package_name}'...") + try: + importlib.import_module(cfg.package_name) + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + f"Package '{cfg.package_name}' required for env '{cfg.type}' not found. " + f"Please install it or check PYTHONPATH." + ) from e + + if cfg.gym_id not in gym_registry: + raise gym.error.NameNotFound( + f"Environment '{cfg.gym_id}' not registered even after importing '{cfg.package_name}'." + ) def _make_one(): - return gym.make(gym_handle, disable_env_checker=cfg.disable_env_checker, **(cfg.gym_kwargs or {})) + return gym.make(cfg.gym_id, disable_env_checker=cfg.disable_env_checker, **(cfg.gym_kwargs or {})) vec = env_cls([_make_one for _ in range(n_envs)], autoreset_mode=gym.vector.AutoresetMode.SAME_STEP) diff --git a/tests/envs/test_envs.py b/tests/envs/test_envs.py index 51ea564e5..4c129dbbf 100644 --- a/tests/envs/test_envs.py +++ b/tests/envs/test_envs.py @@ -14,13 +14,17 @@ # See the License for the specific language governing permissions and # limitations under the License. import importlib +from dataclasses import dataclass, field import gymnasium as gym import pytest import torch +from gymnasium.envs.registration import register, registry as gym_registry from gymnasium.utils.env_checker import check_env import lerobot +from lerobot.configs.types import PolicyFeature +from lerobot.envs.configs import EnvConfig from lerobot.envs.factory import make_env, make_env_config from lerobot.envs.utils import preprocess_observation from tests.utils import require_env @@ -64,3 +68,43 @@ def test_factory(env_name): assert img.min() >= 0.0 env.close() + + +def test_factory_custom_gym_id(): + gym_id = "dummy_gym_pkg/DummyTask-v0" + if gym_id in gym_registry: + pytest.skip(f"Environment ID {gym_id} is already registered") + + @EnvConfig.register_subclass("dummy") + @dataclass + class DummyEnv(EnvConfig): + task: str = "DummyTask-v0" + fps: int = 10 + features: dict[str, PolicyFeature] = field(default_factory=dict) + + @property + def package_name(self) -> str: + return "dummy_gym_pkg" + + @property + def gym_id(self) -> str: + return gym_id + + @property + def gym_kwargs(self) -> dict: + return {} + + try: + register(id=gym_id, entry_point="gymnasium.envs.classic_control:CartPoleEnv") + + cfg = DummyEnv() + envs_dict = make_env(cfg, n_envs=1) + dummy_envs = envs_dict["dummy"] + assert len(dummy_envs) == 1 + env = next(iter(dummy_envs.values())) + assert env is not None and isinstance(env, gym.vector.VectorEnv) + env.close() + + finally: + if gym_id in gym_registry: + del gym_registry[gym_id] From 88100943ef130629dffa526f8a850f580e3dafbc Mon Sep 17 00:00:00 2001 From: Bryson Jones <63133702+brysonjones@users.noreply.github.com> Date: Sun, 19 Oct 2025 12:39:30 -0700 Subject: [PATCH 23/62] add affine transforms and test (#2145) Co-authored-by: Steven Palma --- src/lerobot/datasets/transforms.py | 7 ++++ tests/datasets/test_image_transforms.py | 53 ++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/lerobot/datasets/transforms.py b/src/lerobot/datasets/transforms.py index f7072c72f..beacc48d9 100644 --- a/src/lerobot/datasets/transforms.py +++ b/src/lerobot/datasets/transforms.py @@ -206,6 +206,11 @@ class ImageTransformsConfig: type="SharpnessJitter", kwargs={"sharpness": (0.5, 1.5)}, ), + "affine": ImageTransformConfig( + weight=1.0, + type="RandomAffine", + kwargs={"degrees": (-5.0, 5.0), "translate": (0.05, 0.05)}, + ), } ) @@ -217,6 +222,8 @@ def make_transform_from_config(cfg: ImageTransformConfig): return v2.ColorJitter(**cfg.kwargs) elif cfg.type == "SharpnessJitter": return SharpnessJitter(**cfg.kwargs) + elif cfg.type == "RandomAffine": + return v2.RandomAffine(**cfg.kwargs) else: raise ValueError(f"Transform '{cfg.type}' is not valid.") diff --git a/tests/datasets/test_image_transforms.py b/tests/datasets/test_image_transforms.py index 98f957076..8a66ceb24 100644 --- a/tests/datasets/test_image_transforms.py +++ b/tests/datasets/test_image_transforms.py @@ -134,6 +134,25 @@ def test_get_image_transforms_sharpness(img_tensor_factory, min_max): torch.testing.assert_close(tf_actual(img_tensor), tf_expected(img_tensor)) +@pytest.mark.parametrize("degrees, translate", [((-5.0, 5.0), (0.05, 0.05)), ((10.0, 10.0), (0.1, 0.1))]) +def test_get_image_transforms_affine(img_tensor_factory, degrees, translate): + img_tensor = img_tensor_factory() + tf_cfg = ImageTransformsConfig( + enable=True, + tfs={ + "affine": ImageTransformConfig( + type="RandomAffine", kwargs={"degrees": degrees, "translate": translate} + ) + }, + ) + tf = ImageTransforms(tf_cfg) + output = tf(img_tensor) + # Verify output shape is preserved + assert output.shape == img_tensor.shape + # Verify transform is type RandomAffine + assert isinstance(tf.transforms["affine"], v2.RandomAffine) + + def test_get_image_transforms_max_num_transforms(img_tensor_factory): img_tensor = img_tensor_factory() tf_cfg = ImageTransformsConfig( @@ -262,7 +281,37 @@ def test_backward_compatibility_default_config(img_tensor, default_transforms): # NOTE: PyTorch versions have different randomness, it might break this test. # See this PR: https://github.com/huggingface/lerobot/pull/1127. - cfg = ImageTransformsConfig(enable=True) + # Use config without affine to match original test artifacts + cfg = ImageTransformsConfig( + enable=True, + tfs={ + "brightness": ImageTransformConfig( + weight=1.0, + type="ColorJitter", + kwargs={"brightness": (0.8, 1.2)}, + ), + "contrast": ImageTransformConfig( + weight=1.0, + type="ColorJitter", + kwargs={"contrast": (0.8, 1.2)}, + ), + "saturation": ImageTransformConfig( + weight=1.0, + type="ColorJitter", + kwargs={"saturation": (0.5, 1.5)}, + ), + "hue": ImageTransformConfig( + weight=1.0, + type="ColorJitter", + kwargs={"hue": (-0.05, 0.05)}, + ), + "sharpness": ImageTransformConfig( + weight=1.0, + type="SharpnessJitter", + kwargs={"sharpness": (0.5, 1.5)}, + ), + }, + ) default_tf = ImageTransforms(cfg) with seeded_context(1337): @@ -368,7 +417,7 @@ def test_save_each_transform(img_tensor_factory, tmp_path): save_each_transform(tf_cfg, img_tensor, tmp_path, n_examples) # Check if the transformed images exist for each transform type - transforms = ["brightness", "contrast", "saturation", "hue", "sharpness"] + transforms = ["brightness", "contrast", "saturation", "hue", "sharpness", "affine"] for transform in transforms: transform_dir = tmp_path / transform assert transform_dir.exists(), f"{transform} directory was not created." From a5ca206c4977fbbcf61d7b362e02a96f89749206 Mon Sep 17 00:00:00 2001 From: Huy Date: Sun, 19 Oct 2025 23:35:21 +0200 Subject: [PATCH 24/62] chore(mypy-compliant): Ensure the model module passes MyPy type checks (#1782) * feat(mypy-compliant): Ensure the model module passes MyPy type checks * fix * uncomment pyproject.toml for model module * fix * fix --------- Co-authored-by: Adil Zouitine Co-authored-by: Steven Palma --- pyproject.toml | 7 +++---- src/lerobot/model/kinematics.py | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1a7da48ea..250395035 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -282,7 +282,6 @@ ignore_errors = true [[tool.mypy.overrides]] module = "lerobot.envs.*" -# Enable type checking only for the envs module ignore_errors = false @@ -298,9 +297,9 @@ ignore_errors = false # module = "lerobot.optim.*" # ignore_errors = false -# [[tool.mypy.overrides]] -# module = "lerobot.model.*" -# ignore_errors = false +[[tool.mypy.overrides]] +module = "lerobot.model.*" +ignore_errors = false # [[tool.mypy.overrides]] # module = "lerobot.processor.*" diff --git a/src/lerobot/model/kinematics.py b/src/lerobot/model/kinematics.py index f059b9790..95d3b235c 100644 --- a/src/lerobot/model/kinematics.py +++ b/src/lerobot/model/kinematics.py @@ -22,18 +22,18 @@ class RobotKinematics: self, urdf_path: str, target_frame_name: str = "gripper_frame_link", - joint_names: list[str] = None, + joint_names: list[str] | None = None, ): """ Initialize placo-based kinematics solver. Args: - urdf_path: Path to the robot URDF file - target_frame_name: Name of the end-effector frame in the URDF - joint_names: List of joint names to use for the kinematics solver + urdf_path (str): Path to the robot URDF file + target_frame_name (str): Name of the end-effector frame in the URDF + joint_names (list[str] | None): List of joint names to use for the kinematics solver """ try: - import placo + import placo # type: ignore[import-not-found] # C++ library with Python bindings, no type stubs available. TODO: Create stub file or request upstream typing support. except ImportError as e: raise ImportError( "placo is required for RobotKinematics. " @@ -52,7 +52,7 @@ class RobotKinematics: # Initialize frame task for IK self.tip_frame = self.solver.add_frame_task(self.target_frame_name, np.eye(4)) - def forward_kinematics(self, joint_pos_deg): + def forward_kinematics(self, joint_pos_deg: np.ndarray) -> np.ndarray: """ Compute forward kinematics for given joint configuration given the target frame name in the constructor. @@ -77,8 +77,12 @@ class RobotKinematics: return self.robot.get_T_world_frame(self.target_frame_name) def inverse_kinematics( - self, current_joint_pos, desired_ee_pose, position_weight=1.0, orientation_weight=0.01 - ): + self, + current_joint_pos: np.ndarray, + desired_ee_pose: np.ndarray, + position_weight: float = 1.0, + orientation_weight: float = 0.01, + ) -> np.ndarray: """ Compute inverse kinematics using placo solver. From c54cd529a2cbf81a28f8d30c6540401f3da2a0a8 Mon Sep 17 00:00:00 2001 From: Jaisree25 <63940089+Jaisree25@users.noreply.github.com> Date: Mon, 20 Oct 2025 03:57:10 -0700 Subject: [PATCH 25/62] Fix: camera code changes only (#1788) --- src/lerobot/cameras/camera.py | 6 +-- src/lerobot/cameras/configs.py | 6 +-- src/lerobot/cameras/opencv/__init__.py | 2 + src/lerobot/cameras/opencv/camera_opencv.py | 42 ++++++++++++++----- .../cameras/opencv/configuration_opencv.py | 4 +- .../configuration_reachy2_camera.py | 4 +- .../cameras/reachy2_camera/reachy2_camera.py | 37 ++++++++++------ .../cameras/realsense/camera_realsense.py | 41 +++++++++++------- .../realsense/configuration_realsense.py | 2 +- src/lerobot/cameras/utils.py | 12 +++--- 10 files changed, 104 insertions(+), 52 deletions(-) diff --git a/src/lerobot/cameras/camera.py b/src/lerobot/cameras/camera.py index e435c7309..bfdb571a7 100644 --- a/src/lerobot/cameras/camera.py +++ b/src/lerobot/cameras/camera.py @@ -17,7 +17,7 @@ import abc from typing import Any -import numpy as np +from numpy.typing import NDArray # type: ignore # TODO: add type stubs for numpy.typing from .configs import CameraConfig, ColorMode @@ -89,7 +89,7 @@ class Camera(abc.ABC): pass @abc.abstractmethod - def read(self, color_mode: ColorMode | None = None) -> np.ndarray: + def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]: """Capture and return a single frame from the camera. Args: @@ -102,7 +102,7 @@ class Camera(abc.ABC): pass @abc.abstractmethod - def async_read(self, timeout_ms: float = ...) -> np.ndarray: + def async_read(self, timeout_ms: float = ...) -> NDArray[Any]: """Asynchronously capture and return a single frame from the camera. Args: diff --git a/src/lerobot/cameras/configs.py b/src/lerobot/cameras/configs.py index 0488a97ff..056eec314 100644 --- a/src/lerobot/cameras/configs.py +++ b/src/lerobot/cameras/configs.py @@ -18,7 +18,7 @@ import abc from dataclasses import dataclass from enum import Enum -import draccus +import draccus # type: ignore # TODO: add type stubs for draccus class ColorMode(str, Enum): @@ -34,11 +34,11 @@ class Cv2Rotation(int, Enum): @dataclass(kw_only=True) -class CameraConfig(draccus.ChoiceRegistry, abc.ABC): +class CameraConfig(draccus.ChoiceRegistry, abc.ABC): # type: ignore # TODO: add type stubs for draccus fps: int | None = None width: int | None = None height: int | None = None @property def type(self) -> str: - return self.get_choice_name(self.__class__) + return str(self.get_choice_name(self.__class__)) diff --git a/src/lerobot/cameras/opencv/__init__.py b/src/lerobot/cameras/opencv/__init__.py index 11d3139fe..377dafc05 100644 --- a/src/lerobot/cameras/opencv/__init__.py +++ b/src/lerobot/cameras/opencv/__init__.py @@ -14,3 +14,5 @@ from .camera_opencv import OpenCVCamera from .configuration_opencv import OpenCVCameraConfig + +__all__ = ["OpenCVCamera", "OpenCVCameraConfig"] diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py index 50e55f0c2..9e278dfdb 100644 --- a/src/lerobot/cameras/opencv/camera_opencv.py +++ b/src/lerobot/cameras/opencv/camera_opencv.py @@ -25,11 +25,12 @@ from pathlib import Path from threading import Event, Lock, Thread from typing import Any +from numpy.typing import NDArray # type: ignore # TODO: add type stubs for numpy.typing + # Fix MSMF hardware transform compatibility for Windows before importing cv2 if platform.system() == "Windows" and "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS" not in os.environ: os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0" -import cv2 -import numpy as np +import cv2 # type: ignore # TODO: add type stubs for OpenCV from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError @@ -121,7 +122,7 @@ class OpenCVCamera(Camera): self.thread: Thread | None = None self.stop_event: Event | None = None self.frame_lock: Lock = Lock() - self.latest_frame: np.ndarray | None = None + self.latest_frame: NDArray[Any] | None = None self.new_frame_event: Event = Event() self.rotation: int | None = get_cv2_rotation(config.rotation) @@ -140,7 +141,7 @@ class OpenCVCamera(Camera): """Checks if the camera is currently connected and opened.""" return isinstance(self.videocapture, cv2.VideoCapture) and self.videocapture.isOpened() - def connect(self, warmup: bool = True): + def connect(self, warmup: bool = True) -> None: """ Connects to the OpenCV camera specified in the configuration. @@ -199,6 +200,9 @@ class OpenCVCamera(Camera): if not self.is_connected: raise DeviceNotConnectedError(f"Cannot configure settings for {self} as it is not connected.") + if self.videocapture is None: + raise DeviceNotConnectedError(f"{self} videocapture is not initialized") + if self.fps is None: self.fps = self.videocapture.get(cv2.CAP_PROP_FPS) else: @@ -219,6 +223,12 @@ class OpenCVCamera(Camera): def _validate_fps(self) -> None: """Validates and sets the camera's frames per second (FPS).""" + if self.videocapture is None: + raise DeviceNotConnectedError(f"{self} videocapture is not initialized") + + if self.fps is None: + raise ValueError(f"{self} FPS is not set") + success = self.videocapture.set(cv2.CAP_PROP_FPS, float(self.fps)) actual_fps = self.videocapture.get(cv2.CAP_PROP_FPS) # Use math.isclose for robust float comparison @@ -228,6 +238,12 @@ class OpenCVCamera(Camera): def _validate_width_and_height(self) -> None: """Validates and sets the camera's frame capture width and height.""" + if self.videocapture is None: + raise DeviceNotConnectedError(f"{self} videocapture is not initialized") + + if self.capture_width is None or self.capture_height is None: + raise ValueError(f"{self} capture_width or capture_height is not set") + width_success = self.videocapture.set(cv2.CAP_PROP_FRAME_WIDTH, float(self.capture_width)) height_success = self.videocapture.set(cv2.CAP_PROP_FRAME_HEIGHT, float(self.capture_height)) @@ -262,7 +278,7 @@ class OpenCVCamera(Camera): possible_paths = sorted(Path("/dev").glob("video*"), key=lambda p: p.name) targets_to_scan = [str(p) for p in possible_paths] else: - targets_to_scan = list(range(MAX_OPENCV_INDEX)) + targets_to_scan = [str(i) for i in range(MAX_OPENCV_INDEX)] for target in targets_to_scan: camera = cv2.VideoCapture(target) @@ -289,7 +305,7 @@ class OpenCVCamera(Camera): return found_cameras_info - def read(self, color_mode: ColorMode | None = None) -> np.ndarray: + def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]: """ Reads a single frame synchronously from the camera. @@ -317,6 +333,9 @@ class OpenCVCamera(Camera): start_time = time.perf_counter() + if self.videocapture is None: + raise DeviceNotConnectedError(f"{self} videocapture is not initialized") + ret, frame = self.videocapture.read() if not ret or frame is None: @@ -329,7 +348,7 @@ class OpenCVCamera(Camera): return processed_frame - def _postprocess_image(self, image: np.ndarray, color_mode: ColorMode | None = None) -> np.ndarray: + def _postprocess_image(self, image: NDArray[Any], color_mode: ColorMode | None = None) -> NDArray[Any]: """ Applies color conversion, dimension validation, and rotation to a raw frame. @@ -372,7 +391,7 @@ class OpenCVCamera(Camera): return processed_image - def _read_loop(self): + def _read_loop(self) -> None: """ Internal loop run by the background thread for asynchronous reading. @@ -383,6 +402,9 @@ class OpenCVCamera(Camera): Stops on DeviceNotConnectedError, logs other errors and continues. """ + if self.stop_event is None: + raise RuntimeError(f"{self}: stop_event is not initialized before starting read loop.") + while not self.stop_event.is_set(): try: color_image = self.read() @@ -419,7 +441,7 @@ class OpenCVCamera(Camera): self.thread = None self.stop_event = None - def async_read(self, timeout_ms: float = 200) -> np.ndarray: + def async_read(self, timeout_ms: float = 200) -> NDArray[Any]: """ Reads the latest available frame asynchronously. @@ -462,7 +484,7 @@ class OpenCVCamera(Camera): return frame - def disconnect(self): + def disconnect(self) -> None: """ Disconnects from the camera and cleans up resources. diff --git a/src/lerobot/cameras/opencv/configuration_opencv.py b/src/lerobot/cameras/opencv/configuration_opencv.py index 3ac92de36..b66fb31bc 100644 --- a/src/lerobot/cameras/opencv/configuration_opencv.py +++ b/src/lerobot/cameras/opencv/configuration_opencv.py @@ -17,6 +17,8 @@ from pathlib import Path from ..configs import CameraConfig, ColorMode, Cv2Rotation +__all__ = ["OpenCVCameraConfig", "ColorMode", "Cv2Rotation"] + @CameraConfig.register_subclass("opencv") @dataclass @@ -56,7 +58,7 @@ class OpenCVCameraConfig(CameraConfig): rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION warmup_s: int = 1 - def __post_init__(self): + def __post_init__(self) -> None: if self.color_mode not in (ColorMode.RGB, ColorMode.BGR): raise ValueError( f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided." diff --git a/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py b/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py index 5b2303ff2..f26cf2ad1 100644 --- a/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py +++ b/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py @@ -16,6 +16,8 @@ from dataclasses import dataclass from ..configs import CameraConfig, ColorMode +__all__ = ["CameraConfig", "ColorMode", "Reachy2CameraConfig"] + @CameraConfig.register_subclass("reachy2_camera") @dataclass @@ -62,7 +64,7 @@ class Reachy2CameraConfig(CameraConfig): port: int = 50065 # use_depth: bool = False - def __post_init__(self): + def __post_init__(self) -> None: if self.name not in ["teleop", "depth"]: raise ValueError(f"`name` is expected to be 'teleop' or 'depth', but {self.name} is provided.") if (self.name == "teleop" and self.image_type not in ["left", "right"]) or ( diff --git a/src/lerobot/cameras/reachy2_camera/reachy2_camera.py b/src/lerobot/cameras/reachy2_camera/reachy2_camera.py index c96789f96..30e096767 100644 --- a/src/lerobot/cameras/reachy2_camera/reachy2_camera.py +++ b/src/lerobot/cameras/reachy2_camera/reachy2_camera.py @@ -23,13 +23,17 @@ import time from threading import Event, Lock, Thread from typing import Any +from numpy.typing import NDArray # type: ignore # TODO: add type stubs for numpy.typing + # Fix MSMF hardware transform compatibility for Windows before importing cv2 if platform.system() == "Windows" and "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS" not in os.environ: os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0" -import cv2 -import numpy as np -from reachy2_sdk.media.camera import CameraView -from reachy2_sdk.media.camera_manager import CameraManager +import cv2 # type: ignore # TODO: add type stubs for OpenCV +import numpy as np # type: ignore # TODO: add type stubs for numpy +from reachy2_sdk.media.camera import CameraView # type: ignore # TODO: add type stubs for reachy2_sdk +from reachy2_sdk.media.camera_manager import ( # type: ignore # TODO: add type stubs for reachy2_sdk + CameraManager, +) from lerobot.utils.errors import DeviceNotConnectedError @@ -73,7 +77,7 @@ class Reachy2Camera(Camera): self.thread: Thread | None = None self.stop_event: Event | None = None self.frame_lock: Lock = Lock() - self.latest_frame: np.ndarray | None = None + self.latest_frame: NDArray[Any] | None = None self.new_frame_event: Event = Event() def __str__(self) -> str: @@ -83,13 +87,17 @@ class Reachy2Camera(Camera): def is_connected(self) -> bool: """Checks if the camera is currently connected and opened.""" if self.config.name == "teleop": - return self.cam_manager._grpc_connected and self.cam_manager.teleop if self.cam_manager else False + return bool( + self.cam_manager._grpc_connected and self.cam_manager.teleop if self.cam_manager else False + ) elif self.config.name == "depth": - return self.cam_manager._grpc_connected and self.cam_manager.depth if self.cam_manager else False + return bool( + self.cam_manager._grpc_connected and self.cam_manager.depth if self.cam_manager else False + ) else: raise ValueError(f"Invalid camera name '{self.config.name}'. Expected 'teleop' or 'depth'.") - def connect(self, warmup: bool = True): + def connect(self, warmup: bool = True) -> None: """ Connects to the Reachy2 CameraManager as specified in the configuration. """ @@ -131,7 +139,7 @@ class Reachy2Camera(Camera): camera_manager.disconnect() return initialized_cameras - def read(self, color_mode: ColorMode | None = None) -> np.ndarray: + def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]: """ Reads a single frame synchronously from the camera. @@ -152,7 +160,7 @@ class Reachy2Camera(Camera): start_time = time.perf_counter() - frame = None + frame: NDArray[Any] = np.empty((0, 0, 3), dtype=np.uint8) if self.cam_manager is None: raise DeviceNotConnectedError(f"{self} is not connected.") @@ -179,7 +187,7 @@ class Reachy2Camera(Camera): return frame - def _read_loop(self): + def _read_loop(self) -> None: """ Internal loop run by the background thread for asynchronous reading. @@ -190,6 +198,9 @@ class Reachy2Camera(Camera): Stops on DeviceNotConnectedError, logs other errors and continues. """ + if self.stop_event is None: + raise RuntimeError(f"{self}: stop_event is not initialized before starting read loop.") + while not self.stop_event.is_set(): try: color_image = self.read() @@ -226,7 +237,7 @@ class Reachy2Camera(Camera): self.thread = None self.stop_event = None - def async_read(self, timeout_ms: float = 200) -> np.ndarray: + def async_read(self, timeout_ms: float = 200) -> NDArray[Any]: """ Reads the latest available frame asynchronously. @@ -269,7 +280,7 @@ class Reachy2Camera(Camera): return frame - def disconnect(self): + def disconnect(self) -> None: """ Stops the background read thread (if running). diff --git a/src/lerobot/cameras/realsense/camera_realsense.py b/src/lerobot/cameras/realsense/camera_realsense.py index cc816e552..f2906fdd8 100644 --- a/src/lerobot/cameras/realsense/camera_realsense.py +++ b/src/lerobot/cameras/realsense/camera_realsense.py @@ -21,11 +21,12 @@ import time from threading import Event, Lock, Thread from typing import Any -import cv2 -import numpy as np +import cv2 # type: ignore # TODO: add type stubs for OpenCV +import numpy as np # type: ignore # TODO: add type stubs for numpy +from numpy.typing import NDArray # type: ignore # TODO: add type stubs for numpy.typing try: - import pyrealsense2 as rs + import pyrealsense2 as rs # type: ignore # TODO: add type stubs for pyrealsense2 except Exception as e: logging.info(f"Could not import realsense: {e}") @@ -132,7 +133,7 @@ class RealSenseCamera(Camera): self.thread: Thread | None = None self.stop_event: Event | None = None self.frame_lock: Lock = Lock() - self.latest_frame: np.ndarray | None = None + self.latest_frame: NDArray[Any] | None = None self.new_frame_event: Event = Event() self.rotation: int | None = get_cv2_rotation(config.rotation) @@ -150,7 +151,7 @@ class RealSenseCamera(Camera): """Checks if the camera pipeline is started and streams are active.""" return self.rs_pipeline is not None and self.rs_profile is not None - def connect(self, warmup: bool = True): + def connect(self, warmup: bool = True) -> None: """ Connects to the RealSense camera specified in the configuration. @@ -264,7 +265,7 @@ class RealSenseCamera(Camera): serial_number = str(found_devices[0]["serial_number"]) return serial_number - def _configure_rs_pipeline_config(self, rs_config): + def _configure_rs_pipeline_config(self, rs_config: Any) -> None: """Creates and configures the RealSense pipeline configuration object.""" rs.config.enable_device(rs_config, self.serial_number) @@ -293,6 +294,9 @@ class RealSenseCamera(Camera): if not self.is_connected: raise DeviceNotConnectedError(f"Cannot validate settings for {self} as it is not connected.") + if self.rs_profile is None: + raise RuntimeError(f"{self}: rs_profile must be initialized before use.") + stream = self.rs_profile.get_stream(rs.stream.color).as_video_stream_profile() if self.fps is None: @@ -308,7 +312,7 @@ class RealSenseCamera(Camera): self.width, self.height = actual_width, actual_height self.capture_width, self.capture_height = actual_width, actual_height - def read_depth(self, timeout_ms: int = 200) -> np.ndarray: + def read_depth(self, timeout_ms: int = 200) -> NDArray[Any]: """ Reads a single frame (depth) synchronously from the camera. @@ -336,6 +340,9 @@ class RealSenseCamera(Camera): start_time = time.perf_counter() + if self.rs_pipeline is None: + raise RuntimeError(f"{self}: rs_pipeline must be initialized before use.") + ret, frame = self.rs_pipeline.try_wait_for_frames(timeout_ms=timeout_ms) if not ret or frame is None: @@ -351,7 +358,7 @@ class RealSenseCamera(Camera): return depth_map_processed - def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np.ndarray: + def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> NDArray[Any]: """ Reads a single frame (color) synchronously from the camera. @@ -376,6 +383,9 @@ class RealSenseCamera(Camera): start_time = time.perf_counter() + if self.rs_pipeline is None: + raise RuntimeError(f"{self}: rs_pipeline must be initialized before use.") + ret, frame = self.rs_pipeline.try_wait_for_frames(timeout_ms=timeout_ms) if not ret or frame is None: @@ -392,8 +402,8 @@ class RealSenseCamera(Camera): return color_image_processed def _postprocess_image( - self, image: np.ndarray, color_mode: ColorMode | None = None, depth_frame: bool = False - ) -> np.ndarray: + self, image: NDArray[Any], color_mode: ColorMode | None = None, depth_frame: bool = False + ) -> NDArray[Any]: """ Applies color conversion, dimension validation, and rotation to a raw color frame. @@ -438,7 +448,7 @@ class RealSenseCamera(Camera): return processed_image - def _read_loop(self): + def _read_loop(self) -> None: """ Internal loop run by the background thread for asynchronous reading. @@ -449,6 +459,9 @@ class RealSenseCamera(Camera): Stops on DeviceNotConnectedError, logs other errors and continues. """ + if self.stop_event is None: + raise RuntimeError(f"{self}: stop_event is not initialized before starting read loop.") + while not self.stop_event.is_set(): try: color_image = self.read(timeout_ms=500) @@ -474,7 +487,7 @@ class RealSenseCamera(Camera): self.thread.daemon = True self.thread.start() - def _stop_read_thread(self): + def _stop_read_thread(self) -> None: """Signals the background read thread to stop and waits for it to join.""" if self.stop_event is not None: self.stop_event.set() @@ -486,7 +499,7 @@ class RealSenseCamera(Camera): self.stop_event = None # NOTE(Steven): Missing implementation for depth for now - def async_read(self, timeout_ms: float = 200) -> np.ndarray: + def async_read(self, timeout_ms: float = 200) -> NDArray[Any]: """ Reads the latest available frame data (color) asynchronously. @@ -529,7 +542,7 @@ class RealSenseCamera(Camera): return frame - def disconnect(self): + def disconnect(self) -> None: """ Disconnects from the camera, stops the pipeline, and cleans up resources. diff --git a/src/lerobot/cameras/realsense/configuration_realsense.py b/src/lerobot/cameras/realsense/configuration_realsense.py index 36a86876d..a094128bc 100644 --- a/src/lerobot/cameras/realsense/configuration_realsense.py +++ b/src/lerobot/cameras/realsense/configuration_realsense.py @@ -59,7 +59,7 @@ class RealSenseCameraConfig(CameraConfig): rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION warmup_s: int = 1 - def __post_init__(self): + def __post_init__(self) -> None: if self.color_mode not in (ColorMode.RGB, ColorMode.BGR): raise ValueError( f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided." diff --git a/src/lerobot/cameras/utils.py b/src/lerobot/cameras/utils.py index aa6ff98b4..1b2d386d6 100644 --- a/src/lerobot/cameras/utils.py +++ b/src/lerobot/cameras/utils.py @@ -53,14 +53,14 @@ def make_cameras_from_configs(camera_configs: dict[str, CameraConfig]) -> dict[s def get_cv2_rotation(rotation: Cv2Rotation) -> int | None: - import cv2 + import cv2 # type: ignore # TODO: add type stubs for OpenCV if rotation == Cv2Rotation.ROTATE_90: - return cv2.ROTATE_90_CLOCKWISE + return int(cv2.ROTATE_90_CLOCKWISE) elif rotation == Cv2Rotation.ROTATE_180: - return cv2.ROTATE_180 + return int(cv2.ROTATE_180) elif rotation == Cv2Rotation.ROTATE_270: - return cv2.ROTATE_90_COUNTERCLOCKWISE + return int(cv2.ROTATE_90_COUNTERCLOCKWISE) else: return None @@ -69,8 +69,8 @@ def get_cv2_backend() -> int: import cv2 if platform.system() == "Windows": - return cv2.CAP_MSMF # Use MSMF for Windows instead of AVFOUNDATION + return int(cv2.CAP_MSMF) # Use MSMF for Windows instead of AVFOUNDATION # elif platform.system() == "Darwin": # macOS # return cv2.CAP_AVFOUNDATION else: # Linux and others - return cv2.CAP_ANY + return int(cv2.CAP_ANY) From eff8a6fd12715edcb0a0facc10f17d53a258fa2c Mon Sep 17 00:00:00 2001 From: tetsugo02 <131431116+tetsugo02@users.noreply.github.com> Date: Mon, 20 Oct 2025 19:57:32 +0900 Subject: [PATCH 26/62] Fix typehint and address the mypy errors of src/lerobot/configs (#1746) * fix: update policy handling and type annotations added typehint and addressed the error of mypy * fix: rename should_push_to_hub to push_to_hub I find that there are other dependencies of push_to_hub so I fix the property name back to original one. * fix: typo * fix: changed the position of try-except block As the copilot said, use raise before `hf_hub_download` would stop program even it is able to download * fix: update pre-commit configuration and mypy settings add args: --follow-imports=silent to pass error which have no relationship with src/lerobot/configs * fix: remove the specific path in .pre-commit-config.yaml * feat: enhance typehint to adapt mypy strict mode. * fix: remove duplicate FileNotFoundError check in PreTrainedConfig * fix: make "pre-commit run --all-files" pass * fix: replace logging with logger for better logging practices * fix: fixed extra changes of lint and format changes * fix: fixed extra changes out of "configs" module * Update src/lerobot/configs/policies.py Co-authored-by: Steven Palma Signed-off-by: tetsugo02 <131431116+tetsugo02@users.noreply.github.com> * fix: add logging for scratch job --------- Signed-off-by: Adil Zouitine Signed-off-by: tetsugo02 <131431116+tetsugo02@users.noreply.github.com> Co-authored-by: Adil Zouitine Co-authored-by: Steven Palma --- pyproject.toml | 16 +++++++++++--- src/lerobot/configs/default.py | 2 +- src/lerobot/configs/eval.py | 18 ++++++++++----- src/lerobot/configs/parser.py | 26 ++++++++++++++-------- src/lerobot/configs/policies.py | 39 +++++++++++++++++++-------------- src/lerobot/configs/train.py | 37 +++++++++++++++++++------------ src/lerobot/configs/types.py | 2 +- 7 files changed, 90 insertions(+), 50 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 250395035..36746e79c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -289,9 +289,19 @@ ignore_errors = false # module = "lerobot.utils.*" # ignore_errors = false -# [[tool.mypy.overrides]] -# module = "lerobot.configs.*" -# ignore_errors = false +[[tool.mypy.overrides]] +module = "lerobot.configs.*" +ignore_errors = false + +# extra strictness for configs +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true + +# ignores +disable_error_code = ["attr-defined"] #TODO: draccus issue + +# include = "src/lerobot/configs/**/*.py" # [[tool.mypy.overrides]] # module = "lerobot.optim.*" diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index afd644e1c..630d63f1b 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -57,7 +57,7 @@ class EvalConfig: # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing). use_async_envs: bool = False - def __post_init__(self): + def __post_init__(self) -> None: if self.batch_size > self.n_episodes: raise ValueError( "The eval batch size is greater than the number of eval episodes " diff --git a/src/lerobot/configs/eval.py b/src/lerobot/configs/eval.py index cfe48cf87..e9e05a7e8 100644 --- a/src/lerobot/configs/eval.py +++ b/src/lerobot/configs/eval.py @@ -13,8 +13,8 @@ # limitations under the License. import datetime as dt -import logging from dataclasses import dataclass, field +from logging import getLogger from pathlib import Path from lerobot import envs, policies # noqa: F401 @@ -22,6 +22,8 @@ from lerobot.configs import parser from lerobot.configs.default import EvalConfig from lerobot.configs.policies import PreTrainedConfig +logger = getLogger(__name__) + @dataclass class EvalPipelineConfig: @@ -35,24 +37,28 @@ class EvalPipelineConfig: job_name: str | None = None seed: int | None = 1000 - def __post_init__(self): + def __post_init__(self) -> None: # HACK: We parse again the cli args here to get the pretrained path if there was one. policy_path = parser.get_path_arg("policy") if policy_path: cli_overrides = parser.get_cli_overrides("policy") self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides) - self.policy.pretrained_path = policy_path + self.policy.pretrained_path = Path(policy_path) else: - logging.warning( + logger.warning( "No pretrained path was provided, evaluated policy will be built from scratch (random weights)." ) if not self.job_name: if self.env is None: - self.job_name = f"{self.policy.type}" + self.job_name = f"{self.policy.type if self.policy is not None else 'scratch'}" else: - self.job_name = f"{self.env.type}_{self.policy.type}" + self.job_name = ( + f"{self.env.type}_{self.policy.type if self.policy is not None else 'scratch'}" + ) + + logger.warning(f"No job name provided, using '{self.job_name}' as job name.") if not self.output_dir: now = dt.datetime.now() diff --git a/src/lerobot/configs/parser.py b/src/lerobot/configs/parser.py index 2296eaa20..57ebaf8fa 100644 --- a/src/lerobot/configs/parser.py +++ b/src/lerobot/configs/parser.py @@ -16,14 +16,19 @@ import inspect import pkgutil import sys from argparse import ArgumentError -from collections.abc import Sequence +from collections.abc import Callable, Iterable, Sequence from functools import wraps from pathlib import Path +from pkgutil import ModuleInfo +from types import ModuleType +from typing import Any, TypeVar, cast import draccus from lerobot.utils.utils import has_method +F = TypeVar("F", bound=Callable[..., object]) + PATH_KEY = "path" PLUGIN_DISCOVERY_SUFFIX = "discover_packages_path" @@ -60,7 +65,7 @@ def parse_arg(arg_name: str, args: Sequence[str] | None = None) -> str | None: return None -def parse_plugin_args(plugin_arg_suffix: str, args: Sequence[str]) -> dict: +def parse_plugin_args(plugin_arg_suffix: str, args: Sequence[str]) -> dict[str, str]: """Parse plugin-related arguments from command-line arguments. This function extracts arguments from command-line arguments that match a specified suffix pattern. @@ -127,7 +132,7 @@ def load_plugin(plugin_path: str) -> None: f"Failed to load plugin '{plugin_path}'. Verify the path and installation: {str(e)}" ) from e - def iter_namespace(ns_pkg): + def iter_namespace(ns_pkg: ModuleType) -> Iterable[ModuleInfo]: return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".") try: @@ -148,6 +153,8 @@ def get_type_arg(field_name: str, args: Sequence[str] | None = None) -> str | No def filter_arg(field_to_filter: str, args: Sequence[str] | None = None) -> list[str]: + if args is None: + return [] return [arg for arg in args if not arg.startswith(f"--{field_to_filter}=")] @@ -171,7 +178,8 @@ def filter_path_args(fields_to_filter: str | list[str], args: Sequence[str] | No if isinstance(fields_to_filter, str): fields_to_filter = [fields_to_filter] - filtered_args = args + filtered_args = [] if args is None else list(args) + for field in fields_to_filter: if get_path_arg(field, args): if get_type_arg(field, args): @@ -184,7 +192,7 @@ def filter_path_args(fields_to_filter: str | list[str], args: Sequence[str] | No return filtered_args -def wrap(config_path: Path | None = None): +def wrap(config_path: Path | None = None) -> Callable[[F], F]: """ HACK: Similar to draccus.wrap but does three additional things: - Will remove '.path' arguments from CLI in order to process them later on. @@ -195,9 +203,9 @@ def wrap(config_path: Path | None = None): from the CLI '.type' arguments """ - def wrapper_outer(fn): + def wrapper_outer(fn: F) -> F: @wraps(fn) - def wrapper_inner(*args, **kwargs): + def wrapper_inner(*args: Any, **kwargs: Any) -> Any: argspec = inspect.getfullargspec(fn) argtype = argspec.annotations[argspec.args[0]] if len(args) > 0 and type(args[0]) is argtype: @@ -225,6 +233,6 @@ def wrap(config_path: Path | None = None): response = fn(cfg, *args, **kwargs) return response - return wrapper_inner + return cast(F, wrapper_inner) - return wrapper_outer + return cast(Callable[[F], F], wrapper_outer) diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py index 98dd4df3f..b1cc19a4e 100644 --- a/src/lerobot/configs/policies.py +++ b/src/lerobot/configs/policies.py @@ -14,12 +14,12 @@ import abc import builtins import json -import logging import os import tempfile from dataclasses import dataclass, field +from logging import getLogger from pathlib import Path -from typing import TypeVar +from typing import Any, TypeVar import draccus from huggingface_hub import hf_hub_download @@ -34,10 +34,11 @@ from lerobot.utils.hub import HubMixin from lerobot.utils.utils import auto_select_torch_device, is_amp_available, is_torch_device_available T = TypeVar("T", bound="PreTrainedConfig") +logger = getLogger(__name__) @dataclass -class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): +class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # type: ignore[misc,name-defined] #TODO: draccus issue """ Base configuration class for policy models. @@ -62,7 +63,7 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # automatic gradient scaling is used. use_amp: bool = False - push_to_hub: bool = True + push_to_hub: bool = True # type: ignore[assignment] # TODO: use a different name to avoid override repo_id: str | None = None # Upload on private repository on the Hugging Face hub. @@ -73,38 +74,41 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): license: str | None = None # Either the repo ID of a model hosted on the Hub or a path to a directory containing weights # saved using `Policy.save_pretrained`. If not provided, the policy is initialized from scratch. - pretrained_path: str | None = None + pretrained_path: Path | None = None - def __post_init__(self): + def __post_init__(self) -> None: if not self.device or not is_torch_device_available(self.device): auto_device = auto_select_torch_device() - logging.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.") + logger.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.") self.device = auto_device.type # Automatically deactivate AMP if necessary if self.use_amp and not is_amp_available(self.device): - logging.warning( + logger.warning( f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP." ) self.use_amp = False @property def type(self) -> str: - return self.get_choice_name(self.__class__) + choice_name = self.get_choice_name(self.__class__) + if not isinstance(choice_name, str): + raise TypeError(f"Expected string from get_choice_name, got {type(choice_name)}") + return choice_name @property @abc.abstractmethod - def observation_delta_indices(self) -> list | None: + def observation_delta_indices(self) -> list | None: # type: ignore[type-arg] #TODO: No implementation raise NotImplementedError @property @abc.abstractmethod - def action_delta_indices(self) -> list | None: + def action_delta_indices(self) -> list | None: # type: ignore[type-arg] #TODO: No implementation raise NotImplementedError @property @abc.abstractmethod - def reward_delta_indices(self) -> list | None: + def reward_delta_indices(self) -> list | None: # type: ignore[type-arg] #TODO: No implementation raise NotImplementedError @abc.abstractmethod @@ -154,13 +158,13 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): pretrained_name_or_path: str | Path, *, force_download: bool = False, - resume_download: bool = None, - proxies: dict | None = None, + resume_download: bool | None = None, + proxies: dict[Any, Any] | None = None, token: str | bool | None = None, cache_dir: str | Path | None = None, local_files_only: bool = False, revision: str | None = None, - **policy_kwargs, + **policy_kwargs: Any, ) -> T: model_id = str(pretrained_name_or_path) config_file: str | None = None @@ -168,7 +172,7 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): if CONFIG_NAME in os.listdir(model_id): config_file = os.path.join(model_id, CONFIG_NAME) else: - print(f"{CONFIG_NAME} not found in {Path(model_id).resolve()}") + logger.error(f"{CONFIG_NAME} not found in {Path(model_id).resolve()}") else: try: config_file = hf_hub_download( @@ -194,6 +198,9 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): with draccus.config_type("json"): orig_config = draccus.parse(cls, config_file, args=[]) + if config_file is None: + raise FileNotFoundError(f"{CONFIG_NAME} not found in {model_id}") + with open(config_file) as f: config = json.load(f) diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index 60a4d81d5..2f3a65dbc 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -16,6 +16,7 @@ import datetime as dt import os from dataclasses import dataclass, field from pathlib import Path +from typing import Any import draccus from huggingface_hub import hf_hub_download @@ -63,18 +64,16 @@ class TrainPipelineConfig(HubMixin): scheduler: LRSchedulerConfig | None = None eval: EvalConfig = field(default_factory=EvalConfig) wandb: WandBConfig = field(default_factory=WandBConfig) + checkpoint_path: Path | None = field(init=False, default=None) - def __post_init__(self): - self.checkpoint_path = None - - def validate(self): + def validate(self) -> None: # HACK: We parse again the cli args here to get the pretrained paths if there was some. policy_path = parser.get_path_arg("policy") if policy_path: # Only load the policy config cli_overrides = parser.get_cli_overrides("policy") self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides) - self.policy.pretrained_path = policy_path + self.policy.pretrained_path = Path(policy_path) elif self.resume: # The entire train config is already loaded, we just need to get the checkpoint dir config_path = parser.parse_arg("config_path") @@ -82,14 +81,22 @@ class TrainPipelineConfig(HubMixin): raise ValueError( f"A config_path is expected when resuming a run. Please specify path to {TRAIN_CONFIG_NAME}" ) + if not Path(config_path).resolve().exists(): raise NotADirectoryError( f"{config_path=} is expected to be a local path. " "Resuming from the hub is not supported for now." ) - policy_path = Path(config_path).parent - self.policy.pretrained_path = policy_path - self.checkpoint_path = policy_path.parent + + policy_dir = Path(config_path).parent + if self.policy is not None: + self.policy.pretrained_path = policy_dir + self.checkpoint_path = policy_dir.parent + + if self.policy is None: + raise ValueError( + "Policy is not configured. Please specify a pretrained policy with `--policy.path`." + ) if not self.job_name: if self.env is None: @@ -126,8 +133,8 @@ class TrainPipelineConfig(HubMixin): """This enables the parser to load config from the policy using `--policy.path=local/dir`""" return ["policy"] - def to_dict(self) -> dict: - return draccus.encode(self) + def to_dict(self) -> dict[str, Any]: + return draccus.encode(self) # type: ignore[no-any-return] # because of the third-party library draccus uses Any as the return type def _save_pretrained(self, save_directory: Path) -> None: with open(save_directory / TRAIN_CONFIG_NAME, "w") as f, draccus.config_type("json"): @@ -139,13 +146,13 @@ class TrainPipelineConfig(HubMixin): pretrained_name_or_path: str | Path, *, force_download: bool = False, - resume_download: bool = None, - proxies: dict | None = None, + resume_download: bool | None = None, + proxies: dict[Any, Any] | None = None, token: str | bool | None = None, cache_dir: str | Path | None = None, local_files_only: bool = False, revision: str | None = None, - **kwargs, + **kwargs: Any, ) -> "TrainPipelineConfig": model_id = str(pretrained_name_or_path) config_file: str | None = None @@ -181,4 +188,6 @@ class TrainPipelineConfig(HubMixin): @dataclass(kw_only=True) class TrainRLServerPipelineConfig(TrainPipelineConfig): - dataset: DatasetConfig | None = None # NOTE: In RL, we don't need an offline dataset + # NOTE: In RL, we don't need an offline dataset + # TODO: Make `TrainPipelineConfig.dataset` optional + dataset: DatasetConfig | None = None # type: ignore[assignment] # because the parent class has made it's type non-optional diff --git a/src/lerobot/configs/types.py b/src/lerobot/configs/types.py index cb578060e..11a1f8d74 100644 --- a/src/lerobot/configs/types.py +++ b/src/lerobot/configs/types.py @@ -42,4 +42,4 @@ class NormalizationMode(str, Enum): @dataclass class PolicyFeature: type: FeatureType - shape: tuple + shape: tuple[int, ...] From aa1d906802fbe2a55111dc15e31b748c86773cae Mon Sep 17 00:00:00 2001 From: hls <56255627+forgetwhatuwant@users.noreply.github.com> Date: Mon, 20 Oct 2025 20:19:21 +0800 Subject: [PATCH 27/62] Enhance OpenCVCamera with FOURCC for MJPEG support and validation (#1558) * Enhance OpenCVCamera with FOURCC support and validation - Added FOURCC configuration option to OpenCVCamera and OpenCVCameraConfig for specifying video format. - Implemented _validate_fourcc method to validate and set the camera's FOURCC code. - Updated _configure_capture_settings to apply FOURCC settings before FPS and resolution. - Enhanced camera detection to include default FOURCC code in camera info. - Updated documentation to reflect new FOURCC parameter and its implications on performance. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add tests for FOURCC configuration in OpenCVCamera - Implemented tests to validate FOURCC configuration and its application in OpenCVCamera. - Added checks for valid FOURCC codes and ensured that invalid codes raise appropriate errors. - Included a test for camera connection functionality using specified FOURCC settings. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix circular import in __init__.py - change to relative import * Update src/lerobot/cameras/opencv/configuration_opencv.py Co-authored-by: Steven Palma Signed-off-by: hls <56255627+forgetwhatuwant@users.noreply.github.com> * Update src/lerobot/cameras/opencv/configuration_opencv.py Co-authored-by: Steven Palma Signed-off-by: hls <56255627+forgetwhatuwant@users.noreply.github.com> * fix(camera_opencv): ensure MSMF hardware transform compatibility on Windows before importing OpenCV * This change reverts the import from a relative import (.) back to the absolute import (lerobot.) as it was previously * opencv/config: satisfy Ruff SIM102 by merging nested if for fourcc validation * style(opencv/config): apply ruff-format changes --------- Signed-off-by: hls <56255627+forgetwhatuwant@users.noreply.github.com> Signed-off-by: Steven Palma Co-authored-by: forgetwhatuwant Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steven Palma --- src/lerobot/cameras/opencv/camera_opencv.py | 31 +++++++++++++- .../cameras/opencv/configuration_opencv.py | 14 ++++++- tests/cameras/test_opencv.py | 40 +++++++++++++++++++ 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py index 9e278dfdb..708340ab8 100644 --- a/src/lerobot/cameras/opencv/camera_opencv.py +++ b/src/lerobot/cameras/opencv/camera_opencv.py @@ -181,12 +181,14 @@ class OpenCVCamera(Camera): def _configure_capture_settings(self) -> None: """ - Applies the specified FPS, width, and height settings to the connected camera. + Applies the specified FOURCC, FPS, width, and height settings to the connected camera. This method attempts to set the camera properties via OpenCV. It checks if the camera successfully applied the settings and raises an error if not. + FOURCC is set first (if specified) as it can affect the available FPS and resolution options. Args: + fourcc: The desired FOURCC code (e.g., "MJPG", "YUYV"). If None, auto-detect. fps: The desired frames per second. If None, the setting is skipped. width: The desired capture width. If None, the setting is skipped. height: The desired capture height. If None, the setting is skipped. @@ -200,6 +202,9 @@ class OpenCVCamera(Camera): if not self.is_connected: raise DeviceNotConnectedError(f"Cannot configure settings for {self} as it is not connected.") + # Set FOURCC first (if specified) as it can affect available FPS/resolution options + if self.config.fourcc is not None: + self._validate_fourcc() if self.videocapture is None: raise DeviceNotConnectedError(f"{self} videocapture is not initialized") @@ -235,6 +240,23 @@ class OpenCVCamera(Camera): if not success or not math.isclose(self.fps, actual_fps, rel_tol=1e-3): raise RuntimeError(f"{self} failed to set fps={self.fps} ({actual_fps=}).") + def _validate_fourcc(self) -> None: + """Validates and sets the camera's FOURCC code.""" + + fourcc_code = cv2.VideoWriter_fourcc(*self.config.fourcc) + success = self.videocapture.set(cv2.CAP_PROP_FOURCC, fourcc_code) + actual_fourcc_code = self.videocapture.get(cv2.CAP_PROP_FOURCC) + + # Convert actual FOURCC code back to string for comparison + actual_fourcc_code_int = int(actual_fourcc_code) + actual_fourcc = "".join([chr((actual_fourcc_code_int >> 8 * i) & 0xFF) for i in range(4)]) + + if not success or actual_fourcc != self.config.fourcc: + logger.warning( + f"{self} failed to set fourcc={self.config.fourcc} (actual={actual_fourcc}, success={success}). " + f"Continuing with default format." + ) + def _validate_width_and_height(self) -> None: """Validates and sets the camera's frame capture width and height.""" @@ -287,6 +309,12 @@ class OpenCVCamera(Camera): default_height = int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT)) default_fps = camera.get(cv2.CAP_PROP_FPS) default_format = camera.get(cv2.CAP_PROP_FORMAT) + + # Get FOURCC code and convert to string + default_fourcc_code = camera.get(cv2.CAP_PROP_FOURCC) + default_fourcc_code_int = int(default_fourcc_code) + default_fourcc = "".join([chr((default_fourcc_code_int >> 8 * i) & 0xFF) for i in range(4)]) + camera_info = { "name": f"OpenCV Camera @ {target}", "type": "OpenCV", @@ -294,6 +322,7 @@ class OpenCVCamera(Camera): "backend_api": camera.getBackendName(), "default_stream_profile": { "format": default_format, + "fourcc": default_fourcc, "width": default_width, "height": default_height, "fps": default_fps, diff --git a/src/lerobot/cameras/opencv/configuration_opencv.py b/src/lerobot/cameras/opencv/configuration_opencv.py index b66fb31bc..37a42861c 100644 --- a/src/lerobot/cameras/opencv/configuration_opencv.py +++ b/src/lerobot/cameras/opencv/configuration_opencv.py @@ -35,8 +35,9 @@ class OpenCVCameraConfig(CameraConfig): OpenCVCameraConfig(0, 30, 1280, 720) # 1280x720 @ 30FPS OpenCVCameraConfig(/dev/video4, 60, 640, 480) # 640x480 @ 60FPS - # Advanced configurations - OpenCVCameraConfig(128422271347, 30, 640, 480, rotation=Cv2Rotation.ROTATE_90) # With 90° rotation + # Advanced configurations with FOURCC format + OpenCVCameraConfig(128422271347, 30, 640, 480, rotation=Cv2Rotation.ROTATE_90, fourcc="MJPG") # With 90° rotation and MJPG format + OpenCVCameraConfig(0, 30, 1280, 720, fourcc="YUYV") # With YUYV format ``` Attributes: @@ -48,15 +49,19 @@ class OpenCVCameraConfig(CameraConfig): color_mode: Color mode for image output (RGB or BGR). Defaults to RGB. rotation: Image rotation setting (0°, 90°, 180°, or 270°). Defaults to no rotation. warmup_s: Time reading frames before returning from connect (in seconds) + fourcc: FOURCC code for video format (e.g., "MJPG", "YUYV", "I420"). Defaults to None (auto-detect). Note: - Only 3-channel color output (RGB/BGR) is currently supported. + - FOURCC codes must be 4-character strings (e.g., "MJPG", "YUYV"). Some common FOUCC codes: https://learn.microsoft.com/en-us/windows/win32/medfound/video-fourccs#fourcc-constants + - Setting FOURCC can help achieve higher frame rates on some cameras. """ index_or_path: int | Path color_mode: ColorMode = ColorMode.RGB rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION warmup_s: int = 1 + fourcc: str | None = None def __post_init__(self) -> None: if self.color_mode not in (ColorMode.RGB, ColorMode.BGR): @@ -73,3 +78,8 @@ class OpenCVCameraConfig(CameraConfig): raise ValueError( f"`rotation` is expected to be in {(Cv2Rotation.NO_ROTATION, Cv2Rotation.ROTATE_90, Cv2Rotation.ROTATE_180, Cv2Rotation.ROTATE_270)}, but {self.rotation} is provided." ) + + if self.fourcc is not None and (not isinstance(self.fourcc, str) or len(self.fourcc) != 4): + raise ValueError( + f"`fourcc` must be a 4-character string (e.g., 'MJPG', 'YUYV'), but '{self.fourcc}' is provided." + ) diff --git a/tests/cameras/test_opencv.py b/tests/cameras/test_opencv.py index a3d98a679..3cf3793b6 100644 --- a/tests/cameras/test_opencv.py +++ b/tests/cameras/test_opencv.py @@ -155,6 +155,46 @@ def test_async_read_before_connect(): _ = camera.async_read() +def test_fourcc_configuration(): + """Test FourCC configuration validation and application.""" + + # Test MJPG specifically (main use case) + config = OpenCVCameraConfig(index_or_path=DEFAULT_PNG_FILE_PATH, fourcc="MJPG") + camera = OpenCVCamera(config) + assert camera.config.fourcc == "MJPG" + + # Test a few other common formats + valid_fourcc_codes = ["YUYV", "YUY2", "RGB3"] + + for fourcc in valid_fourcc_codes: + config = OpenCVCameraConfig(index_or_path=DEFAULT_PNG_FILE_PATH, fourcc=fourcc) + camera = OpenCVCamera(config) + assert camera.config.fourcc == fourcc + + # Test invalid FOURCC codes + invalid_fourcc_codes = ["ABC", "ABCDE", ""] + + for fourcc in invalid_fourcc_codes: + with pytest.raises(ValueError): + OpenCVCameraConfig(index_or_path=DEFAULT_PNG_FILE_PATH, fourcc=fourcc) + + +def test_fourcc_with_camera(): + """Test FourCC functionality with actual camera connection.""" + config = OpenCVCameraConfig(index_or_path=DEFAULT_PNG_FILE_PATH, fourcc="MJPG") + camera = OpenCVCamera(config) + + # Connect should work with MJPG specified + camera.connect(warmup=False) + assert camera.is_connected + + # Read should work normally + img = camera.read() + assert isinstance(img, np.ndarray) + + camera.disconnect() + + @pytest.mark.parametrize("index_or_path", TEST_IMAGE_PATHS, ids=TEST_IMAGE_SIZES) @pytest.mark.parametrize( "rotation", From 9db62138952b6a10e586c56dfffabd1ed407de1b Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Mon, 20 Oct 2025 16:25:03 +0200 Subject: [PATCH 28/62] chore(style): update mypy config (#2257) * chore(style): update mypy config * fix(cameras): mypy check --- pyproject.toml | 11 +++-------- src/lerobot/cameras/opencv/camera_opencv.py | 4 ++++ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 36746e79c..544584867 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -298,11 +298,6 @@ disallow_untyped_defs = true disallow_incomplete_defs = true check_untyped_defs = true -# ignores -disable_error_code = ["attr-defined"] #TODO: draccus issue - -# include = "src/lerobot/configs/**/*.py" - # [[tool.mypy.overrides]] # module = "lerobot.optim.*" # ignore_errors = false @@ -319,9 +314,9 @@ ignore_errors = false # module = "lerobot.datasets.*" # ignore_errors = false -# [[tool.mypy.overrides]] -# module = "lerobot.cameras.*" -# ignore_errors = false +[[tool.mypy.overrides]] +module = "lerobot.cameras.*" +ignore_errors = false # [[tool.mypy.overrides]] # module = "lerobot.motors.*" diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py index 708340ab8..89d5995f4 100644 --- a/src/lerobot/cameras/opencv/camera_opencv.py +++ b/src/lerobot/cameras/opencv/camera_opencv.py @@ -244,6 +244,10 @@ class OpenCVCamera(Camera): """Validates and sets the camera's FOURCC code.""" fourcc_code = cv2.VideoWriter_fourcc(*self.config.fourcc) + + if self.videocapture is None: + raise DeviceNotConnectedError(f"{self} videocapture is not initialized") + success = self.videocapture.set(cv2.CAP_PROP_FOURCC, fourcc_code) actual_fourcc_code = self.videocapture.get(cv2.CAP_PROP_FOURCC) From 502fdc06302d27c0f108f3280ae4e90769304a7a Mon Sep 17 00:00:00 2001 From: Antoine Date: Mon, 20 Oct 2025 18:45:09 +0200 Subject: [PATCH 29/62] fix dataset revision (#2260) Co-authored-by: Steven Palma --- src/lerobot/datasets/lerobot_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index ec419d557..9bbe07a58 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -708,7 +708,8 @@ class LeRobotDataset(torch.utils.data.Dataset): if not self._check_cached_episodes_sufficient(): raise FileNotFoundError("Cached dataset doesn't contain all requested episodes") except (AssertionError, FileNotFoundError, NotADirectoryError): - self.revision = get_safe_version(self.repo_id, self.revision) + if is_valid_version(self.revision): + self.revision = get_safe_version(self.repo_id, self.revision) self.download(download_videos) self.hf_dataset = self.load_hf_dataset() From 5f6f476f32c3e4ccfc5b2bda0e7f328c796458aa Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Mon, 20 Oct 2025 23:29:05 +0200 Subject: [PATCH 30/62] fix: support cuda:0, cuda:1 in string selection (#2256) * fix * update func 2 * update nightly * fix quality * ignore test_dynamixel --- .github/workflows/nightly.yml | 2 +- src/lerobot/configs/policies.py | 2 +- src/lerobot/utils/utils.py | 38 ++++++++++++++++----------------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index f9fa02597..4904ed155 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -189,5 +189,5 @@ jobs: python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'Number of GPUs: {torch.cuda.device_count()}')" - name: Run multi-GPU training tests - run: pytest tests/training/test_multi_gpu.py -vv --maxfail=3 + run: pytest tests -vv --maxfail=10 --ignore=tests/motors/test_dynamixel.py timeout-minutes: 10 diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py index b1cc19a4e..0ecfa169b 100644 --- a/src/lerobot/configs/policies.py +++ b/src/lerobot/configs/policies.py @@ -58,7 +58,7 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # type: igno input_features: dict[str, PolicyFeature] = field(default_factory=dict) output_features: dict[str, PolicyFeature] = field(default_factory=dict) - device: str | None = None # cuda | cpu | mp + device: str | None = None # e.g. "cuda", "cuda:0", "cpu", or "mps" # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP, # automatic gradient scaling is used. use_amp: bool = False diff --git a/src/lerobot/utils/utils.py b/src/lerobot/utils/utils.py index 46c15d925..c7ad2bbdb 100644 --- a/src/lerobot/utils/utils.py +++ b/src/lerobot/utils/utils.py @@ -57,25 +57,23 @@ def auto_select_torch_device() -> torch.device: def get_safe_torch_device(try_device: str, log: bool = False) -> torch.device: """Given a string, return a torch.device with checks on whether the device is available.""" try_device = str(try_device) - match try_device: - case "cuda": - assert torch.cuda.is_available() - device = torch.device("cuda") - case "mps": - assert torch.backends.mps.is_available() - device = torch.device("mps") - case "xpu": - assert torch.xpu.is_available() - device = torch.device("xpu") - case "cpu": - device = torch.device("cpu") - if log: - logging.warning("Using CPU, this will be slow.") - case _: - device = torch.device(try_device) - if log: - logging.warning(f"Using custom {try_device} device.") - + if try_device.startswith("cuda"): + assert torch.cuda.is_available() + device = torch.device(try_device) + elif try_device == "mps": + assert torch.backends.mps.is_available() + device = torch.device("mps") + elif try_device == "xpu": + assert torch.xpu.is_available() + device = torch.device("xpu") + elif try_device == "cpu": + device = torch.device("cpu") + if log: + logging.warning("Using CPU, this will be slow.") + else: + device = torch.device(try_device) + if log: + logging.warning(f"Using custom {try_device} device.") return device @@ -108,7 +106,7 @@ def get_safe_dtype(dtype: torch.dtype, device: str | torch.device): def is_torch_device_available(try_device: str) -> bool: try_device = str(try_device) # Ensure try_device is a string - if try_device == "cuda": + if try_device.startswith("cuda"): return torch.cuda.is_available() elif try_device == "mps": return torch.backends.mps.is_available() From b954337ac7c8db5ea592c0d59dfb435845d9d380 Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Mon, 20 Oct 2025 23:34:24 +0200 Subject: [PATCH 31/62] fix(scripts): add missing observation overwrite in eval and async (#2265) --- src/lerobot/async_inference/helpers.py | 3 ++- src/lerobot/async_inference/policy_server.py | 5 ++++- src/lerobot/configs/eval.py | 2 ++ src/lerobot/scripts/lerobot_eval.py | 5 ++++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/lerobot/async_inference/helpers.py b/src/lerobot/async_inference/helpers.py index f73cbc1da..2158f51ac 100644 --- a/src/lerobot/async_inference/helpers.py +++ b/src/lerobot/async_inference/helpers.py @@ -16,7 +16,7 @@ import logging import logging.handlers import os import time -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path import torch @@ -268,6 +268,7 @@ class RemotePolicyConfig: lerobot_features: dict[str, PolicyFeature] actions_per_chunk: int device: str = "cpu" + rename_map: dict[str, str] = field(default_factory=dict) def _compare_observation_states(obs1_state: torch.Tensor, obs2_state: torch.Tensor, atol: float) -> bool: diff --git a/src/lerobot/async_inference/policy_server.py b/src/lerobot/async_inference/policy_server.py index f7e00dea4..ab2e6bcd8 100644 --- a/src/lerobot/async_inference/policy_server.py +++ b/src/lerobot/async_inference/policy_server.py @@ -159,7 +159,10 @@ class PolicyServer(services_pb2_grpc.AsyncInferenceServicer): self.preprocessor, self.postprocessor = make_pre_post_processors( self.policy.config, pretrained_path=policy_specs.pretrained_name_or_path, - preprocessor_overrides={"device_processor": device_override}, + preprocessor_overrides={ + "device_processor": device_override, + "rename_observations_processor": {"rename_map": policy_specs.rename_map}, + }, postprocessor_overrides={"device_processor": device_override}, ) diff --git a/src/lerobot/configs/eval.py b/src/lerobot/configs/eval.py index e9e05a7e8..2f085da56 100644 --- a/src/lerobot/configs/eval.py +++ b/src/lerobot/configs/eval.py @@ -36,6 +36,8 @@ class EvalPipelineConfig: output_dir: Path | None = None job_name: str | None = None seed: int | None = 1000 + # Rename map for the observation to override the image and state keys + rename_map: dict[str, str] = field(default_factory=dict) def __post_init__(self) -> None: # HACK: We parse again the cli args here to get the pretrained path if there was one. diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index 0dec18be6..8796e897e 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -508,7 +508,10 @@ def eval_main(cfg: EvalPipelineConfig): policy_cfg=cfg.policy, pretrained_path=cfg.policy.pretrained_path, # The inference device is automatically set to match the detected hardware, overriding any previous device settings from training to ensure compatibility. - preprocessor_overrides={"device_processor": {"device": str(policy.config.device)}}, + preprocessor_overrides={ + "device_processor": {"device": str(policy.config.device)}, + "rename_observations_processor": {"rename_map": cfg.rename_map}, + }, ) with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): info = eval_policy_all( From 92b479f9ac550f5785a53237efe8d6c2c03ebc81 Mon Sep 17 00:00:00 2001 From: Xiaoxuan Liu Date: Tue, 21 Oct 2025 17:31:03 +0800 Subject: [PATCH 32/62] Fix camera FPS set issue (#2275) Set camera width/height 1st before FPS setting, to avoid FPS set failure alike: ERROR:__main__:Failed to connect or configure OpenCV camera /dev/video2: OpenCVCamera(/dev/video2) failed to set fps=30 (actual_fps=25.0). --- src/lerobot/cameras/opencv/camera_opencv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py index 89d5995f4..28c978d1c 100644 --- a/src/lerobot/cameras/opencv/camera_opencv.py +++ b/src/lerobot/cameras/opencv/camera_opencv.py @@ -208,11 +208,6 @@ class OpenCVCamera(Camera): if self.videocapture is None: raise DeviceNotConnectedError(f"{self} videocapture is not initialized") - if self.fps is None: - self.fps = self.videocapture.get(cv2.CAP_PROP_FPS) - else: - self._validate_fps() - default_width = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_WIDTH))) default_height = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_HEIGHT))) @@ -225,6 +220,11 @@ class OpenCVCamera(Camera): else: self._validate_width_and_height() + if self.fps is None: + self.fps = self.videocapture.get(cv2.CAP_PROP_FPS) + else: + self._validate_fps() + def _validate_fps(self) -> None: """Validates and sets the camera's frames per second (FPS).""" From 503fc4e9f41de5285e230a22c15396d742dfb82e Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Tue, 21 Oct 2025 12:14:26 +0200 Subject: [PATCH 33/62] fix(ci): exclude motor tests in multi-gpu setup (#2276) --- .github/workflows/nightly.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 4904ed155..be8b5c094 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -189,5 +189,6 @@ jobs: python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'Number of GPUs: {torch.cuda.device_count()}')" - name: Run multi-GPU training tests - run: pytest tests -vv --maxfail=10 --ignore=tests/motors/test_dynamixel.py + # TODO(Steven): Investigate why motors tests are failing in multi-GPU setup + run: pytest tests -vv --maxfail=10 --ignore=tests/motors/ timeout-minutes: 10 From abe9e7982502ef19fdc566070dd59431a3b08ac7 Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Tue, 21 Oct 2025 12:56:32 +0200 Subject: [PATCH 34/62] chore(dependencies): bump & ceil gymnasium version + pin metaworld version + bump gym-hil (#2267) * chore(dependencies): bump & ceil gymnasium version + pin metaworld version Co-authored-by: Jade Choghari * chore(dependencies): bump gym-hil to be compatible --------- Co-authored-by: Jade Choghari --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 544584867..1c71acec0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ dependencies = [ "torchvision>=0.21.0,<0.23.0", # TODO: Bumb dependency "draccus==0.10.0", # TODO: Remove == - "gymnasium>=1.0.0", + "gymnasium>=1.1.1,<2.0.0", "rerun-sdk>=0.24.0,<0.27.0", # Support dependencies @@ -118,7 +118,7 @@ phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0"] # Policies pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi"] smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"] -hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.11,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] +hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] # Features async = ["lerobot[grpcio-dep]", "matplotlib>=3.10.3,<4.0.0"] @@ -132,7 +132,7 @@ video_benchmark = ["scikit-image>=0.23.2,<0.26.0", "pandas>=2.2.2,<2.4.0"] aloha = ["gym-aloha>=0.1.2,<0.2.0"] pusht = ["gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead libero = ["lerobot[transformers-dep]", "libero @ git+https://github.com/huggingface/lerobot-libero.git@main#egg=libero"] -metaworld = ["metaworld>=3.0.0"] +metaworld = ["metaworld==3.0.0"] # All all = [ From 63cd2111ade80173e40094ccff3d868937831359 Mon Sep 17 00:00:00 2001 From: Hakjin Lee Date: Tue, 21 Oct 2025 21:26:31 +0900 Subject: [PATCH 35/62] [Fix] Device Error on SmolVLA Multi-GPU Training (#2270) Co-authored-by: Steven Palma --- src/lerobot/policies/smolvla/modeling_smolvla.py | 1 + src/lerobot/policies/smolvla/smolvlm_with_expert.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 23fc3ca4f..6e54d3ea5 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -485,6 +485,7 @@ class VLAFlowMatching(nn.Module): num_vlm_layers=self.config.num_vlm_layers, self_attn_every_n_layers=self.config.self_attn_every_n_layers, expert_width_multiplier=self.config.expert_width_multiplier, + device=self.config.device, ) self.state_proj = nn.Linear( self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py index f3d1a693a..555c40773 100644 --- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py @@ -70,13 +70,14 @@ class SmolVLMWithExpertModel(nn.Module): num_vlm_layers: int = -1, self_attn_every_n_layers: int = -1, expert_width_multiplier: float = 0.5, + device: str = "auto", ): super().__init__() if load_vlm_weights: print(f"Loading {model_id} weights ...") self.vlm = AutoModelForImageTextToText.from_pretrained( model_id, - device_map="auto", + device_map=device, torch_dtype="bfloat16", low_cpu_mem_usage=True, ) From a024d337509e2bfd5e408982cec72d5a885aca82 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Tue, 21 Oct 2025 16:00:46 +0200 Subject: [PATCH 36/62] fix(bug): Fix policy renaming ValueError during training (#2278) * fixes * style * Update src/lerobot/policies/factory.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Jade Choghari * style * add review fixes --------- Signed-off-by: Jade Choghari Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/lerobot/configs/train.py | 2 ++ src/lerobot/policies/factory.py | 20 ++++++++++++++++++++ src/lerobot/scripts/lerobot_eval.py | 1 + src/lerobot/scripts/lerobot_train.py | 4 ++++ 4 files changed, 27 insertions(+) diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index 2f3a65dbc..d17915c36 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -65,6 +65,8 @@ class TrainPipelineConfig(HubMixin): eval: EvalConfig = field(default_factory=EvalConfig) wandb: WandBConfig = field(default_factory=WandBConfig) checkpoint_path: Path | None = field(init=False, default=None) + # Rename map for the observation to override the image and state keys + rename_map: dict[str, str] = field(default_factory=dict) def validate(self) -> None: # HACK: We parse again the cli args here to get the pretrained paths if there was some. diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 9c67e317a..6e524f2ab 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -303,6 +303,7 @@ def make_policy( cfg: PreTrainedConfig, ds_meta: LeRobotDatasetMetadata | None = None, env_cfg: EnvConfig | None = None, + rename_map: dict[str, str] | None = None, ) -> PreTrainedPolicy: """ Instantiate a policy model. @@ -319,6 +320,8 @@ def make_policy( statistics for normalization layers. env_cfg: Environment configuration used to infer feature shapes and types. One of `ds_meta` or `env_cfg` must be provided. + rename_map: Optional mapping of dataset or environment feature keys to match + expected policy feature names (e.g., `"left"` → `"camera1"`). Returns: An instantiated and device-placed policy model. @@ -380,4 +383,21 @@ def make_policy( # policy = torch.compile(policy, mode="reduce-overhead") + if not rename_map: + expected_features = set(cfg.input_features.keys()) | set(cfg.output_features.keys()) + provided_features = set(features.keys()) + if expected_features and provided_features != expected_features: + missing = expected_features - provided_features + extra = provided_features - expected_features + # TODO (jadechoghari): provide a dynamic rename map suggestion to the user. + raise ValueError( + f"Feature mismatch between dataset/environment and policy config.\n" + f"- Missing features: {sorted(missing) if missing else 'None'}\n" + f"- Extra features: {sorted(extra) if extra else 'None'}\n\n" + f"Please ensure your dataset and policy use consistent feature names.\n" + f"If your dataset uses different observation keys (e.g., cameras named differently), " + f"use the `--rename_map` argument, for example:\n" + f' --rename_map=\'{{"observation.images.left": "observation.images.camera1", ' + f'"observation.images.top": "observation.images.camera2"}}\'' + ) return policy diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index 8796e897e..754fd15fe 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -501,6 +501,7 @@ def eval_main(cfg: EvalPipelineConfig): policy = make_policy( cfg=cfg.policy, env_cfg=cfg.env, + rename_map=cfg.rename_map, ) policy.eval() diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index 84eb81ad4..0cc6e037f 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -203,6 +203,7 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): policy = make_policy( cfg=cfg.policy, ds_meta=dataset.meta, + rename_map=cfg.rename_map, ) # Wait for all processes to finish policy creation before continuing @@ -224,6 +225,9 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): "norm_map": policy.config.normalization_mapping, }, } + processor_kwargs["preprocessor_overrides"]["rename_observations_processor"] = { + "rename_map": cfg.rename_map + } postprocessor_kwargs["postprocessor_overrides"] = { "unnormalizer_processor": { "stats": dataset.meta.stats, From 12f2f3576096cf5b3b0b8fe5d76906d7a1d7c926 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Tue, 21 Oct 2025 16:17:12 +0200 Subject: [PATCH 37/62] - Introduce _current_file_start_frame for better tracking of the number of frames in each parquet file (#2280) - Added testing for that section in `test_datasets.py` --- src/lerobot/datasets/lerobot_dataset.py | 7 +- tests/datasets/test_datasets.py | 93 +++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 9bbe07a58..6258312a6 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -686,6 +686,7 @@ class LeRobotDataset(torch.utils.data.Dataset): self.episode_buffer = None self.writer = None self.latest_episode = None + self._current_file_start_frame = None # Track the starting frame index of the current parquet file self.root.mkdir(exist_ok=True, parents=True) @@ -1232,6 +1233,7 @@ class LeRobotDataset(torch.utils.data.Dataset): # Initialize indices and frame count for a new dataset made of the first episode data chunk_idx, file_idx = 0, 0 global_frame_index = 0 + self._current_file_start_frame = 0 # However, if the episodes already exists # It means we are resuming recording, so we need to load the latest episode # Update the indices to avoid overwriting the latest episode @@ -1243,6 +1245,7 @@ class LeRobotDataset(torch.utils.data.Dataset): # When resuming, move to the next file chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self.meta.chunks_size) + self._current_file_start_frame = global_frame_index else: # Retrieve information from the latest parquet file latest_ep = self.latest_episode @@ -1253,7 +1256,7 @@ class LeRobotDataset(torch.utils.data.Dataset): latest_path = self.root / self.meta.data_path.format(chunk_index=chunk_idx, file_index=file_idx) latest_size_in_mb = get_file_size_in_mb(latest_path) - frames_in_current_file = global_frame_index - latest_ep["dataset_from_index"] + frames_in_current_file = global_frame_index - self._current_file_start_frame av_size_per_frame = ( latest_size_in_mb / frames_in_current_file if frames_in_current_file > 0 else 0 ) @@ -1267,6 +1270,7 @@ class LeRobotDataset(torch.utils.data.Dataset): chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self.meta.chunks_size) self._close_writer() self._writer_closed_for_reading = False + self._current_file_start_frame = global_frame_index ep_dict["data/chunk_index"] = chunk_idx ep_dict["data/file_index"] = file_idx @@ -1473,6 +1477,7 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.video_backend = video_backend if video_backend is not None else get_safe_default_codec() obj.writer = None obj.latest_episode = None + obj._current_file_start_frame = None # Initialize tracking for incremental recording obj._lazy_loading = False obj._recorded_frames = 0 diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index e174c5789..38fdc358d 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -1199,3 +1199,96 @@ def test_dataset_resume_recording(tmp_path, empty_lerobot_dataset_factory): expected_to = (ep_idx + 1) * frames_per_episode assert ep_metadata["dataset_from_index"] == expected_from assert ep_metadata["dataset_to_index"] == expected_to + + +def test_frames_in_current_file_calculation(tmp_path, empty_lerobot_dataset_factory): + """Regression test for bug where frames_in_current_file only counted frames from last episode instead of all frames in current file.""" + features = { + "observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}, + "action": {"dtype": "float32", "shape": (2,), "names": ["vx", "vy"]}, + } + + dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features, use_videos=False) + dataset.meta.update_chunk_settings(data_files_size_in_mb=100) + + assert dataset._current_file_start_frame is None + + frames_per_episode = 10 + for _ in range(frames_per_episode): + dataset.add_frame( + { + "observation.state": torch.randn(2), + "action": torch.randn(2), + "task": "task_0", + } + ) + dataset.save_episode() + + assert dataset._current_file_start_frame == 0 + assert dataset.meta.total_episodes == 1 + assert dataset.meta.total_frames == frames_per_episode + + for _ in range(frames_per_episode): + dataset.add_frame( + { + "observation.state": torch.randn(2), + "action": torch.randn(2), + "task": "task_1", + } + ) + dataset.save_episode() + + assert dataset._current_file_start_frame == 0 + assert dataset.meta.total_episodes == 2 + assert dataset.meta.total_frames == 2 * frames_per_episode + + ep1_chunk = dataset.latest_episode["data/chunk_index"] + ep1_file = dataset.latest_episode["data/file_index"] + assert ep1_chunk == 0 + assert ep1_file == 0 + + for _ in range(frames_per_episode): + dataset.add_frame( + { + "observation.state": torch.randn(2), + "action": torch.randn(2), + "task": "task_2", + } + ) + dataset.save_episode() + + assert dataset._current_file_start_frame == 0 + assert dataset.meta.total_episodes == 3 + assert dataset.meta.total_frames == 3 * frames_per_episode + + ep2_chunk = dataset.latest_episode["data/chunk_index"] + ep2_file = dataset.latest_episode["data/file_index"] + assert ep2_chunk == 0 + assert ep2_file == 0 + + dataset.finalize() + + from lerobot.datasets.utils import load_episodes + + dataset.meta.episodes = load_episodes(dataset.root) + assert dataset.meta.episodes is not None + + for ep_idx in range(3): + ep_metadata = dataset.meta.episodes[ep_idx] + assert ep_metadata["data/chunk_index"] == 0 + assert ep_metadata["data/file_index"] == 0 + + expected_from = ep_idx * frames_per_episode + expected_to = (ep_idx + 1) * frames_per_episode + assert ep_metadata["dataset_from_index"] == expected_from + assert ep_metadata["dataset_to_index"] == expected_to + + loaded_dataset = LeRobotDataset(dataset.repo_id, root=dataset.root) + assert len(loaded_dataset) == 3 * frames_per_episode + assert loaded_dataset.meta.total_episodes == 3 + assert loaded_dataset.meta.total_frames == 3 * frames_per_episode + + for idx in range(len(loaded_dataset)): + frame = loaded_dataset[idx] + expected_ep = idx // frames_per_episode + assert frame["episode_index"].item() == expected_ep From 306429a85b9cc79d8a86278d8b5bc024182c3cde Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Wed, 22 Oct 2025 17:27:31 +0200 Subject: [PATCH 38/62] fix(cameras): opencv camera index casting (#2286) --- src/lerobot/cameras/opencv/camera_opencv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py index 28c978d1c..b1043ba64 100644 --- a/src/lerobot/cameras/opencv/camera_opencv.py +++ b/src/lerobot/cameras/opencv/camera_opencv.py @@ -300,11 +300,12 @@ class OpenCVCamera(Camera): """ found_cameras_info = [] + targets_to_scan: list[str | int] if platform.system() == "Linux": possible_paths = sorted(Path("/dev").glob("video*"), key=lambda p: p.name) targets_to_scan = [str(p) for p in possible_paths] else: - targets_to_scan = [str(i) for i in range(MAX_OPENCV_INDEX)] + targets_to_scan = [int(i) for i in range(MAX_OPENCV_INDEX)] for target in targets_to_scan: camera = cv2.VideoCapture(target) From be46bdea8fc9d6ef720bed7e00c191feae9ca34b Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Thu, 23 Oct 2025 13:50:30 +0200 Subject: [PATCH 39/62] feat(policies): add Nvidia Gr00t N1.5 model (#2292) * feat(policies): add Nvidia Gr00t N1.5 model Co-authored-by: lbenhorin Co-authored-by: Aravindh Co-authored-by: nv-sachdevkartik Co-authored-by: youliangt Co-authored-by: Michel Aractingi Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com> Co-authored-by: Jade Choghari * fix(docs): add groot to index Co-authored-by: sachdevkartik --------- Co-authored-by: lbenhorin Co-authored-by: Aravindh Co-authored-by: nv-sachdevkartik Co-authored-by: youliangt Co-authored-by: Michel Aractingi Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com> Co-authored-by: Jade Choghari Co-authored-by: sachdevkartik --- .github/workflows/full_tests.yml | 2 +- docs/source/_toctree.yml | 2 + docs/source/groot.mdx | 122 ++++ docs/source/policy_groot_README.md | 27 + pyproject.toml | 13 + src/lerobot/policies/__init__.py | 2 + src/lerobot/policies/factory.py | 36 + src/lerobot/policies/groot/README.md | 1 + src/lerobot/policies/groot/__init__.py | 21 + .../policies/groot/action_head/__init__.py | 14 + .../groot/action_head/action_encoder.py | 54 ++ .../groot/action_head/cross_attention_dit.py | 370 ++++++++++ .../action_head/flow_matching_action_head.py | 406 +++++++++++ .../policies/groot/configuration_groot.py | 201 ++++++ .../configuration_eagle2_5_vl.py | 135 ++++ .../image_processing_eagle2_5_vl_fast.py | 504 +++++++++++++ .../eagle2_hg_model/modeling_eagle2_5_vl.py | 395 +++++++++++ .../eagle2_hg_model/processing_eagle2_5_vl.py | 518 ++++++++++++++ src/lerobot/policies/groot/groot_n1.py | 376 ++++++++++ src/lerobot/policies/groot/modeling_groot.py | 198 ++++++ src/lerobot/policies/groot/processor_groot.py | 664 ++++++++++++++++++ src/lerobot/policies/groot/utils.py | 47 ++ src/lerobot/scripts/lerobot_eval.py | 13 +- src/lerobot/utils/import_utils.py | 1 + tests/policies/groot/test_groot_lerobot.py | 207 ++++++ .../policies/groot/test_groot_vs_original.py | 443 ++++++++++++ 26 files changed, 4766 insertions(+), 6 deletions(-) create mode 100644 docs/source/groot.mdx create mode 100644 docs/source/policy_groot_README.md create mode 120000 src/lerobot/policies/groot/README.md create mode 100644 src/lerobot/policies/groot/__init__.py create mode 100644 src/lerobot/policies/groot/action_head/__init__.py create mode 100644 src/lerobot/policies/groot/action_head/action_encoder.py create mode 100755 src/lerobot/policies/groot/action_head/cross_attention_dit.py create mode 100644 src/lerobot/policies/groot/action_head/flow_matching_action_head.py create mode 100644 src/lerobot/policies/groot/configuration_groot.py create mode 100755 src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py create mode 100644 src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py create mode 100755 src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py create mode 100755 src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py create mode 100644 src/lerobot/policies/groot/groot_n1.py create mode 100644 src/lerobot/policies/groot/modeling_groot.py create mode 100644 src/lerobot/policies/groot/processor_groot.py create mode 100644 src/lerobot/policies/groot/utils.py create mode 100644 tests/policies/groot/test_groot_lerobot.py create mode 100644 tests/policies/groot/test_groot_vs_original.py diff --git a/.github/workflows/full_tests.yml b/.github/workflows/full_tests.yml index d16fe5e72..0155eec13 100644 --- a/.github/workflows/full_tests.yml +++ b/.github/workflows/full_tests.yml @@ -78,7 +78,7 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install lerobot with all extras - run: uv sync --all-extras + run: uv sync --all-extras --no-extra groot # TODO(Steven): Make flash-attn optional - name: Run pytest (all extras) run: uv run pytest tests -vv --maxfail=10 diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 5e100013a..16fdb5e78 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -37,6 +37,8 @@ title: π₀ (Pi0) - local: pi05 title: π₀.₅ (Pi05) + - local: groot + title: Nvidia Gr00t N1.5 title: "Policies" - sections: - local: il_sim diff --git a/docs/source/groot.mdx b/docs/source/groot.mdx new file mode 100644 index 000000000..07348bea4 --- /dev/null +++ b/docs/source/groot.mdx @@ -0,0 +1,122 @@ +# Gr00t N1.5 Policy + +Gr00t N1.5 is an open foundation model from NVIDIA designed for generalized humanoid robot reasoning and skills. It is a cross-embodiment model that accepts multimodal input, including language and images, to perform manipulation tasks in diverse environments. + +This document outlines the specifics of its integration and usage within the LeRobot framework. + +## Model Overview + +NVIDIA Isaac GR00T N1.5 is an upgraded version of the GR00T N1 foundation model. It is built to improve generalization and language-following abilities for humanoid robots. + +Developers and researchers can post-train GR00T N1.5 with their own real or synthetic data to adapt it for specific humanoid robots or tasks. + +GR00T N1.5 (specifically the GR00T-N1.5-3B model) is built using pre-trained vision and language encoders. It utilizes a flow matching action transformer to model a chunk of actions, conditioned on vision, language, and proprioception. + +Its strong performance comes from being trained on an expansive and diverse humanoid dataset, which includes: + +- Real captured data from robots. +- Synthetic data generated using NVIDIA Isaac GR00T Blueprint. +- Internet-scale video data. + +This approach allows the model to be highly adaptable through post-training for specific embodiments, tasks, and environments. + +## Installation Requirements + +As of today, Gr00t N1.5 requires flash attention for it's internal working. + +We are working on making this optional, but in the meantime that means that we require an extra installation step and it can only be used in CUDA enabled devices. + +1. Following the Environment Setup of our [Installation Guide](./installation). **Attention** don't install `lerobot` in this step. +2. Install [Flash Attention](https://github.com/Dao-AILab/flash-attention) by running: + +```bash +# Check https://pytorch.org/get-started/locally/ for your system +pip install "torch>=2.2.1,<2.8.0" "torchvision>=0.21.0,<0.23.0" # --index-url https://download.pytorch.org/whl/cu1XX +pip install ninja "packaging>=24.2,<26.0" # flash attention dependencies +pip install "flash-attn>=2.5.9,<3.0.0" --no-build-isolation +python -c "import flash_attn; print(f'Flash Attention {flash_attn.__version__} imported successfully')" +``` + +3. Install LeRobot by running: + +```bash +pip install lerobot[groot] # consider also installing libero,dev and test tags +``` + +## Usage + +To use Gr00t in your LeRobot configuration, specify the policy type as: + +```python +policy.type=groot +``` + +## Training + +### Training Command Example + +Here's a complete training command for finetuning the base Gr00t model on your own dataset: + +```bash +# Using a multi-GPU setup +accelerate launch \ + --multi_gpu \ + --num_processes=$NUM_GPUS \ + $(which lerobot-train) \ + --output_dir=$OUTPUT_DIR \ + --save_checkpoint=true \ + --batch_size=$BATCH_SIZE \ + --steps=$NUM_STEPS \ + --save_freq=$SAVE_FREQ \ + --log_freq=$LOG_FREQ \ + --policy.push_to_hub=true \ + --policy.type=groot \ + --policy.repo_id=$REPO_ID \ + --policy.tune_diffusion_model=false \ + --dataset.repo_id=$DATASET_ID \ + --wandb.enable=true \ + --wandb.disable_artifact=true \ + --job_name=$JOB_NAME +``` + +## Performance Results + +### Libero Benchmark Results + +Gr00t has demonstrated strong performance on the Libero benchmark suite. To compare and test its LeRobot implementation, we finetuned the Gr00t N1.5 model for 20k steps on the Libero dataset and compared the results to the Gr00t reference results. + +| Benchmark | LeRobot Implementation | Gr00t Reference | +| ------------------ | ---------------------- | --------------- | +| **Libero Spatial** | 82% | 92.0% | +| **Libero Object** | 99% | 92.0% | +| **Libero Long** | 72% | 76.0% | +| **Average** | 84% | 87.0% | + +These results demonstrate Gr00t's strong generalization capabilities across diverse robotic manipulation tasks. To reproduce these results, you can follow the instructions in the [Libero](https://huggingface.co/docs/lerobot/libero) section. + +### Evaluate in your hardware setup + +Once you have trained your model using your parameters you can run inference in your downstream task. Follow our by following for the downstream hardware task, you can follow our instructions in [Imitation Learning for Robots](./il_robots). For example: + +```bash +lerobot-record \ + --robot.type=bi_so100_follower \ + --robot.left_arm_port=/dev/ttyACM1 \ + --robot.right_arm_port=/dev/ttyACM0 \ + --robot.id=bimanual_follower \ + --robot.cameras='{ right: {"type": "opencv", "index_or_path": 0, "width": 640, "height": 480, "fps": 30}, + left: {"type": "opencv", "index_or_path": 2, "width": 640, "height": 480, "fps": 30}, + top: {"type": "opencv", "index_or_path": 4, "width": 640, "height": 480, "fps": 30}, + }' \ + --display_data=true \ + --dataset.repo_id=/eval_groot-bimanual \ + --dataset.num_episodes=10 \ + --dataset.single_task="Grab and handover the red cube to the other arm" + --policy.path=/groot-bimanual # your trained model + --dataset.episode_time_s=30 + --dataset.reset_time_s=10 +``` + +## License + +This model follows the **Apache 2.0 License**, consistent with the original [Gr00t repository](https://github.com/NVIDIA/Isaac-GR00T). diff --git a/docs/source/policy_groot_README.md b/docs/source/policy_groot_README.md new file mode 100644 index 000000000..efcd76ebe --- /dev/null +++ b/docs/source/policy_groot_README.md @@ -0,0 +1,27 @@ +## Research Paper + +Paper: https://research.nvidia.com/labs/gear/gr00t-n1_5/ + +## Repository + +Code: https://github.com/NVIDIA/Isaac-GR00T + +## Citation + +```bibtex +@inproceedings{gr00tn1_2025, + archivePrefix = {arxiv}, + eprint = {2503.14734}, + title = {{GR00T} {N1}: An Open Foundation Model for Generalist Humanoid Robots}, + author = {NVIDIA and Johan Bjorck andFernando Castañeda, Nikita Cherniadev and Xingye Da and Runyu Ding and Linxi "Jim" Fan and Yu Fang and Dieter Fox and Fengyuan Hu and Spencer Huang and Joel Jang and Zhenyu Jiang and Jan Kautz and Kaushil Kundalia and Lawrence Lao and Zhiqi Li and Zongyu Lin and Kevin Lin and Guilin Liu and Edith Llontop and Loic Magne and Ajay Mandlekar and Avnish Narayan and Soroush Nasiriany and Scott Reed and You Liang Tan and Guanzhi Wang and Zu Wang and Jing Wang and Qi Wang and Jiannan Xiang and Yuqi Xie and Yinzhen Xu and Zhenjia Xu and Seonghyeon Ye and Zhiding Yu and Ao Zhang and Hao Zhang and Yizhou Zhao and Ruijie Zheng and Yuke Zhu}, + month = {March}, + year = {2025}, + booktitle = {ArXiv Preprint}, +} +``` + +## Additional Resources + +Blog: https://developer.nvidia.com/isaac/gr00t + +Hugging Face Model: https://huggingface.co/nvidia/GR00T-N1.5-3B diff --git a/pyproject.toml b/pyproject.toml index 1c71acec0..b76593b38 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,6 +118,17 @@ phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0"] # Policies pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi"] smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"] +groot = [ + "lerobot[transformers-dep]", + "peft>=0.13.0,<1.0.0", + "dm-tree>=0.1.8,<1.0.0", + "timm>=1.0.0,<1.1.0", + "safetensors>=0.4.3,<1.0.0", + "Pillow>=10.0.0,<13.0.0", + "decord>=0.6.0,<1.0.0; (platform_machine == 'AMD64' or platform_machine == 'x86_64')", + "ninja>=1.11.1,<2.0.0", + "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'" +] hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] # Features @@ -145,6 +156,7 @@ all = [ "lerobot[intelrealsense]", "lerobot[pi]", "lerobot[smolvla]", + # "lerobot[groot]", TODO(Steven): Gr00t requires specific installation instructions for flash-attn "lerobot[hilserl]", "lerobot[async]", "lerobot[dev]", @@ -243,6 +255,7 @@ default.extend-ignore-identifiers-re = [ "pn", "ser", "ein", + "thw", "inpt", ] diff --git a/src/lerobot/policies/__init__.py b/src/lerobot/policies/__init__.py index 49f1e0f95..4cdc89ea9 100644 --- a/src/lerobot/policies/__init__.py +++ b/src/lerobot/policies/__init__.py @@ -14,6 +14,7 @@ from .act.configuration_act import ACTConfig as ACTConfig from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig +from .groot.configuration_groot import GrootConfig as GrootConfig from .pi0.configuration_pi0 import PI0Config as PI0Config from .pi05.configuration_pi05 import PI05Config as PI05Config from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig @@ -29,4 +30,5 @@ __all__ = [ "SmolVLAConfig", "TDMPCConfig", "VQBeTConfig", + "GrootConfig", ] diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 6e524f2ab..bdad5cbb3 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -30,6 +30,7 @@ from lerobot.envs.configs import EnvConfig from lerobot.envs.utils import env_to_policy_features from lerobot.policies.act.configuration_act import ACTConfig from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig +from lerobot.policies.groot.configuration_groot import GrootConfig from lerobot.policies.pi0.configuration_pi0 import PI0Config from lerobot.policies.pi05.configuration_pi05 import PI05Config from lerobot.policies.pretrained import PreTrainedPolicy @@ -101,6 +102,10 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]: from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy return SmolVLAPolicy + elif name == "groot": + from lerobot.policies.groot.modeling_groot import GrootPolicy + + return GrootPolicy else: raise NotImplementedError(f"Policy with name {name} is not implemented.") @@ -142,6 +147,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig: return SmolVLAConfig(**kwargs) elif policy_type == "reward_classifier": return RewardClassifierConfig(**kwargs) + elif policy_type == "groot": + return GrootConfig(**kwargs) else: raise ValueError(f"Policy type '{policy_type}' is not available.") @@ -199,6 +206,27 @@ def make_pre_post_processors( policy configuration type. """ if pretrained_path: + # TODO(Steven): Temporary patch, implement correctly the processors for Gr00t + if isinstance(policy_cfg, GrootConfig): + # GROOT handles normalization in groot_pack_inputs_v3 step + # Need to override both stats AND normalize_min_max since saved config might be empty + preprocessor_overrides = {} + postprocessor_overrides = {} + preprocessor_overrides["groot_pack_inputs_v3"] = { + "stats": kwargs.get("dataset_stats"), + "normalize_min_max": True, + } + + # Also ensure postprocessing slices to env action dim and unnormalizes with dataset stats + env_action_dim = policy_cfg.output_features["action"].shape[0] + postprocessor_overrides["groot_action_unpack_unnormalize_v1"] = { + "stats": kwargs.get("dataset_stats"), + "normalize_min_max": True, + "env_action_dim": env_action_dim, + } + kwargs["preprocessor_overrides"] = preprocessor_overrides + kwargs["postprocessor_overrides"] = postprocessor_overrides + return ( PolicyProcessorPipeline.from_pretrained( pretrained_model_name_or_path=pretrained_path, @@ -293,6 +321,14 @@ def make_pre_post_processors( dataset_stats=kwargs.get("dataset_stats"), ) + elif isinstance(policy_cfg, GrootConfig): + from lerobot.policies.groot.processor_groot import make_groot_pre_post_processors + + processors = make_groot_pre_post_processors( + config=policy_cfg, + dataset_stats=kwargs.get("dataset_stats"), + ) + else: raise NotImplementedError(f"Processor for policy type '{policy_cfg.type}' is not implemented.") diff --git a/src/lerobot/policies/groot/README.md b/src/lerobot/policies/groot/README.md new file mode 120000 index 000000000..ff4937f5c --- /dev/null +++ b/src/lerobot/policies/groot/README.md @@ -0,0 +1 @@ +../../../../docs/source/policy_groot_README.md \ No newline at end of file diff --git a/src/lerobot/policies/groot/__init__.py b/src/lerobot/policies/groot/__init__.py new file mode 100644 index 000000000..c8933ff56 --- /dev/null +++ b/src/lerobot/policies/groot/__init__.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python + +# Copyright 2025 Nvidia and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration_groot import GrootConfig +from .modeling_groot import GrootPolicy +from .processor_groot import make_groot_pre_post_processors + +__all__ = ["GrootConfig", "GrootPolicy", "make_groot_pre_post_processors"] diff --git a/src/lerobot/policies/groot/action_head/__init__.py b/src/lerobot/policies/groot/action_head/__init__.py new file mode 100644 index 000000000..3159bfe65 --- /dev/null +++ b/src/lerobot/policies/groot/action_head/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/lerobot/policies/groot/action_head/action_encoder.py b/src/lerobot/policies/groot/action_head/action_encoder.py new file mode 100644 index 000000000..c6fa0a779 --- /dev/null +++ b/src/lerobot/policies/groot/action_head/action_encoder.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn + + +def swish(x): + return x * torch.sigmoid(x) + + +class SinusoidalPositionalEncoding(nn.Module): + """ + Produces a sinusoidal encoding of shape (B, T, w) + given timesteps of shape (B, T). + """ + + def __init__(self, embedding_dim): + super().__init__() + self.embedding_dim = embedding_dim + + def forward(self, timesteps): + # timesteps: shape (B, T) + # We'll compute sin/cos frequencies across dim T + timesteps = timesteps.float() # ensure float + + b, t = timesteps.shape + device = timesteps.device + + half_dim = self.embedding_dim // 2 + # typical log space frequencies for sinusoidal encoding + exponent = -torch.arange(half_dim, dtype=torch.float, device=device) * ( + torch.log(torch.tensor(10000.0)) / half_dim + ) + # Expand timesteps to (B, T, 1) then multiply + freqs = timesteps.unsqueeze(-1) * exponent.exp() # (B, T, half_dim) + + sin = torch.sin(freqs) + cos = torch.cos(freqs) + enc = torch.cat([sin, cos], dim=-1) # (B, T, w) + + return enc diff --git a/src/lerobot/policies/groot/action_head/cross_attention_dit.py b/src/lerobot/policies/groot/action_head/cross_attention_dit.py new file mode 100755 index 000000000..40f7ba603 --- /dev/null +++ b/src/lerobot/policies/groot/action_head/cross_attention_dit.py @@ -0,0 +1,370 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.nn.functional as F # noqa: N812 +from diffusers import ConfigMixin, ModelMixin +from diffusers.configuration_utils import register_to_config +from diffusers.models.attention import Attention, FeedForward +from diffusers.models.embeddings import ( + SinusoidalPositionalEmbedding, + TimestepEmbedding, + Timesteps, +) +from torch import nn + + +class TimestepEncoder(nn.Module): + def __init__(self, embedding_dim, compute_dtype=torch.float32): + super().__init__() + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) + self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + + def forward(self, timesteps): + dtype = next(self.parameters()).dtype + timesteps_proj = self.time_proj(timesteps).to(dtype) + timesteps_emb = self.timestep_embedder(timesteps_proj) # (N, D) + return timesteps_emb + + +class AdaLayerNorm(nn.Module): + def __init__( + self, + embedding_dim: int, + norm_elementwise_affine: bool = False, + norm_eps: float = 1e-5, + chunk_dim: int = 0, + ): + super().__init__() + self.chunk_dim = chunk_dim + output_dim = embedding_dim * 2 + self.silu = nn.SiLU() + self.linear = nn.Linear(embedding_dim, output_dim) + self.norm = nn.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine) + + def forward( + self, + x: torch.Tensor, + temb: torch.Tensor | None = None, + ) -> torch.Tensor: + temb = self.linear(self.silu(temb)) + scale, shift = temb.chunk(2, dim=1) + x = self.norm(x) * (1 + scale[:, None]) + shift[:, None] + return x + + +class BasicTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: int | None = None, + activation_fn: str = "geglu", + attention_bias: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen' + norm_eps: float = 1e-5, + final_dropout: bool = False, + attention_type: str = "default", + positional_embeddings: str | None = None, + num_positional_embeddings: int | None = None, + ff_inner_dim: int | None = None, + ff_bias: bool = True, + attention_out_bias: bool = True, + ): + super().__init__() + self.dim = dim + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + self.dropout = dropout + self.cross_attention_dim = cross_attention_dim + self.activation_fn = activation_fn + self.attention_bias = attention_bias + self.norm_elementwise_affine = norm_elementwise_affine + self.positional_embeddings = positional_embeddings + self.num_positional_embeddings = num_positional_embeddings + self.norm_type = norm_type + + if positional_embeddings and (num_positional_embeddings is None): + raise ValueError( + "If `positional_embeddings` type is defined, `num_positional_embeddings` must also be defined." + ) + + if positional_embeddings == "sinusoidal": + self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings) + else: + self.pos_embed = None + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + if norm_type == "ada_norm": + self.norm1 = AdaLayerNorm(dim) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + out_bias=attention_out_bias, + ) + + # 3. Feed-forward + self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine) + self.ff = FeedForward( + dim, + dropout=dropout, + activation_fn=activation_fn, + final_dropout=final_dropout, + inner_dim=ff_inner_dim, + bias=ff_bias, + ) + if final_dropout: + self.final_dropout = nn.Dropout(dropout) + else: + self.final_dropout = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + encoder_hidden_states: torch.Tensor | None = None, + encoder_attention_mask: torch.Tensor | None = None, + temb: torch.LongTensor | None = None, + ) -> torch.Tensor: + # 0. Self-Attention + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm1(hidden_states, temb) + else: + norm_hidden_states = self.norm1(hidden_states) + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + # encoder_attention_mask=encoder_attention_mask, + ) + if self.final_dropout: + attn_output = self.final_dropout(attn_output) + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 4. Feed-forward + norm_hidden_states = self.norm3(hidden_states) + ff_output = self.ff(norm_hidden_states) + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + return hidden_states + + +class DiT(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 8, + attention_head_dim: int = 64, + output_dim: int = 26, + num_layers: int = 12, + dropout: float = 0.1, + attention_bias: bool = True, + activation_fn: str = "gelu-approximate", + num_embeds_ada_norm: int | None = 1000, + upcast_attention: bool = False, + norm_type: str = "ada_norm", + norm_elementwise_affine: bool = False, + norm_eps: float = 1e-5, + max_num_positional_embeddings: int = 512, + compute_dtype=torch.float32, + final_dropout: bool = True, + positional_embeddings: str | None = "sinusoidal", + interleave_self_attention=False, + cross_attention_dim: int | None = None, + ): + super().__init__() + + self.attention_head_dim = attention_head_dim + self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim + self.gradient_checkpointing = False + + # Timestep encoder + self.timestep_encoder = TimestepEncoder( + embedding_dim=self.inner_dim, compute_dtype=self.config.compute_dtype + ) + + all_blocks = [] + for idx in range(self.config.num_layers): + use_self_attn = idx % 2 == 1 and interleave_self_attention + curr_cross_attention_dim = cross_attention_dim if not use_self_attn else None + + all_blocks += [ + BasicTransformerBlock( + self.inner_dim, + self.config.num_attention_heads, + self.config.attention_head_dim, + dropout=self.config.dropout, + activation_fn=self.config.activation_fn, + attention_bias=self.config.attention_bias, + upcast_attention=self.config.upcast_attention, + norm_type=norm_type, + norm_elementwise_affine=self.config.norm_elementwise_affine, + norm_eps=self.config.norm_eps, + positional_embeddings=positional_embeddings, + num_positional_embeddings=self.config.max_num_positional_embeddings, + final_dropout=final_dropout, + cross_attention_dim=curr_cross_attention_dim, + ) + ] + self.transformer_blocks = nn.ModuleList(all_blocks) + + # Output blocks + self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6) + self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim) + self.proj_out_2 = nn.Linear(self.inner_dim, self.config.output_dim) + print( + "Total number of DiT parameters: ", + sum(p.numel() for p in self.parameters() if p.requires_grad), + ) + + def forward( + self, + hidden_states: torch.Tensor, # Shape: (B, T, D) + encoder_hidden_states: torch.Tensor, # Shape: (B, S, D) + timestep: torch.LongTensor | None = None, + encoder_attention_mask: torch.Tensor | None = None, + return_all_hidden_states: bool = False, + ): + # Encode timesteps + temb = self.timestep_encoder(timestep) + + # Process through transformer blocks - single pass through the blocks + hidden_states = hidden_states.contiguous() + encoder_hidden_states = encoder_hidden_states.contiguous() + + all_hidden_states = [hidden_states] + + # Process through transformer blocks + for idx, block in enumerate(self.transformer_blocks): + if idx % 2 == 1 and self.config.interleave_self_attention: + hidden_states = block( + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + temb=temb, + ) + else: + hidden_states = block( + hidden_states, + attention_mask=None, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=None, + temb=temb, + ) + all_hidden_states.append(hidden_states) + + # Output processing + conditioning = temb + shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1) + hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None] + if return_all_hidden_states: + return self.proj_out_2(hidden_states), all_hidden_states + else: + return self.proj_out_2(hidden_states) + + +class SelfAttentionTransformer(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 8, + attention_head_dim: int = 64, + output_dim: int = 26, + num_layers: int = 12, + dropout: float = 0.1, + attention_bias: bool = True, + activation_fn: str = "gelu-approximate", + num_embeds_ada_norm: int | None = 1000, + upcast_attention: bool = False, + max_num_positional_embeddings: int = 512, + compute_dtype=torch.float32, + final_dropout: bool = True, + positional_embeddings: str | None = "sinusoidal", + interleave_self_attention=False, + ): + super().__init__() + + self.attention_head_dim = attention_head_dim + self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim + self.gradient_checkpointing = False + + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + self.inner_dim, + self.config.num_attention_heads, + self.config.attention_head_dim, + dropout=self.config.dropout, + activation_fn=self.config.activation_fn, + attention_bias=self.config.attention_bias, + upcast_attention=self.config.upcast_attention, + positional_embeddings=positional_embeddings, + num_positional_embeddings=self.config.max_num_positional_embeddings, + final_dropout=final_dropout, + ) + for _ in range(self.config.num_layers) + ] + ) + print( + "Total number of SelfAttentionTransformer parameters: ", + sum(p.numel() for p in self.parameters() if p.requires_grad), + ) + + def forward( + self, + hidden_states: torch.Tensor, # Shape: (B, T, D) + return_all_hidden_states: bool = False, + ): + # Process through transformer blocks - single pass through the blocks + hidden_states = hidden_states.contiguous() + all_hidden_states = [hidden_states] + + # Process through transformer blocks + for _idx, block in enumerate(self.transformer_blocks): + hidden_states = block(hidden_states) + all_hidden_states.append(hidden_states) + + if return_all_hidden_states: + return hidden_states, all_hidden_states + else: + return hidden_states diff --git a/src/lerobot/policies/groot/action_head/flow_matching_action_head.py b/src/lerobot/policies/groot/action_head/flow_matching_action_head.py new file mode 100644 index 000000000..bfc456ba0 --- /dev/null +++ b/src/lerobot/policies/groot/action_head/flow_matching_action_head.py @@ -0,0 +1,406 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +import torch +import torch.nn.functional as F # noqa: N812 +from torch import nn +from torch.distributions import Beta + +from lerobot.utils.import_utils import _transformers_available + +# Conditional import for type checking and lazy loading +if TYPE_CHECKING or _transformers_available: + from transformers import PretrainedConfig + from transformers.feature_extraction_utils import BatchFeature +else: + PretrainedConfig = object + BatchFeature = None + +from lerobot.policies.groot.action_head.action_encoder import ( + SinusoidalPositionalEncoding, + swish, +) + +from .cross_attention_dit import DiT, SelfAttentionTransformer + + +class CategorySpecificLinear(nn.Module): + def __init__(self, num_categories, input_dim, hidden_dim): + super().__init__() + self.num_categories = num_categories + # For each category, we have separate weights and biases. + self.W = nn.Parameter(0.02 * torch.randn(num_categories, input_dim, hidden_dim)) + self.b = nn.Parameter(torch.zeros(num_categories, hidden_dim)) + + def forward(self, x, cat_ids): + selected_w = self.W[cat_ids] + selected_b = self.b[cat_ids] + return torch.bmm(x, selected_w) + selected_b.unsqueeze(1) + + +class CategorySpecificMLP(nn.Module): + def __init__(self, num_categories, input_dim, hidden_dim, output_dim): + super().__init__() + self.num_categories = num_categories + self.layer1 = CategorySpecificLinear(num_categories, input_dim, hidden_dim) + self.layer2 = CategorySpecificLinear(num_categories, hidden_dim, output_dim) + + def forward(self, x, cat_ids): + hidden = F.relu(self.layer1(x, cat_ids)) + return self.layer2(hidden, cat_ids) + + +class MultiEmbodimentActionEncoder(nn.Module): + def __init__(self, action_dim, hidden_size, num_embodiments): + super().__init__() + self.hidden_size = hidden_size + self.num_embodiments = num_embodiments + + # W1: R^{w x d}, W2: R^{w x 2w}, W3: R^{w x w} + self.W1 = CategorySpecificLinear(num_embodiments, action_dim, hidden_size) # (d -> w) + self.W2 = CategorySpecificLinear(num_embodiments, 2 * hidden_size, hidden_size) # (2w -> w) + self.W3 = CategorySpecificLinear(num_embodiments, hidden_size, hidden_size) # (w -> w) + self.pos_encoding = SinusoidalPositionalEncoding(hidden_size) + + def forward(self, actions, timesteps, cat_ids): + """ + actions: shape (B, T, action_dim) + timesteps: shape (B,) -- a single scalar per batch item + cat_ids: shape (B,) + returns: shape (B, T, hidden_size) + """ + b, t, _ = actions.shape + + # 1) Expand each batch's single scalar time 'tau' across all T steps + # so that shape => (B, T) + # e.g. if timesteps is (B,), replicate across T + if timesteps.dim() == 1 and timesteps.shape[0] == b: + # shape (B,) => (B,T) + timesteps = timesteps.unsqueeze(1).expand(-1, t) + else: + raise ValueError("Expected `timesteps` to have shape (B,) so we can replicate across T.") + + # 2) Standard action MLP step for shape => (B, T, w) + a_emb = self.W1(actions, cat_ids) + + # 3) Get the sinusoidal encoding (B, T, w) + tau_emb = self.pos_encoding(timesteps).to(dtype=a_emb.dtype) + + # 4) Concat along last dim => (B, T, 2w), then W2 => (B, T, w), swish + x = torch.cat([a_emb, tau_emb], dim=-1) + x = swish(self.W2(x, cat_ids)) + + # 5) Finally W3 => (B, T, w) + x = self.W3(x, cat_ids) + return x + + +@dataclass +class FlowmatchingActionHeadConfig(PretrainedConfig): + """NOTE: N1.5 uses XEmbFlowmatchingPolicyHeadConfig as action head""" + + add_pos_embed: bool = field(default=True, metadata={"help": "Whether to add positional embedding"}) + model_dtype: str = field(default="float32", metadata={"help": "Model data type."}) + diffusion_model_cfg: dict = field(default=None, metadata={"help": "Diffusion model configuration."}) + input_embedding_dim: int = field(default=1536, metadata={"help": "Input embedding channel dimension."}) + backbone_embedding_dim: int = field( + default=1536, metadata={"help": "Backbone embedding channel dimension."} + ) + + hidden_size: int = field(default=1024, metadata={"help": "Input embedding dimension."}) + max_seq_len: int = field(default=1024, metadata={"help": "Maximum Sequence Length"}) + action_dim: int = field(default=None, metadata={"help": "Action dimension."}) + action_horizon: int = field(default=None, metadata={"help": "Action horizon."}) + noise_beta_alpha: float = field(default=1.5, metadata={"help": ""}) + noise_beta_beta: float = field(default=1.0, metadata={"help": ""}) + noise_s: float = field(default=0.999, metadata={"help": "Flow matching noise Beta distribution s."}) + num_timestep_buckets: int = field( + default=1000, metadata={"help": "Number of timestep discretization buckets."} + ) + num_inference_timesteps: int = field( + default=None, + metadata={"help": "Number of inference steps for noise diffusion."}, + ) + max_num_embodiments: int = field(default=32, metadata={"help": "Number of embodiments."}) + tune_projector: bool = field(default=True, metadata={"help": "Whether to tune the projector."}) + tune_diffusion_model: bool = field( + default=True, metadata={"help": "Whether to tune the diffusion model."} + ) + load_pretrained_det_decode_layer_path: str = field( + default=None, metadata={"help": "Path to pretrained detection model."} + ) + detection_coeff: float = field(default=1.0, metadata={"help": "Detection coefficient."}) + + freeze_decode_layer: bool = field(default=False) + expand_batch: int = field(default=None) + use_vlln: bool = field(default=True) + + vl_self_attention_cfg: dict = field(default=None) + num_target_vision_tokens: int = field(default=32, metadata={"help": "Number of target vision tokens."}) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + for key, value in kwargs.items(): + setattr(self, key, value) + + +class FlowmatchingActionHead(nn.Module): + config_class = FlowmatchingActionHeadConfig + supports_gradient_checkpointing = True + + def __init__( + self, + config: FlowmatchingActionHeadConfig, + ): + super().__init__() + self.hidden_size = config.hidden_size + self.input_embedding_dim = config.input_embedding_dim + + self.model = DiT(**config.diffusion_model_cfg) + self.action_dim = config.action_dim + self.action_horizon = config.action_horizon + self.num_inference_timesteps = config.num_inference_timesteps + + self.state_encoder = CategorySpecificMLP( + num_categories=config.max_num_embodiments, + input_dim=config.max_state_dim, + hidden_dim=self.hidden_size, + output_dim=self.input_embedding_dim, + ) + self.action_encoder = MultiEmbodimentActionEncoder( + action_dim=config.action_dim, + hidden_size=self.input_embedding_dim, + num_embodiments=config.max_num_embodiments, + ) + self.action_decoder = CategorySpecificMLP( + num_categories=config.max_num_embodiments, + input_dim=self.hidden_size, + hidden_dim=self.hidden_size, + output_dim=self.action_dim, + ) + self.future_tokens = nn.Embedding(config.num_target_vision_tokens, self.input_embedding_dim) + nn.init.normal_(self.future_tokens.weight, mean=0.0, std=0.02) + + self.vlln = nn.LayerNorm(config.backbone_embedding_dim) if config.use_vlln else nn.Identity() + self.vl_self_attention = ( + SelfAttentionTransformer(**config.vl_self_attention_cfg) if config.use_vlln else nn.Identity() + ) + + if config.add_pos_embed: + self.position_embedding = nn.Embedding(config.max_seq_len, self.input_embedding_dim) + nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02) + + self.beta_dist = Beta(config.noise_beta_alpha, config.noise_beta_beta) + self.num_timestep_buckets = config.num_timestep_buckets + self.config = config + self.set_trainable_parameters(config.tune_projector, config.tune_diffusion_model) + + def set_trainable_parameters(self, tune_projector: bool, tune_diffusion_model: bool): + self.tune_projector = tune_projector + self.tune_diffusion_model = tune_diffusion_model + for p in self.parameters(): + p.requires_grad = True + if not tune_projector: + self.state_encoder.requires_grad_(False) + self.action_encoder.requires_grad_(False) + self.action_decoder.requires_grad_(False) + if self.config.add_pos_embed: + self.position_embedding.requires_grad_(False) + if not tune_diffusion_model: + self.model.requires_grad_(False) + print(f"Tune action head projector: {self.tune_projector}") + print(f"Tune action head diffusion model: {self.tune_diffusion_model}") + # Check if any parameters are still trainable. If not, print a warning. + if not tune_projector and not tune_diffusion_model: + for name, p in self.named_parameters(): + if p.requires_grad: + print(f"Action head trainable parameter: {name}") + if not any(p.requires_grad for p in self.parameters()): + print("Warning: No action head trainable parameters found.") + + def set_frozen_modules_to_eval_mode(self): + """ + Huggingface will call model.train() at each training_step. To ensure + the expected behaviors for modules like dropout, batchnorm, etc., we + need to call model.eval() for the frozen modules. + """ + if self.training: + if not self.tune_projector: + self.state_encoder.eval() + self.action_encoder.eval() + self.action_decoder.eval() + if self.config.add_pos_embed: + self.position_embedding.eval() + if not self.tune_diffusion_model: + self.model.eval() + + def sample_time(self, batch_size, device, dtype): + sample = self.beta_dist.sample([batch_size]).to(device, dtype=dtype) + return (self.config.noise_s - sample) / self.config.noise_s + + def prepare_input(self, batch: dict) -> BatchFeature: + return BatchFeature(data=batch) + + def process_backbone_output(self, backbone_output: BatchFeature) -> BatchFeature: + backbone_features = backbone_output["backbone_features"] + backbone_features = self.vlln(backbone_features) + backbone_features = self.vl_self_attention(backbone_features) + backbone_output["backbone_features"] = backbone_features + return backbone_output + + def forward(self, backbone_output: BatchFeature, action_input: BatchFeature) -> BatchFeature: + # Set frozen modules to eval + self.set_frozen_modules_to_eval_mode() + + backbone_output = self.process_backbone_output(backbone_output) + + if self.config.expand_batch is not None: + for k, v in backbone_output.items(): + ndim = len(v.shape) + factors = [self.config.expand_batch] + while len(factors) < ndim: + factors.append(1) + factors = tuple(factors) + expanded = v.repeat(*factors) + backbone_output[k] = expanded + + for k, v in action_input.items(): + ndim = len(v.shape) + factors = [self.config.expand_batch] + while len(factors) < ndim: + factors.append(1) + factors = tuple(factors) + expanded = v.repeat(*factors) + action_input[k] = expanded + + # Get vision and language embeddings. + vl_embs = backbone_output.backbone_features + device = vl_embs.device + + # Get embodiment ID. + embodiment_id = action_input.embodiment_id + + # Embed state. + state_features = self.state_encoder(action_input.state, embodiment_id) + + # Embed noised action trajectory. + actions = action_input.action + noise = torch.randn(actions.shape, device=actions.device, dtype=actions.dtype) + t = self.sample_time(actions.shape[0], device=actions.device, dtype=actions.dtype) + t = t[:, None, None] # shape (B,1,1) for broadcast + + noisy_trajectory = (1 - t) * noise + t * actions + velocity = actions - noise + + # Convert (continuous) t -> discrete if needed + t_discretized = (t[:, 0, 0] * self.num_timestep_buckets).long() + action_features = self.action_encoder(noisy_trajectory, t_discretized, embodiment_id) + + # Maybe add position embedding. + if self.config.add_pos_embed: + pos_ids = torch.arange(action_features.shape[1], dtype=torch.long, device=device) + pos_embs = self.position_embedding(pos_ids).unsqueeze(0) + action_features = action_features + pos_embs + + # Join vision, language, state and action embedding along sequence dimension. + future_tokens = self.future_tokens.weight.unsqueeze(0).expand(vl_embs.shape[0], -1, -1) + sa_embs = torch.cat((state_features, future_tokens, action_features), dim=1) + + vl_attn_mask = backbone_output.backbone_attention_mask + + model_output = self.model( + hidden_states=sa_embs, + encoder_hidden_states=vl_embs, + encoder_attention_mask=vl_attn_mask, + timestep=t_discretized, + return_all_hidden_states=False, # NOTE (YL): not using flare now + ) + pred = self.action_decoder(model_output, embodiment_id) + pred_actions = pred[:, -actions.shape[1] :] + + # Slice out only the action portion of pred and target. + action_mask = action_input.action_mask + loss = F.mse_loss(pred_actions, velocity, reduction="none") * action_mask + loss = loss.sum() / action_mask.sum() + output_dict = { + "loss": loss, + } + return BatchFeature(data=output_dict) + + @torch.no_grad() + def get_action(self, backbone_output: BatchFeature, action_input: BatchFeature) -> BatchFeature: + backbone_output = self.process_backbone_output(backbone_output) + + # Get vision and language embeddings. + vl_embs = backbone_output.backbone_features + embodiment_id = action_input.embodiment_id + + # Embed state. + state_features = self.state_encoder(action_input.state, embodiment_id) + + # Set initial actions as the sampled noise. + batch_size = vl_embs.shape[0] + device = vl_embs.device + actions = torch.randn( + size=(batch_size, self.config.action_horizon, self.config.action_dim), + dtype=vl_embs.dtype, + device=device, + ) + + num_steps = self.num_inference_timesteps + dt = 1.0 / num_steps + + # Run denoising steps. + for t in range(num_steps): + t_cont = t / float(num_steps) # e.g. goes 0, 1/N, 2/N, ... + t_discretized = int(t_cont * self.num_timestep_buckets) + + # Embed noised action trajectory. + timesteps_tensor = torch.full(size=(batch_size,), fill_value=t_discretized, device=device) + action_features = self.action_encoder(actions, timesteps_tensor, embodiment_id) + # Maybe add position embedding. + if self.config.add_pos_embed: + pos_ids = torch.arange(action_features.shape[1], dtype=torch.long, device=device) + pos_embs = self.position_embedding(pos_ids).unsqueeze(0) + action_features = action_features + pos_embs + + # Join vision, language, state and action embedding along sequence dimension. + future_tokens = self.future_tokens.weight.unsqueeze(0).expand(vl_embs.shape[0], -1, -1) + sa_embs = torch.cat((state_features, future_tokens, action_features), dim=1) + + # Run model forward. + model_output = self.model( + hidden_states=sa_embs, + encoder_hidden_states=vl_embs, + timestep=timesteps_tensor, + ) + pred = self.action_decoder(model_output, embodiment_id) + + pred_velocity = pred[:, -self.action_horizon :] + + # Update actions using euler integration. + actions = actions + dt * pred_velocity + return BatchFeature(data={"action_pred": actions}) + + @property + def device(self): + return next(iter(self.parameters())).device + + @property + def dtype(self): + return next(iter(self.parameters())).dtype diff --git a/src/lerobot/policies/groot/configuration_groot.py b/src/lerobot/policies/groot/configuration_groot.py new file mode 100644 index 000000000..8002c69ea --- /dev/null +++ b/src/lerobot/policies/groot/configuration_groot.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python + +# Copyright 2024 NVIDIA Corporation and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from lerobot.configs.policies import PreTrainedConfig +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature +from lerobot.optim.optimizers import AdamWConfig +from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig + + +@PreTrainedConfig.register_subclass("groot") +@dataclass +class GrootConfig(PreTrainedConfig): + """Configuration for Groot policy wrapper.""" + + # Basic policy settings + n_obs_steps: int = 1 + chunk_size: int = 50 + n_action_steps: int = 50 + + # Dimension settings (must match pretrained GR00T model expectations) + # Maximum state dimension. Shorter states will be zero-padded. + max_state_dim: int = 64 + + # Maximum action dimension. Shorter actions will be zero-padded. + max_action_dim: int = 32 + + # Normalization (start with identity, adjust as needed) + normalization_mapping: dict[str, NormalizationMode] = field( + default_factory=lambda: { + "VISUAL": NormalizationMode.IDENTITY, + "STATE": NormalizationMode.MEAN_STD, + "ACTION": NormalizationMode.MEAN_STD, + } + ) + + # Image preprocessing (adjust to match Groot's expected input) + image_size: tuple[int, int] = (224, 224) + + # Groot-specific model parameters (from groot_finetune_script.py) + + # Path or HuggingFace model ID for the base Groot model + base_model_path: str = "nvidia/GR00T-N1.5-3B" + + # HF repo ID (or local path) that hosts vocab.json and merges.txt for Eagle tokenizer. + tokenizer_assets_repo: str = "lerobot/eagle2hg-processor-groot-n1p5" + + # Embodiment tag to use for training (e.g. 'new_embodiment', 'gr1') + embodiment_tag: str = "new_embodiment" + + # Fine-tuning control arguments + + # Whether to fine-tune the llm backbone + tune_llm: bool = False + + # Whether to fine-tune the vision tower + tune_visual: bool = False + + # Whether to fine-tune the projector + tune_projector: bool = True + + # Whether to fine-tune the diffusion model + tune_diffusion_model: bool = True + + # LoRA parameters (from groot_finetune_script.py) + # Rank for the LORA model. If 0, no LORA will be used. + lora_rank: int = 0 + + # Alpha value for the LORA model + lora_alpha: int = 16 + + # Dropout rate for the LORA model + lora_dropout: float = 0.1 + + # Whether to use the full model for LORA + lora_full_model: bool = False + + # Training parameters (matching groot_finetune_script.py) + optimizer_lr: float = 1e-4 + optimizer_betas: tuple[float, float] = (0.95, 0.999) + optimizer_eps: float = 1e-8 + optimizer_weight_decay: float = 1e-5 + warmup_ratio: float = 0.05 + use_bf16: bool = True + + # Dataset parameters + # Video backend to use for training ('decord' or 'torchvision_av') + video_backend: str = "decord" + + # Whether to balance dataset weights in mixture datasets + balance_dataset_weights: bool = True + + # Whether to sample trajectories weighted by their length + balance_trajectory_weights: bool = True + + # Optional dataset paths for delegating training to Isaac-GR00T runner + dataset_paths: list[str] | None = None + output_dir: str = "./tmp/gr00t" + save_steps: int = 1000 + max_steps: int = 10000 + batch_size: int = 32 + dataloader_num_workers: int = 8 + report_to: str = "wandb" + resume: bool = False + + def __post_init__(self): + super().__post_init__() + + if self.n_action_steps > self.chunk_size: + raise ValueError( + f"n_action_steps ({self.n_action_steps}) cannot exceed chunk_size ({self.chunk_size})" + ) + + # groot_repo_path is now optional since we ported the components + # No validation needed + + def validate_features(self) -> None: + """Validate and set up input/output features for Groot.""" + image_features = [key for key, feat in self.input_features.items() if feat.type == FeatureType.VISUAL] + if not image_features: + raise ValueError( + "Groot policy requires at least one visual input feature. " + "No features of type FeatureType.VISUAL found in input_features." + ) + + if "observation.state" not in self.input_features: + state_feature = PolicyFeature( + type=FeatureType.STATE, + shape=(self.max_state_dim,), + ) + self.input_features["observation.state"] = state_feature + else: + state_shape = self.input_features["observation.state"].shape + state_dim = state_shape[0] if state_shape else 0 + if state_dim > self.max_state_dim: + raise ValueError( + f"State dimension {state_dim} exceeds max_state_dim {self.max_state_dim}. " + f"Either reduce state dimension or increase max_state_dim in config." + ) + + if "action" not in self.output_features: + action_feature = PolicyFeature( + type=FeatureType.ACTION, + shape=(self.max_action_dim,), + ) + self.output_features["action"] = action_feature + else: + action_shape = self.output_features["action"].shape + action_dim = action_shape[0] if action_shape else 0 + if action_dim > self.max_action_dim: + raise ValueError( + f"Action dimension {action_dim} exceeds max_action_dim {self.max_action_dim}. " + f"Either reduce action dimension or increase max_action_dim in config." + ) + + def get_optimizer_preset(self) -> AdamWConfig: + """Return optimizer configuration.""" + return AdamWConfig( + lr=self.optimizer_lr, + betas=self.optimizer_betas, + eps=self.optimizer_eps, + weight_decay=self.optimizer_weight_decay, + ) + + def get_scheduler_preset(self) -> CosineDecayWithWarmupSchedulerConfig: + """Return scheduler configuration.""" + return CosineDecayWithWarmupSchedulerConfig( + num_warmup_steps=int(10000 * self.warmup_ratio), # 5% warmup by default + num_decay_steps=10000, # Adjust based on training steps + peak_lr=self.optimizer_lr, + decay_lr=self.optimizer_lr * 0.1, + ) + + @property + def observation_delta_indices(self) -> None: + """Return indices for delta observations (None for Groot).""" + return None + + @property + def action_delta_indices(self) -> list[int]: + """Return indices for delta actions.""" + return list(range(min(self.chunk_size, 16))) + + @property + def reward_delta_indices(self) -> None: + """Return indices for delta rewards (None for Groot).""" + return None diff --git a/src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py new file mode 100755 index 000000000..526b4f7a2 --- /dev/null +++ b/src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy + +from transformers.configuration_utils import PretrainedConfig +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.qwen2.configuration_qwen2 import Qwen2Config +from transformers.models.qwen3.configuration_qwen3 import Qwen3Config +from transformers.models.siglip.configuration_siglip import SiglipVisionConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class Eagle25VLConfig(PretrainedConfig): + model_type = "eagle_2_5_vl" + is_composition = True + sub_configs = {"vision_config": SiglipVisionConfig, "text_config": Qwen2Config} + + def __init__( + self, + vision_config=None, + text_config=None, + use_backbone_lora=0, + use_llm_lora=0, + pad2square=False, + select_layer=-4, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + loss_version="v1", + min_dynamic_tiles=1, + max_dynamic_tiles=6, + mlp_checkpoint=False, + initializer_range=0.02, + _attn_implementation="flash_attention_2", + _attn_implementation_autoset=False, + llm_config=None, + image_token_index=None, + use_pixel_shuffle=True, + mlp_connector_layers=2, + **kwargs, + ): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {"model_type": "siglip_vision_model"} + logger.info("vision_config is None. Initializing the InternVisionConfig with default values.") + + if text_config is None: + text_config = {"architectures": ["Qwen2ForCausalLM"]} + logger.info( + "text_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)." + ) + + if vision_config["model_type"] == "siglip_vision_model": + self.vision_config = SiglipVisionConfig(**vision_config) + else: + raise ValueError("Unsupported model_type: {}".format(vision_config["model_type"])) + + if text_config["architectures"][0] == "LlamaForCausalLM": + self.text_config = LlamaConfig(**text_config) + elif text_config["architectures"][0] == "Qwen2ForCausalLM": + self.text_config = Qwen2Config(**text_config) + elif text_config["architectures"][0] == "Qwen3ForCausalLM": + self.text_config = Qwen3Config(**text_config) + else: + raise ValueError("Unsupported architecture: {}".format(text_config["architectures"][0])) + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.mlp_checkpoint = mlp_checkpoint + self.pad2square = pad2square + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.loss_version = loss_version + self.initializer_range = initializer_range + self.min_dynamic_tiles = min_dynamic_tiles + self.max_dynamic_tiles = max_dynamic_tiles + self.tie_word_embeddings = self.text_config.tie_word_embeddings + self._attn_implementation = _attn_implementation + self._attn_implementation_autoset = _attn_implementation_autoset + self.image_token_index = image_token_index + self.use_pixel_shuffle = use_pixel_shuffle + self.mlp_connector_layers = mlp_connector_layers + logger.info(f"min_dynamic_tiles: {self.min_dynamic_tiles}") + logger.info(f"max_dynamic_tiles: {self.max_dynamic_tiles}") + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + output["use_backbone_lora"] = self.use_backbone_lora + output["use_llm_lora"] = self.use_llm_lora + output["pad2square"] = self.pad2square + output["select_layer"] = self.select_layer + output["force_image_size"] = self.force_image_size + output["downsample_ratio"] = self.downsample_ratio + output["template"] = self.template + output["dynamic_image_size"] = self.dynamic_image_size + output["use_thumbnail"] = self.use_thumbnail + output["min_dynamic_tiles"] = self.min_dynamic_tiles + output["max_dynamic_tiles"] = self.max_dynamic_tiles + output["tie_word_embeddings"] = self.tie_word_embeddings + output["_attn_implementation"] = self._attn_implementation + output["_attn_implementation_autoset"] = self._attn_implementation_autoset + output["use_pixel_shuffle"] = self.use_pixel_shuffle + output["mlp_connector_layers"] = self.mlp_connector_layers + return output diff --git a/src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py b/src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py new file mode 100644 index 000000000..6b4f6d7ac --- /dev/null +++ b/src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py @@ -0,0 +1,504 @@ +# -------------------------------------------------------- +# NVIDIA +# Copyright (c) 2025 NVIDIA +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- + + +# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +from typing import Optional + +from transformers.image_processing_utils import ( + BatchFeature, + get_patch_output_size, +) +from transformers.image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from transformers.image_utils import ( + IMAGENET_STANDARD_MEAN, # 0.5, 0.5, 0.5 + IMAGENET_STANDARD_STD, # 0.5, 0.5, 0.5 + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + get_image_size, + make_flat_list_of_images, + validate_kwargs, +) +from transformers.processing_utils import Unpack +from transformers.utils import ( + TensorType, + add_start_docstrings, + is_torch_available, + is_torchvision_v2_available, +) +from transformers.video_utils import VideoInput + +if is_torch_available(): + import torch +if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F # noqa: N812 + from transformers.image_utils import pil_torch_interpolation_mapping +else: + from torchvision.transforms import functional as F # noqa: N812 + + +def crop(img: torch.Tensor, left: int, top: int, right: int, bottom: int) -> torch.Tensor: + """Crop the given numpy array. + + Args: + img (torch.Tensor): Image to be cropped. Format should be (C, H, W). + left (int): The left coordinate of the crop box. + top (int): The top coordinate of the crop box. + right (int): The right coordinate of the crop box. + bottom (int): The bottom coordinate of the crop box. + + Returns: + torch.Tensor: Cropped image. + """ + if not isinstance(img, torch.Tensor): + raise TypeError(f"img should be torch.Tensor. Got {type(img)}") + + if img.ndim not in [2, 3]: + raise ValueError(f"Image should have 2 or 3 dimensions. Got {img.ndim}") + + img_height = img.shape[1] + img_width = img.shape[2] + if top < 0 or left < 0 or bottom > img_height or right > img_width: + raise ValueError("Crop coordinates out of bounds") + + if top >= bottom or left >= right: + raise ValueError("Invalid crop coordinates") + + return img[:, top:bottom, left:right] + + +class Eagle25VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + max_dynamic_tiles: int | None + min_dynamic_tiles: int | None + use_thumbnail: bool | None + pad_during_tiling: bool | None + do_pad: bool | None + + +@add_start_docstrings( + "Constructs a fast ConvNeXT image processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame.", + # BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, TODO: this was depreciated from transformers remove! + """ + image_grid_pinpoints (`List[List[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. Not used for processing videos. + do_pad (`bool`, *optional*): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + """, +) +class Eagle25VLImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 448, "width": 448} + default_to_square = False + crop_size = None + do_resize = True + do_center_crop = None + do_rescale = True + do_normalize = True + do_convert_rgb = True + do_pad = True + max_dynamic_tiles = 12 + min_dynamic_tiles = 1 + use_thumbnail = True + pad_during_tiling = False + valid_kwargs = Eagle25VLFastImageProcessorKwargs + model_input_names = ["pixel_values_videos"] + + def __init__(self, **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @add_start_docstrings( + # BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, TODO: this was depreciated from transformers remove! + """ + max_dynamic_tiles (`int`, *optional*): + The maximum number of dynamic tiles to use for processing high resolution images. + min_dynamic_tiles (`int`, *optional*): + The minimum number of dynamic tiles to use for processing high resolution images. + use_thumbnail (`bool`, *optional*): + Whether to use a thumbnail for processing high resolution images. + pad_during_tiling (`bool`, *optional*): + Whether to pad the image during tiling. + do_pad (`bool`, *optional*): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + """, + ) + + # NOTE(YL): we will overload the preprocess method to add the image_flags + # def preprocess( + # self, images: ImageInput, **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs] + # ) -> BatchFeature: + # return super().preprocess(images, **kwargs) + + def _prepare_images_structure( + self, + images: ImageInput, + expected_ndims: int = 3, + ) -> ImageInput: + """ + Prepare the images structure for processing. + + Args: + images (`ImageInput`): + The input images to process. + expected_ndims (`int`, *optional*, defaults to 3): + Expected number of dimensions for the images (added for transformers >=4.53.0 compatibility). + + Returns: + `ImageInput`: The images with a valid nesting. + """ + return make_flat_list_of_images(images) + + def _resize_for_patching( + self, + image: "torch.Tensor", + target_resolution: tuple, + interpolation: "F.InterpolationMode", + input_data_format: ChannelDimension, + ) -> "torch.Tensor": + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image ("torch.Tensor"): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + interpolation (`InterpolationMode`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + "torch.Tensor": The resized and padded image. + """ + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = F.resize(image, (new_height, new_width), interpolation=interpolation) + + return resized_image + + def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size): + """ + previous version mainly focus on ratio. + We also consider area ratio here. + """ + best_factor = float("-inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + # ratio_diff = abs(aspect_ratio - target_aspect_ratio) + # area_ratio = (ratio[0] * ratio[1] * image_size * image_size) / area + """ + new area > 60% of original image area is enough. + """ + factor_based_on_area_n_ratio = min( + (ratio[0] * ratio[1] * image_size * image_size) / area, 0.6 + ) * min(target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio) + + if factor_based_on_area_n_ratio > best_factor: + best_factor = factor_based_on_area_n_ratio + best_ratio = ratio + + return best_ratio + + def _pad_for_patching( + self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension + ) -> "torch.Tensor": + """ + Pad an image to a target resolution while maintaining aspect ratio. + """ + target_height, target_width = target_resolution + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) + + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + + padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x, paste_y]) + + return padded_image + + def _get_image_patches( + self, + image: "torch.Tensor", + min_num: int, + max_num: int, + size: tuple, + tile_size: int, + use_thumbnail: bool, + interpolation: "F.InterpolationMode", + pad_during_tiling: bool, + ) -> list["torch.Tensor"]: + image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST) + orig_height, orig_width = image_size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = { + (i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num + } + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = self.find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, tile_size + ) + + # calculate the target width and height + target_width = tile_size * target_aspect_ratio[0] + target_height = tile_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + if pad_during_tiling: + resized_image = self._resize_for_patching( + image, + (target_height, target_width), + interpolation=interpolation, + input_data_format=ChannelDimension.FIRST, + ) + padded_image = self._pad_for_patching( + resized_image, + (target_height, target_width), + input_data_format=ChannelDimension.FIRST, + ) + image_used_to_split = padded_image + else: + image_used_to_split = F.resize(image, (target_height, target_width), interpolation=interpolation) + + processed_tiles = [] + for i in range(blocks): + box = ( + (i % (target_width // tile_size)) * tile_size, + (i // (target_width // tile_size)) * tile_size, + ((i % (target_width // tile_size)) + 1) * tile_size, + ((i // (target_width // tile_size)) + 1) * tile_size, + ) + # split the image + split_img = crop(image_used_to_split, box[0], box[1], box[2], box[3]) + processed_tiles.append(split_img) + assert len(processed_tiles) == blocks + + if use_thumbnail and len(processed_tiles) != 1: + thumbnail_img = F.resize(image, (tile_size, tile_size), interpolation=interpolation) + processed_tiles.append(thumbnail_img) + + return processed_tiles + + def _pad_for_batching( + self, + pixel_values: list["torch.Tensor"], + ) -> list["torch.Tensor"]: + """ + Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches. + + Args: + pixel_values (`List[torch.Tensor]`): + An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`) + + Returns: + List[`torch.Tensor`]: The padded images. + """ + max_patch = max(len(x) for x in pixel_values) + pixel_values = [ + torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]]) + for image in pixel_values + ] + + return pixel_values + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + max_dynamic_tiles: int, + min_dynamic_tiles: int, + use_thumbnail: bool, + pad_during_tiling: bool, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: float | list[float] | None, + image_std: float | list[float] | None, + do_pad: bool, + return_tensors: str | TensorType | None, + pad_size: SizeDict | None = None, # Added for transformers >=4.53.0 compatibility + disable_grouping: bool | None = None, # Added for transformers >=4.53.0 compatibility + ) -> BatchFeature: + processed_images = [] + image_sizes = [] + # Determine the size tuple + if size and size.height and size.width: + size_tuple = (size.height, size.width) + else: + size_tuple = (size.shortest_edge, size.shortest_edge) + + # Determine the patch size + if crop_size and crop_size.height: + tile_size = crop_size.height + elif size and size.height: + tile_size = size.height + else: + tile_size = size.shortest_edge + + for image in images: + image_patches = self._get_image_patches( + image, + min_num=min_dynamic_tiles, + max_num=max_dynamic_tiles, + size=size_tuple, + tile_size=tile_size, + use_thumbnail=use_thumbnail, + interpolation=interpolation, + pad_during_tiling=pad_during_tiling, + ) + + # Group images by size for batched processing + processed_image_patches_grouped = {} + # Added for transformers >=4.53.0 compatibility + grouped_image_patches, grouped_image_patches_index = group_images_by_shape( + image_patches, + disable_grouping=disable_grouping, + ) + + for shape, stacked_image_patches in grouped_image_patches.items(): + if do_resize: + stacked_image_patches = self.resize( + image=stacked_image_patches, + size=size, + interpolation=interpolation, + ) + if do_center_crop: + stacked_image_patches = self.center_crop(stacked_image_patches, crop_size) + # Fused rescale and normalize + stacked_image_patches = self.rescale_and_normalize( + stacked_image_patches, + do_rescale, + rescale_factor, + do_normalize, + image_mean, + image_std, + ) + processed_image_patches_grouped[shape] = stacked_image_patches + processed_image_patches = reorder_images( + processed_image_patches_grouped, grouped_image_patches_index + ) + processed_image_patches = ( + torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches + ) + processed_images.append(processed_image_patches) + image_sizes.append(get_image_size(image, ChannelDimension.FIRST)) + + if do_pad: + processed_images = self._pad_for_batching(processed_images) + + # processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + processed_images = torch.cat(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature( + data={"pixel_values": processed_images, "image_sizes": image_sizes}, + tensor_type=return_tensors, + ) + + def preprocess( + self, + images: ImageInput, + videos: VideoInput = None, + **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs], + ) -> BatchFeature: + validate_kwargs( + captured_kwargs=kwargs.keys(), + valid_processor_keys=self.valid_kwargs.__annotations__.keys(), + ) + # Set default kwargs from self. This ensures that if a kwarg is not provided + # by the user, it gets its default value from the instance, or is set to None. + for kwarg_name in self.valid_kwargs.__annotations__: + kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None)) + + # Extract parameters that are only used for preparing the input images + do_convert_rgb = kwargs.pop("do_convert_rgb") + input_data_format = kwargs.pop("input_data_format") + device = kwargs.pop("device") + # Prepare input images + # transformers >= 4.53.0: uses _prepare_image_like_inputs instead of _prepare_input_images + if images is not None: + images = self._prepare_image_like_inputs( + images=images, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + device=device, + ) + + if videos is not None: + videos = self._prepare_image_like_inputs( + images=videos, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + device=device, + ) + + # Update kwargs that need further processing before being validated + kwargs = self._further_process_kwargs(**kwargs) + + # Validate kwargs + self._validate_preprocess_kwargs(**kwargs) + + # torch resize uses interpolation instead of resample + # Added for transformers >=4.53.0 compatibility + resample = kwargs.pop("resample", self.resample) + kwargs["interpolation"] = ( + pil_torch_interpolation_mapping[resample] + if isinstance(resample, PILImageResampling | int) + else resample + ) + + # Filter kwargs to only include those accepted by _preprocess + valid_preprocess_kwargs = { + "do_resize", + "size", + "max_dynamic_tiles", + "min_dynamic_tiles", + "use_thumbnail", + "pad_during_tiling", + "interpolation", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_pad", + "return_tensors", + "pad_size", + "disable_grouping", + } + filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_preprocess_kwargs} + if images is not None: + return self._preprocess(images, **filtered_kwargs) + elif videos is not None: + return self._preprocess(videos, **filtered_kwargs) + + +__all__ = ["Eagle25VLImageProcessorFast"] diff --git a/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py new file mode 100755 index 000000000..5a66cfbce --- /dev/null +++ b/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py @@ -0,0 +1,395 @@ +# -------------------------------------------------------- +# NVIDIA +# Copyright (c) 2025 NVIDIA +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- + +import inspect + +import torch +import torch.utils.checkpoint as cp +from peft import LoraConfig, get_peft_model +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers import GenerationConfig +from transformers.generation import GenerationMixin +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM +from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM +from transformers.models.siglip.modeling_siglip import SiglipVisionModel +from transformers.utils import add_start_docstrings, logging + +from .configuration_eagle2_5_vl import Eagle25VLConfig + +logger = logging.get_logger(__name__) + + +# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/modeling_llava_onevision.py#L241C1-L280C1 +EAGLE2_5_VL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Eagle25VLConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Eagle2_5_VL Model outputting raw hidden-states without any specific head on top.", + EAGLE2_5_VL_START_DOCSTRING, +) +class Eagle25VLPreTrainedModel(PreTrainedModel): + config_class = Eagle25VLConfig + base_model_prefix = "model" + main_input_name = "input_ids" + supports_gradient_checkpointing = True + _no_split_modules = [ + "Qwen2DecoderLayer", + "LlamaDecoderLayer", + "Siglip2EncoderLayer", + "SiglipEncoderLayer", + ] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_cache_class = True + _supports_static_cache = True + _supports_quantized_cache = True + _supports_sdpa = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear | nn.Conv2d): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class Eagle25VLForConditionalGeneration(Eagle25VLPreTrainedModel, GenerationMixin): + config_class = Eagle25VLConfig + + def __init__(self, config: Eagle25VLConfig, vision_model=None, language_model=None): + super().__init__(config) + + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + if config.use_pixel_shuffle: + self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio**2)) + else: + self.num_image_token = int((image_size // patch_size) ** 2) + + self.select_layer = config.select_layer + self.downsample_ratio = config.downsample_ratio + self.loss_version = config.loss_version + self.mlp_checkpoint = config.mlp_checkpoint + self.use_pixel_shuffle = config.use_pixel_shuffle + self.mlp_connector_layers = config.mlp_connector_layers + logger.info(f"num_image_token: {self.num_image_token}") + logger.info(f"mlp_checkpoint: {self.mlp_checkpoint}") + if vision_model is not None: + self.vision_model = vision_model + else: + if config.vision_config.model_type == "siglip_vision_model": + config.vision_config._attn_implementation = "flash_attention_2" + self.vision_model = SiglipVisionModel(config.vision_config) + else: + raise NotImplementedError(f"{config.vision_config.model_type} is not implemented.") + + if language_model is not None: + self.language_model = language_model + else: + if config.text_config.architectures[0] == "LlamaForCausalLM": + self.language_model = LlamaForCausalLM(config.text_config) + elif config.text_config.architectures[0] == "Phi3ForCausalLM": + raise NotImplementedError("Phi3 is not implemented.") + # self.language_model = Phi3ForCausalLM(config.text_config) + elif config.text_config.architectures[0] == "Qwen2ForCausalLM": + assert config.text_config._attn_implementation == "flash_attention_2", ( + f"Qwen2 must use flash_attention_2 but got {config.text_config._attn_implementation}" + ) + self.language_model = Qwen2ForCausalLM(config.text_config) + elif config.text_config.architectures[0] == "Qwen3ForCausalLM": + self.language_model = Qwen3ForCausalLM(config.text_config) + else: + raise NotImplementedError(f"{config.text_config.architectures[0]} is not implemented.") + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.text_config.hidden_size + + if config.mlp_connector_layers == 2: + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size), + nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size), + ) + elif config.mlp_connector_layers == 1 and config.use_pixel_shuffle: + self.mlp1 = nn.Sequential( + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size), + ) + elif config.mlp_connector_layers == 1 and not config.use_pixel_shuffle: + self.mlp1 = nn.Sequential( + nn.Linear(vit_hidden_size, llm_hidden_size), + ) + else: + raise NotImplementedError(f"{config.mlp_connector_layers} is not implemented.") + + self.image_token_index = config.image_token_index + self.neftune_alpha = None + + if config.use_backbone_lora: + self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora) + + self.use_llm_lora = config.use_llm_lora + if config.use_llm_lora: + self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora) + + self.check_forward_kwargs() + + def check_forward_kwargs(self): + # We intentionally avoid using **kwargs in forward because Hugging Face Transformers + # has special handling for functions with **kwargs parameters that would affect + # how our model is processed during training and inference. + forward_params = inspect.signature(self.forward).parameters + assert not any(k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values()) + + def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05): + lora_config = LoraConfig( + r=r, + target_modules=[ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.out_proj", + "mlp.fc1", + "mlp.fc2", + ], + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + ) + self.vision_model = get_peft_model(self.vision_model, lora_config) + self.vision_model.print_trainable_parameters() + + def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05): + lora_config = LoraConfig( + r=r, + target_modules=[ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.o_proj", + "mlp.gate_proj", + "mlp.down_proj", + "mlp.up_proj", + ], + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + task_type="CAUSAL_LM", + ) + self.language_model = get_peft_model(self.language_model, lora_config) + self.language_model.enable_input_require_grads() + self.language_model.print_trainable_parameters() + self.use_llm_lora = True + + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.LongTensor | None = None, + image_flags: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, + labels: torch.LongTensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + num_tiles_list: list[torch.Tensor] | None = None, + ) -> tuple | CausalLMOutputWithPast: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_embeds = self.language_model.get_input_embeddings()(input_ids) + + vit_embeds = self.extract_feature(pixel_values) + + if image_flags is not None: + image_flags = image_flags.view(-1) + vit_embeds = vit_embeds[image_flags == 1] + + b, n, c = input_embeds.shape + input_embeds = input_embeds.reshape(b * n, c) + + input_ids = input_ids.reshape(b * n) + selected = input_ids == self.image_token_index + try: + input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, c) + except Exception as e: + vit_embeds = vit_embeds.reshape(-1, c) + print( + f"warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, " + f"vit_embeds.shape={vit_embeds.shape}" + ) + n_token = selected.sum() + input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token] + + input_embeds = input_embeds.reshape(b, n, c) + + outputs = self.language_model( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + logits = outputs.logits + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2) + x = x.view(n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor * scale_factor))) + + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values): + if self.select_layer == -1: + vit_embeds = self.vision_model( + pixel_values=pixel_values, output_hidden_states=False, return_dict=True + ) + if hasattr(vit_embeds, "last_hidden_state"): + vit_embeds = vit_embeds.last_hidden_state + + else: + vit_embeds = self.vision_model( + pixel_values=pixel_values, output_hidden_states=True, return_dict=True + ).hidden_states[self.select_layer] + + if self.use_pixel_shuffle: + h = w = int(vit_embeds.shape[1] ** 0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle( + vit_embeds, scale_factor=self.downsample_ratio + ) # torch.Size([B, 1024, 1024]) -> torch.Size([B, 16, 16, 4096]) + vit_embeds = vit_embeds.reshape( + vit_embeds.shape[0], -1, vit_embeds.shape[-1] + ) # torch.Size([B, 16, 16, 4096]) -> torch.Size([B, 256, 4096]) + + if self.mlp_checkpoint and vit_embeds.requires_grad: + vit_embeds = cp.checkpoint(self.mlp1, vit_embeds) + else: + vit_embeds = self.mlp1(vit_embeds) + + return vit_embeds + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor | None = None, + input_ids: torch.FloatTensor | None = None, + attention_mask: torch.LongTensor | None = None, + visual_features: torch.FloatTensor | None = None, + generation_config: GenerationConfig | None = None, + output_hidden_states: bool | None = None, + image_sizes: list[tuple[int, int]] | None = None, + **generate_kwargs, + ) -> torch.LongTensor: + if pixel_values is not None: + if visual_features is not None: + vit_embeds = visual_features + else: + vit_embeds = self.extract_feature(pixel_values) + + input_embeds = self.language_model.get_input_embeddings()(input_ids) + b, n, c = input_embeds.shape + input_embeds = input_embeds.reshape(b * n, c) + + input_ids = input_ids.reshape(b * n) + selected = input_ids == self.config.image_token_index + assert selected.sum() != 0 + input_embeds[selected] = vit_embeds.reshape(-1, c).to(input_embeds.device) + + input_embeds = input_embeds.reshape(b, n, c) + else: + input_embeds = self.language_model.get_input_embeddings()(input_ids) + + if "use_cache" not in generate_kwargs: + generate_kwargs["use_cache"] = True + + outputs = self.language_model.generate( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + generation_config=generation_config, + output_hidden_states=output_hidden_states, + **generate_kwargs, + ) + + return outputs + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_input_embeddings + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_output_embeddings + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_decoder + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_decoder + def get_decoder(self): + return self.language_model.get_decoder() diff --git a/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py new file mode 100755 index 000000000..27f9b3345 --- /dev/null +++ b/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py @@ -0,0 +1,518 @@ +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Eagle25VL. +copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/processing_llava_onevision.py +""" + +import base64 +import os +import re +from io import BytesIO + +import requests +import torch +from PIL import Image +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import ImageInput +from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput +from transformers.utils import logging +from transformers.video_utils import VideoInput + +logger = logging.get_logger(__name__) + + +FRAME_FACTOR = 2 +FPS = 2.0 +FPS_MIN_FRAMES = 4 +FPS_MAX_FRAMES = 256 + + +def to_rgb(pil_image: Image.Image) -> Image.Image: + if pil_image.mode == "RGBA": + white_background = Image.new("RGB", pil_image.size, (255, 255, 255)) + white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask + return white_background + else: + return pil_image.convert("RGB") + + +def fetch_image(ele: dict[str, str | Image.Image]) -> Image.Image: + image = ele["image"] if "image" in ele else ele["image_url"] + image_obj = None + if isinstance(image, Image.Image): + image_obj = image + elif image.startswith("http://") or image.startswith("https://"): + response = requests.get(image, stream=True, timeout=10) + image_obj = Image.open(BytesIO(response.content)) + elif image.startswith("file://"): + image_obj = Image.open(image[7:]) + elif image.startswith("data:image"): + if "base64," in image: + _, base64_data = image.split("base64,", 1) + data = base64.b64decode(base64_data) + image_obj = Image.open(BytesIO(data)) + else: + image_obj = Image.open(image) + if image_obj is None: + raise ValueError( + f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}" + ) + image = to_rgb(image_obj) + if "scale_factor" in ele: + scale_factor = ele["scale_factor"] + image = image.resize((image.width * scale_factor, image.height * scale_factor), Image.BILINEAR) + return image + + +class Eagle25VLProcessorKwargs(ProcessingKwargs, total=False): + # see processing_utils.ProcessingKwargs documentation for usage. + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": {}, + "videos_kwargs": {"max_dynamic_tiles": 1}, + } + + +class Eagle25VLProcessor(ProcessorMixin): + r""" + Constructs a Eagle25VL processor which wraps a Eagle25VL video processor, Eagle25VL image processor and a Eagle25VL tokenizer into a single processor. + + [`Eagle25VLProcessor`] offers all the functionalities of [`Eagle25VLVideoProcessor`], [`Eagle25VLImageProcessor`] and [`Eagle25VLTokenizer`]. See the + [`~Eagle25VLVideoProcessor.__call__`], [`~Eagle25VLProcessor.__call__`] and [`~Eagle25VLProcessor.decode`] for more information. + + Args: + image_processor ([`LlavaOnevisionImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`], *optional*): + The tokenizer is a required input. + num_image_tokens (`int`, *optional*): + Number of image tokens for one imagethat will be returned by vision tower. + vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Should be same as in model's config + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + image_token (`str`, *optional*, defaults to `""`): + Special token used to denote image location. + video_token (`str`, *optional*, defaults to `"