diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ec5ac4372..43e2442d3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -22,20 +22,21 @@ Short, imperative summary (e.g., "fix(robots): handle None in sensor parser"). S - Short, concrete bullets of the modifications (files/behaviour). - Short note if this introduces breaking changes and migration steps. -## How was this tested +## How was this tested (or how to run locally) - Tests added: list new tests or test files. - Manual checks / dataset runs performed. +- Instructions for the reviewer -## How to run locally (reviewer) +Example: -- Run the relevant tests: +- Ran the relevant tests: ```bash pytest -q tests/ -k ``` -- Run a quick example or CLI (if applicable): +- Reproduce with a quick example or CLI (if applicable): ```bash lerobot-train --some.option=true diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 48a10e4bc..c7926c542 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -18,6 +18,11 @@ name: Documentation on: # Allows running this workflow manually from the Actions tab workflow_dispatch: + inputs: + version: + description: 'Version tag (e.g. v0.1.2) - Leave empty for standard main build' + required: false + type: string # Triggers the workflow on push events to main for the docs folder push: @@ -54,7 +59,13 @@ jobs: with: commit_sha: ${{ github.sha }} package: lerobot - additional_args: --not_python_module ${{ github.event_name == 'release' && format('--version {0}', github.event.release.tag_name) || '' }} + additional_args: >- + --not_python_module + ${{ + (github.event_name == 'release' && format('--version {0}', github.event.release.tag_name)) || + (inputs.version != '' && format('--version {0}', inputs.version)) || + '' + }} secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/full_tests.yml b/.github/workflows/full_tests.yml index 7bf2cb78c..4dce3121a 100644 --- a/.github/workflows/full_tests.yml +++ b/.github/workflows/full_tests.yml @@ -186,15 +186,18 @@ jobs: steps: - name: Get Docker Hub Token and Delete Image # zizmor: ignore[template-injection] + env: + DOCKERHUB_LEROBOT_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + DOCKERHUB_LEROBOT_PASSWORD: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + IMAGE_FULL: ${{ needs.build-and-push-docker.outputs.image_tag }} run: | - IMAGE_NAME=$(echo "${{ needs.build-and-push-docker.outputs.image_tag }}" | cut -d':' -f1) - IMAGE_TAG=$(echo "${{ needs.build-and-push-docker.outputs.image_tag }}" | cut -d':' -f2) - + IMAGE_NAME=$(echo "$IMAGE_FULL" | cut -d':' -f1) + IMAGE_TAG=$(echo "$IMAGE_FULL" | cut -d':' -f2-) echo "Attempting to delete image: $IMAGE_NAME:$IMAGE_TAG" TOKEN=$(curl -s -H "Content-Type: application/json" \ -X POST \ - -d '{"username": "${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}", "password": "${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}"}' \ + -d "{\"username\": \"$DOCKERHUB_LEROBOT_USERNAME\", \"password\": \"$DOCKERHUB_LEROBOT_PASSWORD\"}" \ https://hub.docker.com/v2/users/login/ | jq -r .token) if [ "$TOKEN" == "null" ] || [ -z "$TOKEN" ]; then @@ -205,7 +208,7 @@ jobs: HTTP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" \ -H "Authorization: JWT ${TOKEN}" \ -X DELETE \ - https://hub.docker.com/v2/repositories/${IMAGE_NAME}/tags/${IMAGE_TAG}/) + https://hub.docker.com/v2/repositories/${IMAGE_NAME}/tags/$IMAGE_TAG) if [ "$HTTP_RESPONSE" -eq 
204 ]; then echo "Successfully deleted Docker image tag: $IMAGE_NAME:$IMAGE_TAG" diff --git a/.github/workflows/unbound_deps_tests.yml b/.github/workflows/unbound_deps_tests.yml index 3908bdc3d..a75ecc121 100644 --- a/.github/workflows/unbound_deps_tests.yml +++ b/.github/workflows/unbound_deps_tests.yml @@ -20,8 +20,8 @@ on: workflow_dispatch: # Run on the 1st and 15th of every month at 09:00 UTC - schedule: - - cron: '0 2 1,15 * *' + # schedule: + # - cron: '0 2 1,15 * *' permissions: contents: read @@ -162,15 +162,19 @@ jobs: steps: - name: Get Docker Hub Token and Delete Image # zizmor: ignore[template-injection] + env: + DOCKERHUB_LEROBOT_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + DOCKERHUB_LEROBOT_PASSWORD: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + IMAGE_FULL: ${{ needs.build-and-push-docker.outputs.image_tag }} run: | - IMAGE_NAME=$(echo "${{ needs.build-and-push-docker.outputs.image_tag }}" | cut -d':' -f1) - IMAGE_TAG=$(echo "${{ needs.build-and-push-docker.outputs.image_tag }}" | cut -d':' -f2) + IMAGE_NAME=$(echo "$IMAGE_FULL" | cut -d':' -f1) + IMAGE_TAG=$(echo "$IMAGE_FULL" | cut -d':' -f2) echo "Attempting to delete image: $IMAGE_NAME:$IMAGE_TAG" TOKEN=$(curl -s -H "Content-Type: application/json" \ -X POST \ - -d '{"username": "${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}", "password": "${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}"}' \ + -d "{\"username\": \"$DOCKERHUB_LEROBOT_USERNAME\", \"password\": \"$DOCKERHUB_LEROBOT_PASSWORD\"}" \ https://hub.docker.com/v2/users/login/ | jq -r .token) if [ "$TOKEN" == "null" ] || [ -z "$TOKEN" ]; then @@ -181,7 +185,7 @@ jobs: HTTP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" \ -H "Authorization: JWT ${TOKEN}" \ -X DELETE \ - https://hub.docker.com/v2/repositories/${IMAGE_NAME}/tags/${IMAGE_TAG}/) + https://hub.docker.com/v2/repositories/${IMAGE_NAME}/tags/$IMAGE_TAG) if [ "$HTTP_RESPONSE" -eq 204 ]; then echo "Successfully deleted Docker image tag: $IMAGE_NAME:$IMAGE_TAG" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index abca0d821..c51a48831 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,7 +14,7 @@ You can contribute in many ways: - **Documentation:** Improve examples, guides, and docstrings. - **Feedback:** Submit tickets related to bugs or desired new features. -If you are unsure where to start, join our [Discord Channel](https://discord.gg/JkrYNdmw). +If you are unsure where to start, join our [Discord Channel](https://discord.gg/q8Dzzpym3f). 
## Development Setup

diff --git a/README.md b/README.md
index 02652d1c9..d60cd35a9 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
[![Status](https://img.shields.io/pypi/status/lerobot)](https://pypi.org/project/lerobot/)
[![Version](https://img.shields.io/pypi/v/lerobot)](https://pypi.org/project/lerobot/)
[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-v2.1-ff69b4.svg)](https://github.com/huggingface/lerobot/blob/main/CODE_OF_CONDUCT.md)
+[![Discord](https://img.shields.io/badge/Discord-Join_Us-5865F2?style=flat&logo=discord&logoColor=white)](https://discord.gg/q8Dzzpym3f)

@@ -99,11 +100,11 @@ lerobot-train \
  --dataset.repo_id=lerobot/aloha_mobile_cabinet
```

-| Category                   | Models                                                                                                                                                                  |
-| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Imitation Learning**     | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md)                        |
-| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon)                                                             |
-| **VLAs Models**            | [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx)  |
+| Category                   | Models                                                                                                                                                                                                        |
+| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| **Imitation Learning**     | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md)                                                              |
+| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon)                                                                                                   |
+| **VLA Models**             | [Pi0Fast](./docs/source/pi0fast.mdx), [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx) |

Similarly to the hardware, you can easily implement your own policy & leverage LeRobot's data collection, training, and visualization tools, and share your model to the HF Hub

@@ -127,7 +128,8 @@ Learn how to implement your own simulation environment or benchmark and distribu

## Resources

- **[Documentation](https://huggingface.co/docs/lerobot/index):** The complete guide to tutorials & API.
-- **[Discord](https://discord.gg/3gxM6Avj):** Join the `LeRobot` server to discuss with the community.
+- **[Chinese Tutorial: LeRobot + SO-ARM101 by 同济子豪兄](https://zihao-ai.feishu.cn/wiki/space/7589642043471924447):** A detailed guide covering assembly, teleoperation, dataset recording, training, and deployment. Verified by Seeed Studio and 5 global hackathon participants.
+- **[Discord](https://discord.gg/q8Dzzpym3f):** Join the `LeRobot` server to discuss with the community.
- **[X](https://x.com/LeRobotHF):** Follow us on X to stay up-to-date with the latest developments.
- **[Robot Learning Tutorial](https://huggingface.co/spaces/lerobot/robot-learning-tutorial):** A free, hands-on course to learn robot learning using LeRobot.
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..cf58f6cdb
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,48 @@
+# Security Policy
+
+## Project Status & Philosophy
+
+`lerobot` has so far been primarily a research and prototyping tool, which is why deployment security hasn’t been a strong focus until now. As `lerobot` continues to be adopted and deployed in production, we are paying much closer attention to these kinds of issues.
+
+Fortunately, being an open-source project, the community can also help by reporting and fixing vulnerabilities. We appreciate your efforts to responsibly disclose your findings and will make every effort to acknowledge your contributions.
+
+## Reporting a Vulnerability
+
+To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/huggingface/lerobot/security/advisories/new) tab.
+
+The `lerobot` team will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
+
+#### Hugging Face Security Team
+
+Since this project is part of the Hugging Face ecosystem, feel free to submit vulnerability reports directly to: **[security@huggingface.co](mailto:security@huggingface.co)**. Someone from the HF security team will review the report and recommend next steps.
+
+#### Open Source Disclosures
+
+If reporting a vulnerability specific to the open-source codebase (and not the underlying Hub infrastructure), you may also use [Huntr](https://huntr.com), a vulnerability disclosure program for open source software.
+
+## Supported Versions
+
+Currently, we treat `lerobot` as a rolling release. We prioritize security updates for the latest available version (`main` branch).
+
+| Version  | Supported |
+| -------- | --------- |
+| Latest   | ✅        |
+| < Latest | ❌        |
+
+## Secure Usage Guidelines
+
+`lerobot` is tightly coupled to the Hugging Face Hub for sharing data and pretrained policies. When downloading artifacts uploaded by others, you expose yourself to risks. Please read below for recommendations to keep your runtime and robot environment safe.
+
+### Remote Artifacts (Weights & Policies)
+
+Models and policies uploaded to the Hugging Face Hub come in different formats. We strongly recommend uploading and downloading models in the [`safetensors`](https://github.com/huggingface/safetensors) format.
+
+`safetensors` was developed specifically to prevent arbitrary code execution on your system, which is critical when running software on physical hardware/robots.
+
+To avoid loading models from unsafe formats (e.g., `pickle`), you should ensure you are prioritizing `safetensors` files.
+
+### Remote Code
+
+Some models or environments on the Hub may require `trust_remote_code=True` to run custom architecture code.
+
+Please **always** verify the content of the modeling files when using this argument. We recommend setting a specific `revision` (commit hash) when loading remote code to ensure you protect yourself from unverified updates to the repository.
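+
+As an illustration, a hedged sketch of revision pinning with a `transformers`-style loader is shown below; the repository id and commit hash are placeholders, not real artifacts:
+
+```python
+from transformers import AutoModel
+
+# Pin an exact, audited commit so later (unreviewed) pushes to the repository
+# cannot change the code that runs on your machine. Both values are hypothetical.
+model = AutoModel.from_pretrained(
+    "some-org/custom-policy",  # placeholder repo id
+    trust_remote_code=True,    # only after reviewing the modeling files
+    revision="7f3c2a1b9d8e0f4a6c5b3d2e1f0a9b8c7d6e5f4a",  # audited commit hash
+)
+```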
diff --git a/docker/Dockerfile.internal b/docker/Dockerfile.internal index 2616cd06c..c1dfa1dae 100644 --- a/docker/Dockerfile.internal +++ b/docker/Dockerfile.internal @@ -73,7 +73,7 @@ ENV HOME=/home/user_lerobot \ RUN uv venv --python python${PYTHON_VERSION} # Install Python dependencies for caching -COPY --chown=user_lerobot:user_lerobot pyproject.toml README.md MANIFEST.in ./ +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml README.md MANIFEST.in ./ COPY --chown=user_lerobot:user_lerobot src/ src/ ARG UNBOUND_DEPS=false diff --git a/docker/Dockerfile.user b/docker/Dockerfile.user index c1b284453..031165930 100644 --- a/docker/Dockerfile.user +++ b/docker/Dockerfile.user @@ -59,7 +59,7 @@ ENV HOME=/home/user_lerobot \ RUN uv venv # Install Python dependencies for caching -COPY --chown=user_lerobot:user_lerobot pyproject.toml README.md MANIFEST.in ./ +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml README.md MANIFEST.in ./ COPY --chown=user_lerobot:user_lerobot src/ src/ ARG UNBOUND_DEPS=false diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7766b3472..962c34077 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -19,6 +19,8 @@ title: Train RL in Simulation - local: multi_gpu_training title: Multi GPU training + - local: peft_training + title: Training with PEFT (e.g., LoRA) title: "Tutorials" - sections: - local: lerobot-dataset-v3 @@ -35,6 +37,8 @@ title: SmolVLA - local: pi0 title: π₀ (Pi0) + - local: pi0fast + title: π₀-FAST (Pi0Fast) - local: pi05 title: π₀.₅ (Pi05) - local: groot @@ -53,12 +57,16 @@ title: Use Async Inference - local: rtc title: Real-Time Chunking (RTC) + - local: training_time_rtc + title: Training-Time RTC title: "Inference" - sections: - local: envhub title: Environments from the Hub - local: envhub_leisaac title: Control & Train Robots in Sim (LeIsaac) + - local: envhub_isaaclab_arena + title: NVIDIA IsaacLab Arena Environments - local: libero title: Using Libero - local: metaworld @@ -93,6 +101,8 @@ title: Unitree G1 - local: earthrover_mini_plus title: Earth Rover Mini + - local: omx + title: OMX title: "Robots" - sections: - local: phone_teleop @@ -107,6 +117,8 @@ title: Notebooks - local: feetech title: Updating Feetech Firmware + - local: damiao + title: Damiao Motors and CAN Bus title: "Resources" - sections: - local: contributing diff --git a/docs/source/async.mdx b/docs/source/async.mdx index 9dd87472c..3244fc2a3 100644 --- a/docs/source/async.mdx +++ b/docs/source/async.mdx @@ -169,7 +169,7 @@ python -m lerobot.async_inference.robot_client \ ```python import threading -from lerobot.robots.so100_follower import SO100FollowerConfig +from lerobot.robots.so_follower import SO100FollowerConfig from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig from lerobot.async_inference.configs import RobotClientConfig from lerobot.async_inference.robot_client import RobotClient @@ -195,6 +195,7 @@ client_cfg = RobotClientConfig( robot=robot_cfg, server_address="localhost:8080", policy_device="mps", + client_device="cpu", policy_type="smolvla", pretrained_name_or_path="/smolvla_async", chunk_size_threshold=0.5, diff --git a/docs/source/damiao.mdx b/docs/source/damiao.mdx new file mode 100644 index 000000000..45388ab9b --- /dev/null +++ b/docs/source/damiao.mdx @@ -0,0 +1,165 @@ +# Damiao Motors and CAN Bus + +This guide covers setup and usage of Damiao motors with LeRobot via CAN bus communication. 
+
+Currently, only Linux is supported, as the OpenArms CAN adapter only has drivers for Linux.
+
+## Linux CAN Setup
+
+Before using Damiao motors, you need to set up the CAN interface on your Linux system.
+
+### Install CAN Utilities
+
+```bash
+sudo apt-get install can-utils
+```
+
+### Configure CAN Interface (Manual)
+
+For standard CAN FD (recommended for OpenArms):
+
+```bash
+sudo ip link set can0 down
+sudo ip link set can0 type can bitrate 1000000 dbitrate 5000000 fd on
+sudo ip link set can0 up
+```
+
+For standard CAN (without FD):
+
+```bash
+sudo ip link set can0 down
+sudo ip link set can0 type can bitrate 1000000
+sudo ip link set can0 up
+```
+
+### Configure CAN Interface (Using LeRobot)
+
+LeRobot provides a utility script to set up and test CAN interfaces:
+
+```bash
+# Setup multiple interfaces (e.g., OpenArms Followers with 2 CAN buses)
+lerobot-setup-can --mode=setup --interfaces=can0,can1
+```
+
+## Debugging CAN Communication
+
+Use the built-in debug tools to test motor communication:
+
+```bash
+# Test motors on all interfaces
+lerobot-setup-can --mode=test --interfaces=can0,can1
+
+# Run speed/latency test
+lerobot-setup-can --mode=speed --interfaces=can0
+```
+
+The test mode will scan for motors (IDs 0x01-0x08) and report which ones respond. Example output:
+
+```
+can0: UP (CAN FD)
+  Motor 0x01 (joint_1): ✓ FOUND
+    → Response 0x11 [FD]: 00112233...
+  Motor 0x02 (joint_2): ✓ FOUND
+  Motor 0x03 (joint_3): ✗ No response
+  ...
+  Summary: 2/8 motors found
+```
+
+## Usage
+
+### Basic Setup
+
+```python
+from lerobot.motors import Motor
+from lerobot.motors.damiao import DamiaoMotorsBus
+
+# Define your motors with send/receive CAN IDs
+motors = {
+    "joint_1": Motor(id=0x01, motor_type_str="dm8009", recv_id=0x11),
+    "joint_2": Motor(id=0x02, motor_type_str="dm4340", recv_id=0x12),
+    "joint_3": Motor(id=0x03, motor_type_str="dm4310", recv_id=0x13),
+}
+
+# Create the bus
+bus = DamiaoMotorsBus(
+    port="can0",  # Linux socketcan interface
+    motors=motors,
+)
+
+# Connect
+bus.connect()
+```
+
+### Reading Motor States
+
+```python
+# Read single motor position (degrees)
+position = bus.read("Present_Position", "joint_1")
+
+# Read from multiple motors
+positions = bus.sync_read("Present_Position")  # All motors
+positions = bus.sync_read("Present_Position", ["joint_1", "joint_2"])
+
+# Read all states at once (position, velocity, torque)
+states = bus.sync_read_all_states()
+# Returns: {'joint_1': {'position': 45.2, 'velocity': 1.3, 'torque': 0.5}, ...}
+```
+
+### Writing Motor Commands
+
+```python
+# Enable torque
+bus.enable_torque()
+
+# Set goal position (degrees)
+bus.write("Goal_Position", "joint_1", 45.0)
+
+# Set positions for multiple motors
+bus.sync_write("Goal_Position", {
+    "joint_1": 45.0,
+    "joint_2": -30.0,
+    "joint_3": 90.0,
+})
+
+# Disable torque
+bus.disable_torque()
+```
+
+## Configuration Options
+
+| Parameter      | Default   | Description                                                 |
+| -------------- | --------- | ----------------------------------------------------------- |
+| `port`         | -         | CAN interface (`can0`) or serial port (`/dev/cu.usbmodem*`) |
+| `use_can_fd`   | `True`    | Enable CAN FD for higher data rates                         |
+| `bitrate`      | `1000000` | Nominal bitrate (1 Mbps)                                    |
+| `data_bitrate` | `5000000` | CAN FD data bitrate (5 Mbps)                                |
+
+## Motor Configuration
+
+Each motor requires:
+
+- `id`: CAN ID for sending commands
+- `motor_type`: One of the supported motor types (e.g., `"dm8009"`, `"dm4340"`)
+- `recv_id`: CAN ID for receiving responses
+
+OpenArms default IDs follow the pattern: send ID `0x0N`, receive ID `0x1N`, where N is the joint number.
+
+## Troubleshooting
+
+### No Response from Motors
+
+1. **Check power**
+2. **Verify CAN wiring**: Check CAN-H, CAN-L, and GND connections
+3. **Check motor IDs**: Use Damiao Debugging Tools to verify/configure IDs
+4. **Test CAN interface**: Run `candump can0` to see if messages are being received
+5. **Run diagnostics**: `lerobot-setup-can --mode=test --interfaces=can0`
+
+### Motor Timeout Parameter
+
+If motors were configured with timeout=0, they won't respond to commands. Use Damiao Debugging Tools to set a non-zero timeout value.
+
+### Verify CAN FD Status
+
+```bash
+ip -d link show can0 | grep fd
+```

diff --git a/docs/source/earthrover_mini_plus.mdx b/docs/source/earthrover_mini_plus.mdx
index 7e27eb93e..d8083336a 100644
--- a/docs/source/earthrover_mini_plus.mdx
+++ b/docs/source/earthrover_mini_plus.mdx
@@ -1,5 +1,11 @@
# EarthRover Mini Plus

+EarthRover Mini Plus
+
The EarthRover Mini Plus is a fully open source mobile robot that connects through the cloud using the Frodobots SDK. This lets you control the robot and record datasets for training AI models.

## What You Need
@@ -12,23 +18,42 @@ The EarthRover Mini Plus is a fully open source mobile robot that connects throu

### Setting Up the Frodobots SDK

-The robot needs the [Frodobots SDK](https://github.com/Frodobots/earth-rovers-sdk) running on your computer. Here's how:
+The robot needs the [Frodobots SDK](https://github.com/frodobots-org/earth-rovers-sdk) running on your computer. Here's how:

1. Download and install the SDK:

```bash
-git clone https://github.com/Frodobots/earth-rovers-sdk.git
+git clone https://github.com/frodobots-org/earth-rovers-sdk.git
cd earth-rovers-sdk
pip install -r requirements.txt
```

-2. Start the SDK:
+2. Save your credentials:
+
+Create a `.env` file containing the SDK API key and bot name provided by the Frodobots team:
+
+```bash
+SDK_API_TOKEN=your_sdk_api_token_here
+BOT_SLUG=your_bot_slug_here
+CHROME_EXECUTABLE_PATH=/path/to/chrome_or_chromium
+# Default value is MAP_ZOOM_LEVEL=18 https://wiki.openstreetmap.org/wiki/Zoom_levels
+MAP_ZOOM_LEVEL=18
+MISSION_SLUG=your_mission_slug_here
+# Image quality between 0.1 and 1.0 (default: 0.8)
+# Recommended: 0.8 for better performance
+IMAGE_QUALITY=0.8
+# Image format: jpeg, png or webp (default: png)
+# Recommended: jpeg for better performance and lower bandwidth usage
+IMAGE_FORMAT=jpeg
+```
+
+3. Start the SDK:

```bash
hypercorn main:app --reload
```

-3. Open your web browser and go to `http://localhost:8000`, then click "Join"
+4. Open your web browser and go to `http://localhost:8000`, then click "Join"

The SDK gives you:

diff --git a/docs/source/envhub.mdx b/docs/source/envhub.mdx
index ba6464460..df103d0dd 100644
--- a/docs/source/envhub.mdx
+++ b/docs/source/envhub.mdx
@@ -2,14 +2,32 @@

The **EnvHub** feature allows you to load simulation environments directly from the Hugging Face Hub with a single line of code. This unlocks a powerful new model for collaboration: instead of environments being locked away inside monolithic libraries, anyone can publish custom environments and share them with the community.

-## Overview
+## What is EnvHub?

-With EnvHub, you can:
+EnvHub lets you create custom robotics simulation environments with your own robot models and scenarios, and make them easily usable by anyone through the LeRobot framework.
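+
+As a deliberately minimal, hedged preview of the `env.py` contract described below, the sketch here assumes `gymnasium` is installed and reuses the stock CartPole task purely for illustration:
+
+```python
+# env.py -- illustrative EnvHub entry point (not an official template)
+import gymnasium as gym
+
+
+def make_env(n_envs: int = 1, use_async_envs: bool = False, cfg=None):
+    """Return a vectorized environment that LeRobot can reset() and step()."""
+    vec_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv
+    return vec_cls([lambda: gym.make("CartPole-v1") for _ in range(n_envs)])
+```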
-- Load environments from the Hub instantly
-- Share your custom simulation tasks with the community
-- Version control your environments using Git
-- Distribute complex physics simulations without packaging hassles
+EnvHub packages are stored on the Hugging Face Hub, and can be seamlessly pulled and used in your AI robotics projects through LeRobot with a single line of code.
+
+Thanks to EnvHub, you can:
+
+1. **Create and publish environments** to the Hugging Face Hub as Git repositories, and distribute complex physics simulations without packaging hassles
+2. **Load environments** dynamically, without installing them as packages
+3. **Version and track** environment changes using Git semantics
+4. **Discover** new simulation tasks shared by the community
+
+This design means you can go from discovering an interesting environment on the Hub to running experiments in seconds, or create your own custom robot and environment without worrying about dependency conflicts or complex installation procedures.
+
+When you create an EnvHub package, you can build anything you want inside it and use any simulation tool you like: this is your own space to play with. The only requirement is that the package contains an `env.py` file that defines the environment and allows LeRobot to load and use your EnvHub package.
+
+This `env.py` file needs to expose a small API so LeRobot can load and run it. In particular, you must provide a `make_env(n_envs: int = 1, use_async_envs: bool = False)` function, optionally accepting an additional `cfg: EnvConfig` argument; this function is the main entry point for LeRobot. It should return one of:
+
+- A `gym.vector.VectorEnv` (most common)
+- A single `gym.Env` (will be automatically wrapped)
+- A dict mapping `{suite_name: {task_id: VectorEnv}}` (for multi-task benchmarks)
+
+You can also pass an `EnvConfig` object to `make_env` to configure the environment (e.g., the number of environments, task, camera name, initial states, control mode, episode length, etc.).
+
+Finally, your environment must implement the standard `gym.vector.VectorEnv` interface so it works with LeRobot, including methods like `reset` and `step`.

## Quick Start
@@ -29,17 +47,6 @@
env = make_env("lerobot/cartpole-env", trust_remote_code=True)

hash for reproducibility and security.

-## What is EnvHub?
-
-EnvHub is a framework that allows researchers and developers to:
-
-1. **Publish environments** to the Hugging Face Hub as Git repositories
-2. **Load environments** dynamically without installing them as packages
-3. **Version and track** environment changes using Git semantics
-4. **Discover** new simulation tasks shared by the community
-
-This design means you can go from discovering an interesting environment on the Hub to running experiments in seconds, without worrying about dependency conflicts or complex installation procedures.
-
## Repository Structure

To make your environment loadable from the Hub, your repository must contain at minimum:

diff --git a/docs/source/envhub_isaaclab_arena.mdx b/docs/source/envhub_isaaclab_arena.mdx
new file mode 100644
index 000000000..828d51bad
--- /dev/null
+++ b/docs/source/envhub_isaaclab_arena.mdx
@@ -0,0 +1,510 @@
+# NVIDIA IsaacLab Arena & LeRobot
+
+LeRobot EnvHub now supports **GPU-accelerated simulation** with IsaacLab Arena for policy evaluation at scale.
+Train and evaluate imitation learning policies with high-fidelity simulation — all integrated into the LeRobot ecosystem.
+ +IsaacLab Arena - GR1 Microwave Environment + +[IsaacLab Arena](https://github.com/isaac-sim/IsaacLab-Arena) integrates with NVIDIA IsaacLab to provide: + +- 🤖 **Humanoid embodiments**: GR1, G1, Galileo with various configurations +- 🎯 **Manipulation & loco-manipulation tasks**: Door opening, pick-and-place, button pressing, and more +- ⚡ **GPU-accelerated rollouts**: Parallel environment execution on NVIDIA GPUs +- 🖼️ **RTX Rendering**: Evaluate vision-based policies with realistic rendering, reflections and refractions +- 📦 **LeRobot-compatible datasets**: Ready for training with GR00T N1x, PI0, SmolVLA, ACT, and Diffusion policies +- 🔄 **EnvHub integration**: Load environments from HuggingFace EnvHub with one line + +## Installation + +### Prerequisites + +Hardware requirements are shared with Isaac Sim, and are detailed in [Isaac Sim Requirements](https://docs.isaacsim.omniverse.nvidia.com/5.1.0/installation/requirements.html). + +- NVIDIA GPU with CUDA support +- NVIDIA driver compatible with IsaacSim 5.1.0 +- Linux (Ubuntu 22.04 / 24.04) + +### Setup + +```bash +# 1. Create conda environment +conda create -y -n lerobot-arena python=3.11 +conda activate lerobot-arena +conda install -y -c conda-forge ffmpeg=7.1.1 + +# 2. Install Isaac Sim 5.1.0 +pip install "isaacsim[all,extscache]==5.1.0" --extra-index-url https://pypi.nvidia.com + +# Accept NVIDIA EULA (required) +export ACCEPT_EULA=Y +export PRIVACY_CONSENT=Y + +# 3. Install IsaacLab 2.3.0 +git clone https://github.com/isaac-sim/IsaacLab.git +cd IsaacLab +git checkout v2.3.0 +./isaaclab.sh -i +cd .. + +# 4. Install IsaacLab Arena +git clone https://github.com/isaac-sim/IsaacLab-Arena.git +cd IsaacLab-Arena +git checkout release/0.1.1 +pip install -e . +cd .. + + +# 5. Install LeRobot +git clone https://github.com/huggingface/lerobot.git +cd lerobot +pip install -e . +cd .. + + +# 6. 
Install additional dependencies +pip install onnxruntime==1.23.2 lightwheel-sdk==1.0.1 vuer[all]==0.0.70 qpsolvers==4.8.1 +pip install numpy==1.26.0 # Isaac Sim 5.1 depends on numpy==1.26.0, this will be fixed in next release +``` + +## Evaluating Policies + +### Pre-trained Policies + +The following trained policies are available: + +| Policy | Architecture | Task | Link | +| :-------------------------- | :----------- | :------------ | :----------------------------------------------------------------------- | +| pi05-arena-gr1-microwave | PI0.5 | GR1 Microwave | [HuggingFace](https://huggingface.co/nvidia/pi05-arena-gr1-microwave) | +| smolvla-arena-gr1-microwave | SmolVLA | GR1 Microwave | [HuggingFace](https://huggingface.co/nvidia/smolvla-arena-gr1-microwave) | + +### Evaluate SmolVLA + +```bash +pip install -e ".[smolvla]" +pip install numpy==1.26.0 # revert numpy to version 1.26 +``` + +```bash +lerobot-eval \ + --policy.path=nvidia/smolvla-arena-gr1-microwave \ + --env.type=isaaclab_arena \ + --env.hub_path=nvidia/isaaclab-arena-envs \ + --rename_map='{"observation.images.robot_pov_cam_rgb": "observation.images.robot_pov_cam"}' \ + --policy.device=cuda \ + --env.environment=gr1_microwave \ + --env.embodiment=gr1_pink \ + --env.object=mustard_bottle \ + --env.headless=false \ + --env.enable_cameras=true \ + --env.video=true \ + --env.video_length=10 \ + --env.video_interval=15 \ + --env.state_keys=robot_joint_pos \ + --env.camera_keys=robot_pov_cam_rgb \ + --trust_remote_code=True \ + --eval.batch_size=1 +``` + +### Evaluate PI0.5 + +```bash +pip install -e ".[pi]" +pip install numpy==1.26.0 # revert numpy to version 1.26 +``` + +PI0.5 requires disabling torch compile for evaluation: + +```bash +TORCH_COMPILE_DISABLE=1 TORCHINDUCTOR_DISABLE=1 lerobot-eval \ + --policy.path=nvidia/pi05-arena-gr1-microwave \ + --env.type=isaaclab_arena \ + --env.hub_path=nvidia/isaaclab-arena-envs \ + --rename_map='{"observation.images.robot_pov_cam_rgb": "observation.images.robot_pov_cam"}' \ + --policy.device=cuda \ + --env.environment=gr1_microwave \ + --env.embodiment=gr1_pink \ + --env.object=mustard_bottle \ + --env.headless=false \ + --env.enable_cameras=true \ + --env.video=true \ + --env.video_length=15 \ + --env.video_interval=15 \ + --env.state_keys=robot_joint_pos \ + --env.camera_keys=robot_pov_cam_rgb \ + --trust_remote_code=True \ + --eval.batch_size=1 +``` + + + To change the number of parallel environments, use the ```--eval.batch_size``` + flag. + + +### What to Expect + +During evaluation, you will see a progress bar showing the running success rate: + +``` +Stepping through eval batches: 8%|██████▍ | 4/50 [00:45<08:06, 10.58s/it, running_success_rate=25.0%] +``` + +### Video Recording + +To enable video recording during evaluation, add the following flags to your command: + +```bash +--env.video=true \ +--env.video_length=15 \ +--env.video_interval=15 +``` + +For more details on video recording, see the [IsaacLab Recording Documentation](https://isaac-sim.github.io/IsaacLab/main/source/how-to/record_video.html). 
+
+When running headless with `--env.headless=true`, you must also enable cameras explicitly for camera-enabled environments:
+
+```bash
+--env.headless=true --env.enable_cameras=true
+```
+
+
+### Output Directory
+
+Evaluation videos are saved to the output directory with the following structure:
+
+```
+outputs/eval/<date>/<time>_<env>_<policy>/videos/<task>_<env_index>/eval_episode_<episode>.mp4
+```
+
+For example:
+
+```
+outputs/eval/2026-01-02/14-38-01_isaaclab_arena_smolvla/videos/gr1_microwave_0/eval_episode_0.mp4
+```
+
+## Training Policies
+
+To learn more about training policies with LeRobot, please refer to the training documentation:
+
+- [SmolVLA](./smolvla)
+- [Pi0.5](./pi05)
+- [GR00T N1.5](./groot)
+
+Sample IsaacLab Arena datasets are available on HuggingFace Hub for experimentation:
+
+| Dataset                                                                                                    | Description                | Frames |
+| :--------------------------------------------------------------------------------------------------------- | :------------------------- | :----- |
+| [Arena-GR1-Manipulation-Task](https://huggingface.co/datasets/nvidia/Arena-GR1-Manipulation-Task-v3)       | GR1 microwave manipulation | ~4K    |
+| [Arena-G1-Loco-Manipulation-Task](https://huggingface.co/datasets/nvidia/Arena-G1-Loco-Manipulation-Task)  | G1 loco-manipulation       | ~4K    |
+
+## Environment Configuration
+
+### Full Configuration Options
+
+```python
+from lerobot.envs.configs import IsaaclabArenaEnv
+
+config = IsaaclabArenaEnv(
+    # Environment selection
+    environment="gr1_microwave",      # Task environment
+    embodiment="gr1_pink",            # Robot embodiment
+    object="power_drill",             # Object to manipulate
+
+    # Simulation settings
+    episode_length=300,               # Max steps per episode
+    headless=True,                    # Run without GUI
+    device="cuda:0",                  # GPU device
+    seed=42,                          # Random seed
+
+    # Observation configuration
+    state_keys="robot_joint_pos",     # State observation keys (comma-separated)
+    camera_keys="robot_pov_cam_rgb",  # Camera observation keys (comma-separated)
+    state_dim=54,                     # Expected state dimension
+    action_dim=36,                    # Expected action dimension
+    camera_height=512,                # Camera image height
+    camera_width=512,                 # Camera image width
+    enable_cameras=True,              # Enable camera observations
+
+    # Video recording
+    video=False,                      # Enable video recording
+    video_length=100,                 # Frames per video
+    video_interval=200,               # Steps between recordings
+
+    # Advanced
+    mimic=False,                      # Enable mimic mode
+    teleop_device=None,               # Teleoperation device
+    disable_fabric=False,             # Disable fabric optimization
+    enable_pinocchio=True,            # Enable Pinocchio for IK
+)
+```
+
+### Using Environment Hub directly for advanced usage
+
+Create a file called `test_env_load_arena.py` or [download from the EnvHub](https://huggingface.co/nvidia/isaaclab-arena-envs/blob/main/tests/test_env_load_arena.py):
+
+```python
+import logging
+from dataclasses import asdict
+from pprint import pformat
+
+import torch
+import tqdm
+
+from lerobot.configs import parser
+from lerobot.configs.eval import EvalPipelineConfig
+
+
+@parser.wrap()
+def main(cfg: EvalPipelineConfig):
+    """Run random action rollout for IsaacLab Arena environment."""
+    logging.info(pformat(asdict(cfg)))
+
+    from lerobot.envs.factory import make_env
+
+    env_dict = make_env(
+        cfg.env,
+        n_envs=cfg.env.num_envs,
+        trust_remote_code=True,
+    )
+    env = next(iter(env_dict.values()))[0]
+    env.reset()
+    for _ in tqdm.tqdm(range(cfg.env.episode_length)):
+        with torch.inference_mode():
+            actions = env.action_space.sample()
+            obs, rewards, terminated, truncated, info = env.step(actions)
+            if terminated.any() or truncated.any():
+                obs, info = env.reset()
+    env.close()
+
+
+if __name__ == "__main__":
+    main()
+```
+
+Run with:
+
+```bash
+python test_env_load_arena.py \
+    --env.environment=g1_locomanip_pnp \
+    --env.embodiment=gr1_pink \
+    --env.object=cracker_box \
+    --env.num_envs=4 \
+    --env.enable_cameras=true \
+    --env.seed=1000 \
+    --env.video=true \
+    --env.video_length=10 \
+    --env.video_interval=15 \
+    --env.headless=false \
+    --env.hub_path=nvidia/isaaclab-arena-envs \
+    --env.type=isaaclab_arena
+```
+
+## Creating New Environments
+
+First, create a new IsaacLab Arena environment by following the [IsaacLab Arena Documentation](https://isaac-sim.github.io/IsaacLab-Arena/release/0.1.1/index.html).
+
+Clone our EnvHub repo:
+
+```bash
+git clone https://huggingface.co/nvidia/isaaclab-arena-envs
+```
+
+Modify the `example_envs.yaml` file based on your new environment.
+[Upload](./envhub#step-3-upload-to-the-hub) your modified repo to HuggingFace EnvHub.
+
+
+  Your IsaacLab Arena environment code must be locally available during
+  evaluation. Users can clone your environment repository separately, or you can
+  bundle the environment code and assets directly in your EnvHub repo.
+
+
+Then, when evaluating, use your new environment:
+
+```bash
+lerobot-eval \
+    --env.hub_path=<your-username>/isaaclab-arena-envs \
+    --env.environment=<your-environment> \
+    ...other flags...
+```
+
+We look forward to your contributions!
+
+## Troubleshooting
+
+### CUDA out of memory
+
+Reduce `--eval.batch_size` or use a GPU with more VRAM:
+
+```bash
+--eval.batch_size=1
+```
+
+### EULA not accepted
+
+Set environment variables before running:
+
+```bash
+export ACCEPT_EULA=Y
+export PRIVACY_CONSENT=Y
+```
+
+### Video recording not working
+
+Enable cameras when running headless:
+
+```bash
+--env.video=true --env.enable_cameras=true --env.headless=true
+```
+
+### Policy output dimension mismatch
+
+Ensure `action_dim` matches your policy:
+
+```bash
+--env.action_dim=36
+```
+
+### libGLU.so.1 Errors during Isaac Sim initialization
+
+Ensure the following dependencies are installed; this error is most likely to occur on headless machines:
+
+```bash
+sudo apt update && sudo apt install -y libglu1-mesa libxt6
+```
+
+## See Also
+
+- [EnvHub Documentation](./envhub) - General EnvHub usage
+- [IsaacLab Arena GitHub](https://github.com/isaac-sim/IsaacLab-Arena)
+- [IsaacLab Documentation](https://isaac-sim.github.io/IsaacLab/)
+
+## Lightwheel LW-BenchHub
+
+[Lightwheel](https://www.lightwheel.ai) is bringing `Lightwheel-Libero-Tasks` and `Lightwheel-RoboCasa-Tasks` with 268 tasks to the LeRobot ecosystem.
+LW-BenchHub collects and generates large-scale datasets via teleoperation that comply with the LeRobot specification, enabling out-of-the-box training and evaluation workflows.
+With the unified interface provided by EnvHub, developers can quickly build end-to-end experimental pipelines.
+
+### Install
+
+Assuming you followed the [Installation](#installation) steps, you can install LW-BenchHub with:
+
+```bash
+conda install pinocchio -c conda-forge -y
+pip install numpy==1.26.0 # revert numpy to version 1.26
+
+sudo apt-get install git-lfs && git lfs install
+
+git clone https://github.com/LightwheelAI/lw_benchhub
+cd lw_benchhub
+git lfs pull # Ensure LFS files (e.g., .usd assets) are downloaded
+
+pip install -e .
+```
+
+For more detailed instructions, please refer to the [LW-BenchHub Documentation](https://docs.lightwheel.net/lw_benchhub/usage/Installation).
+ +### Lightwheel Tasks Dataset + +LW-BenchHub datasets are available on HuggingFace Hub: + +| Dataset | Description | Tasks | Frames | +| :------------------------------------------------------------------------------------------------------------ | :---------------------- | :---- | :----- | +| [Lightwheel-Tasks-X7S](https://huggingface.co/datasets/LightwheelAI/Lightwheel-Tasks-X7S) | X7S LIBERO and RoboCasa | 117 | ~10.3M | +| [Lightwheel-Tasks-Double-Piper](https://huggingface.co/datasets/LightwheelAI/Lightwheel-Tasks-Double-Piper) | Double-Piper LIBERO | 130 | ~6.0M | +| [Lightwheel-Tasks-G1-Controller](https://huggingface.co/datasets/LightwheelAI/Lightwheel-Tasks-G1-Controller) | G1-Controller LIBERO | 62 | ~2.7M | +| [Lightwheel-Tasks-G1-WBC](https://huggingface.co/datasets/LightwheelAI/Lightwheel-Tasks-G1-WBC) | G1-WBC RoboCasa | 32 | ~1.5M | + +For training policies, refer to the [Training Policies](#training-policies) section. + +### Evaluating Policies + +#### Pre-trained Policies + +The following trained policies are available: + +| Policy | Architecture | Task | Layout | Robot | Link | +| :----------------------- | :----------- | :----------------------------- | :--------- | :-------------- | :------------------------------------------------------------------------------------ | +| smolvla-double-piper-pnp | SmolVLA | L90K1PutTheBlackBowlOnThePlate | libero-1-1 | DoublePiper-Abs | [HuggingFace](https://huggingface.co/LightwheelAI/smolvla-double-piper-pnp/tree/main) | + +#### Evaluate SmolVLA + +```bash +lerobot-eval \ + --policy.path=LightwheelAI/smolvla-double-piper-pnp \ + --env.type=isaaclab_arena \ + --rename_map='{"observation.images.left_hand_camera_rgb": "observation.images.left_hand", "observation.images.right_hand_camera_rgb": "observation.images.right_hand", "observation.images.first_person_camera_rgb": "observation.images.first_person"}' \ + --env.hub_path=LightwheelAI/lw_benchhub_env \ + --env.kwargs='{"config_path": "configs/envhub/example.yml"}' \ + --trust_remote_code=true \ + --env.state_keys=joint_pos \ + --env.action_dim=12 \ + --env.camera_keys=left_hand_camera_rgb,right_hand_camera_rgb,first_person_camera_rgb \ + --policy.device=cuda \ + --eval.batch_size=10 \ + --eval.n_episodes=100 +``` + +### Environment Configuration + +Evaluation can be quickly launched by modifying the `robot`, `task`, and `layout` settings in the configuration file. 
+ +#### Full Configuration Options + +```yml +# ========================= +# Basic Settings +# ========================= +disable_fabric: false +device: cuda:0 +sensitivity: 1.0 +step_hz: 50 +enable_cameras: true +execute_mode: eval +episode_length_s: 20.0 # Episode length in seconds, increase if episodes timeout during eval + +# ========================= +# Robot Settings +# ========================= +robot: DoublePiper-Abs # Robot type, DoublePiper-Abs, X7S-Abs, G1-Controller or G1-Controller-DecoupledWBC +robot_scale: 1.0 + +# ========================= +# Task & Scene Settings +# ========================= +task: L90K1PutTheBlackBowlOnThePlate # Task name +scene_backend: robocasa +task_backend: robocasa +debug_assets: null +layout: libero-1-1 # Layout and style ID +sources: + - objaverse + - lightwheel + - aigen_objs +object_projects: [] +usd_simplify: false +seed: 42 + +# ========================= +# Object Placement Retry Settings +# ========================= +max_scene_retry: 4 +max_object_placement_retry: 3 + +resample_objects_placement_on_reset: true +resample_robot_placement_on_reset: true + +# ========================= +# Replay Configuration Settings +# ========================= +replay_cfgs: + add_camera_to_observation: true + render_resolution: [640, 480] +``` + +### See Also + +- [LW-BenchHub GitHub](https://github.com/LightwheelAI/LW-BenchHub) +- [LW-BenchHub Documentation](https://docs.lightwheel.net/lw_benchhub/) diff --git a/docs/source/envhub_leisaac.mdx b/docs/source/envhub_leisaac.mdx index ff848d415..2537700a5 100644 --- a/docs/source/envhub_leisaac.mdx +++ b/docs/source/envhub_leisaac.mdx @@ -137,7 +137,8 @@ from lerobot.teleoperators import ( # noqa: F401 Teleoperator, TeleoperatorConfig, make_teleoperator_from_config, - so101_leader, + so_leader, + bi_so_leader, ) from lerobot.utils.robot_utils import precise_sleep from lerobot.utils.utils import init_logging @@ -196,7 +197,7 @@ def teleop_loop(teleop: Teleoperator, env: gym.Env, fps: int): obs, info = env.reset() dt_s = time.perf_counter() - loop_start - precise_sleep(1 / fps - dt_s) + precise_sleep(max(1 / fps - dt_s, 0.0)) loop_s = time.perf_counter() - loop_start print(f"\ntime: {loop_s * 1e3:.2f}ms ({1 / loop_s:.0f} Hz)") @@ -222,7 +223,7 @@ def teleoperate(cfg: TeleoperateConfig): def main(): teleoperate(TeleoperateConfig( - teleop=so101_leader.SO101LeaderConfig( + teleop=so_leader.SO101LeaderConfig( port="/dev/ttyACM0", id='leader', use_degrees=False, diff --git a/docs/source/groot.mdx b/docs/source/groot.mdx index 729a64656..8bfc22996 100644 --- a/docs/source/groot.mdx +++ b/docs/source/groot.mdx @@ -12,6 +12,12 @@ Developers and researchers can post-train GR00T N1.5 with their own real or synt GR00T N1.5 (specifically the GR00T-N1.5-3B model) is built using pre-trained vision and language encoders. It utilizes a flow matching action transformer to model a chunk of actions, conditioned on vision, language, and proprioception. +An overview of GR00T + Its strong performance comes from being trained on an expansive and diverse humanoid dataset, which includes: - Real captured data from robots. 
@@ -103,7 +109,7 @@ Once you have trained your model using your parameters you can run inference in ```bash lerobot-record \ - --robot.type=bi_so100_follower \ + --robot.type=bi_so_follower \ --robot.left_arm_port=/dev/ttyACM1 \ --robot.right_arm_port=/dev/ttyACM0 \ --robot.id=bimanual_follower \ diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx index 0bc1ca681..84dc6f2f6 100644 --- a/docs/source/il_robots.mdx +++ b/docs/source/il_robots.mdx @@ -58,8 +58,8 @@ lerobot-teleoperate \ ```python -from lerobot.teleoperators.so101_leader import SO101LeaderConfig, SO101Leader -from lerobot.robots.so101_follower import SO101FollowerConfig, SO101Follower +from lerobot.teleoperators.so_leader import SO101LeaderConfig, SO101Leader +from lerobot.robots.so_follower import SO101FollowerConfig, SO101Follower robot_config = SO101FollowerConfig( port="/dev/tty.usbmodem58760431541", @@ -195,9 +195,9 @@ lerobot-record \ from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.utils import hw_to_dataset_features -from lerobot.robots.so100_follower import SO100Follower, SO100FollowerConfig -from lerobot.teleoperators.so100_leader.config_so100_leader import SO100LeaderConfig -from lerobot.teleoperators.so100_leader.so100_leader import SO100Leader +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.teleoperators.so_leader.config_so100_leader import SO100LeaderConfig +from lerobot.teleoperators.so_leader.so100_leader import SO100Leader from lerobot.utils.control_utils import init_keyboard_listener from lerobot.utils.utils import log_say from lerobot.utils.visualization_utils import init_rerun @@ -408,8 +408,8 @@ lerobot-replay \ import time from lerobot.datasets.lerobot_dataset import LeRobotDataset -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.so100_follower import SO100Follower +from lerobot.robots.so_follower.config_so100_follower import SO100FollowerConfig +from lerobot.robots.so_follower.so100_follower import SO100Follower from lerobot.utils.robot_utils import precise_sleep from lerobot.utils.utils import log_say @@ -432,7 +432,7 @@ for idx in range(dataset.num_frames): } robot.send_action(action) - precise_sleep(1.0 / dataset.fps - (time.perf_counter() - t0)) + precise_sleep(max(1.0 / dataset.fps - (time.perf_counter() - t0), 0.0)) robot.disconnect() ``` @@ -531,8 +531,8 @@ from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.utils import hw_to_dataset_features from lerobot.policies.act.modeling_act import ACTPolicy from lerobot.policies.factory import make_pre_post_processors -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.so100_follower import SO100Follower +from lerobot.robots.so_follower.config_so100_follower import SO100FollowerConfig +from lerobot.robots.so_follower.so100_follower import SO100Follower from lerobot.scripts.lerobot_record import record_loop from lerobot.utils.control_utils import init_keyboard_listener from lerobot.utils.utils import log_say diff --git a/docs/source/integrate_hardware.mdx b/docs/source/integrate_hardware.mdx index e1587be91..fa36e7170 100644 --- a/docs/source/integrate_hardware.mdx +++ b/docs/source/integrate_hardware.mdx @@ -18,7 +18,7 @@ If you're using Feetech or Dynamixel motors, LeRobot provides built-in bus inter - 
[`DynamixelMotorsBus`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/motors/dynamixel/dynamixel.py) – for controlling Dynamixel servos

Please refer to the [`MotorsBus`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/motors/motors_bus.py) abstract class to learn about its API.
-For a good example of how it can be used, you can have a look at our own [SO101 follower implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/robots/so101_follower/so101_follower.py)
+For a good example of how it can be used, you can have a look at our own [SO101 follower implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/robots/so_follower/so101_follower/so101_follower.py)

Use these if compatible. Otherwise, you'll need to find or write a Python interface (not covered in this tutorial):

diff --git a/docs/source/lekiwi.mdx b/docs/source/lekiwi.mdx
index 875394d71..b339225d8 100644
--- a/docs/source/lekiwi.mdx
+++ b/docs/source/lekiwi.mdx
@@ -1,5 +1,11 @@
# LeKiwi

+LeKiwi
+
In the steps below, we explain how to assemble the LeKiwi mobile robot.

## Source the parts
@@ -204,7 +210,7 @@ lerobot-calibrate \

```python
-from lerobot.teleoperators.so100_leader import SO100LeaderConfig, SO100Leader
+from lerobot.teleoperators.so_leader import SO100LeaderConfig, SO100Leader

config = SO100LeaderConfig(
    port="/dev/tty.usbmodem58760431551",

diff --git a/docs/source/libero.mdx b/docs/source/libero.mdx
index 3617f3b25..def974531 100644
--- a/docs/source/libero.mdx
+++ b/docs/source/libero.mdx
@@ -42,6 +42,7 @@ lerobot-eval \
```

- `--env.task` picks the suite (`libero_object`, `libero_spatial`, etc.).
+- `--env.task_ids` picks task ids to run (`[0]`, `[1,2,3]`, etc.). Omit this flag (or set it to `null`) to run all tasks in the suite.
- `--eval.batch_size` controls how many environments run in parallel.
- `--eval.n_episodes` sets how many episodes to run in total.

diff --git a/docs/source/omx.mdx b/docs/source/omx.mdx
new file mode 100644
index 000000000..4617ac7bd
--- /dev/null
+++ b/docs/source/omx.mdx
@@ -0,0 +1,197 @@
+## Order and assemble the parts
+
+First, assemble the OMX hardware following the official assembly guide.
+
+OMX Assembly Guide: https://ai.robotis.com/omx/assembly_guide_omx.html
+
+OMX robots are shipped preconfigured from the factory. Motor IDs, communication parameters, and joint offsets are already set, so no additional motor setup or calibration is required before using LeRobot.
+
+## Install LeRobot 🤗
+
+To install LeRobot, follow our [Installation Guide](./installation).
+
+In addition to these instructions, you need to install the Dynamixel SDK:
+
+```bash
+pip install -e ".[dynamixel]"
+```
+
+## Connect the robot
+
+To find the port for each bus servo adapter, run this script:
+
+```bash
+lerobot-find-port
+```
+
+When prompted, disconnect the USB cable from either the leader or follower arm and press Enter. The output will show 'The port of this MotorsBus is [port]'. This identifies the port for the disconnected arm. Repeat for the other arm to identify both ports.
+
+
+
+
+Example output on macOS:
+
+```
+Finding all available ports for the MotorBus.
+['/dev/tty.usbmodem575E0032081', '/dev/tty.usbmodem575E0031751']
+Remove the USB cable from your MotorsBus and press Enter when done.
+
+[...Disconnect corresponding leader or follower arm and press Enter...]
+
+The port of this MotorsBus is /dev/tty.usbmodem575E0032081
+Reconnect the USB cable.
+```
+
+Where the found port is: `/dev/tty.usbmodem575E0032081`, corresponding to your leader or follower arm.
+
+
+
+
+On Linux, we strongly recommend using udev rules to assign persistent and human-readable device names to the OMX leader and follower arms. This avoids issues where device names such as ttyACM0 and ttyACM1 change when the robot is unplugged, replugged, or when the system is rebooted.
+
+#### 1. Find your device serial numbers
+
+You should have obtained the port numbers like ../../ttyACM? for the leader and follower using `lerobot-find-port`. To create udev rules, you need the unique serial number for each OMX device; you can match the ports to serial numbers by listing the devices under:
+
+```bash
+ls -l /dev/serial/by-id/
+```
+
+You will see output similar to:
+
+```bash
+usb-ROBOTIS_OpenRB-150_228BDD7B503059384C2E3120FF0A2B19-if00 -> ../../ttyACM0
+usb-ROBOTIS_OpenRB-150_67E1ED68503059384C2E3120FF092234-if00 -> ../../ttyACM1
+```
+
+In each line, the serial number is the long string after `usb-ROBOTIS_OpenRB-150_` and before `-if00`.
+
+Follower serial: `228BDD7B503059384C2E3120FF0A2B19`
+
+Leader serial: `67E1ED68503059384C2E3120FF092234`
+
+#### 2. Create the udev rule
+
+Create a new udev rule file:
+
+```bash
+sudo nano /etc/udev/rules.d/99-omx.rules
+```
+
+Paste the following lines, replacing the serial numbers with the values you found above:
+
+```bash
+SUBSYSTEM=="tty", ATTRS{idVendor}=="0403", ATTRS{serial}=="228BDD7B503059384C2E3120FF0A2B19", SYMLINK+="omx_follower"
+SUBSYSTEM=="tty", ATTRS{idVendor}=="0403", ATTRS{serial}=="67E1ED68503059384C2E3120FF092234", SYMLINK+="omx_leader"
+```
+
+Save the file and reload udev rules:
+
+```bash
+sudo udevadm control --reload-rules
+sudo udevadm trigger
+```
+
+Now unplug and replug both devices once.
+
+#### 3. Verify the symlinks
+
+Check that the persistent device names exist:
+
+```bash
+ls -l /dev/omx_follower /dev/omx_leader
+```
+
+You should see them pointing to ttyACM\* devices:
+
+```bash
+/dev/omx_follower -> ttyACM*
+/dev/omx_leader -> ttyACM*
+```
+
+These names remain stable across reboots and reconnections.
+
+
+
+
+## Teleoperate
+
+After identifying the correct ports, you can directly teleoperate the follower arm using the leader arm.
+
+
+
+
+### Teleoperate without camera
+
+```bash
+lerobot-teleoperate \
+    --robot.type=omx_follower \
+    --robot.port=<follower_port> \
+    --robot.id=omx_follower_arm \
+    --teleop.type=omx_leader \
+    --teleop.port=<leader_port> \
+    --teleop.id=omx_leader_arm
+```
+
+During teleoperation, motions of the leader arm are mirrored in real time by the follower arm. Since OMX is already preconfigured, teleoperation can begin immediately without any calibration steps.
+
+### Teleoperate with camera
+
+You can also enable camera input during teleoperation by providing a camera configuration for the follower arm.
+
+```bash
+lerobot-teleoperate \
+    --robot.type=omx_follower \
+    --robot.port=<follower_port> \
+    --robot.id=omx_follower_arm \
+    --robot.cameras="{front: {type: opencv, index_or_path: '/dev/video0', width: 640, height: 480, fps: 30}}" \
+    --teleop.type=omx_leader \
+    --teleop.port=<leader_port> \
+    --teleop.id=omx_leader_arm \
+    --display_data=true
+```
+
+When the camera is enabled, the camera stream is displayed in real time and synchronized with the robot state. This setup is useful for visual monitoring and can be reused later for demonstration recording and imitation learning.
+
+
+
+### Teleoperate without camera
+
+```bash
+lerobot-teleoperate \
+    --robot.type=omx_follower \
+    --robot.port=/dev/omx_follower \
+    --robot.id=omx_follower_arm \
+    --teleop.type=omx_leader \
+    --teleop.port=/dev/omx_leader \
+    --teleop.id=omx_leader_arm
+```
+
+During teleoperation, motions of the leader arm are mirrored in real time by the follower arm. Since OMX is already preconfigured, teleoperation can begin immediately without any calibration steps.
+
+### Teleoperate with camera
+
+You can also enable camera input during teleoperation by providing a camera configuration for the follower arm.
+
+```bash
+lerobot-teleoperate \
+    --robot.type=omx_follower \
+    --robot.port=/dev/omx_follower \
+    --robot.id=omx_follower_arm \
+    --robot.cameras="{front: {type: opencv, index_or_path: '/dev/video0', width: 640, height: 480, fps: 30}}" \
+    --teleop.type=omx_leader \
+    --teleop.port=/dev/omx_leader \
+    --teleop.id=omx_leader_arm \
+    --display_data=true
+```
+
+When the camera is enabled, the camera stream is displayed in real time and synchronized with the robot state. This setup is useful for visual monitoring and can be reused later for demonstration recording and imitation learning.
+
+
+
+
+Congrats 🎉, your robot is all set to learn a task on its own.
+
+> If you have any questions or need help, please reach out on [Discord](https://discord.com/invite/robotis).

diff --git a/docs/source/peft_training.mdx b/docs/source/peft_training.mdx
new file mode 100644
index 000000000..dd0b10075
--- /dev/null
+++ b/docs/source/peft_training.mdx
@@ -0,0 +1,62 @@
+# Parameter efficient fine-tuning with 🤗 PEFT
+
+[🤗 PEFT](https://github.com/huggingface/peft) (Parameter-Efficient Fine-Tuning) is a library for efficiently adapting
+large pretrained models such as pre-trained policies (e.g., SmolVLA, π₀, ...) to new tasks without training all
+of the model's parameters, while yielding comparable performance.
+
+Install the `lerobot[peft]` optional package to enable PEFT support.
+
+To read about all the possible methods of adaptation, please refer to the [🤗 PEFT docs](https://huggingface.co/docs/peft/index).
+
+## Training SmolVLA
+
+In this section we'll show you how to train a pre-trained SmolVLA policy with PEFT on the Libero dataset.
+For brevity we're only training on the `libero_spatial` subset. We will use `lerobot/smolvla_base` as the model
+to fine-tune parameter-efficiently:
+
+```
+lerobot-train \
+    --policy.path=lerobot/smolvla_base \
+    --policy.repo_id=your_hub_name/my_libero_smolvla \
+    --dataset.repo_id=HuggingFaceVLA/libero \
+    --policy.output_features=null \
+    --policy.input_features=null \
+    --policy.optimizer_lr=1e-3 \
+    --policy.scheduler_decay_lr=1e-4 \
+    --env.type=libero \
+    --env.task=libero_spatial \
+    --steps=100000 \
+    --batch_size=32 \
+    --peft.method_type=LORA \
+    --peft.r=64
+```
+
+Note the `--peft.method_type` parameter that lets you select which PEFT method to use. Here we use
+[LoRA](https://huggingface.co/docs/peft/main/en/package_reference/lora) (Low-Rank Adaptation), which is probably the most
+popular fine-tuning method to date. Low-rank adaptation means that we only fine-tune a matrix of comparatively low rank
+instead of the full weight matrix. This rank can be specified using the `--peft.r` parameter. The higher the rank,
+the closer you get to full fine-tuning.
+
+There are more complex methods that have more parameters. These are not yet supported; feel free to raise an issue
+if you want to see a specific PEFT method supported.
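+
+For intuition, the `--peft.*` flags above correspond roughly to a standard 🤗 PEFT configuration. The sketch below is illustrative only; LeRobot assembles the actual configuration internally from the CLI flags:
+
+```python
+from peft import LoraConfig
+
+# Roughly what --peft.method_type=LORA --peft.r=64 selects: rank-64 low-rank
+# adapters injected into the attention projections listed in target_modules.
+lora_cfg = LoraConfig(r=64, target_modules=["q_proj", "v_proj"])
+```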
+
+By default, PEFT will target the `q_proj` and `v_proj` layers of the LM expert in SmolVLA. It will also target the
+state and action projection matrices, as they are most likely task-dependent. If you need to target different layers,
+you can use `--peft.target_modules` to specify which layers to target. You can refer to the respective PEFT method's
+documentation to see what inputs are supported (e.g., [LoRA's target_modules documentation](https://huggingface.co/docs/peft/main/en/package_reference/lora#peft.LoraConfig.target_modules)).
+Usually a list of suffixes or a regex is supported. For example, to target the MLPs of the `lm_expert` instead of
+the `q` and `v` projections, use:
+
+```bash
+--peft.target_modules='(model\.vlm_with_expert\.lm_expert\..*\.(down|gate|up)_proj|.*\.(state_proj|action_in_proj|action_out_proj|action_time_mlp_in|action_time_mlp_out))'
+```
+
+In case you need to fully fine-tune a layer instead of just adapting it, you can supply a list of layer suffixes
+to the `--peft.full_training_modules` parameter:
+
+```bash
+--peft.full_training_modules=["state_proj"]
+```
+
+The learning rate and the scheduled target learning rate can usually be scaled up by a factor of 10 compared to the
+learning rates used for full fine-tuning (e.g., 1e-4 for full fine-tuning becomes 1e-3 with LoRA).
diff --git a/docs/source/phone_teleop.mdx b/docs/source/phone_teleop.mdx
index 76e3c367c..06e524975 100644
--- a/docs/source/phone_teleop.mdx
+++ b/docs/source/phone_teleop.mdx
@@ -44,7 +44,7 @@ Modify the examples to use `PhoneOS.IOS` or `PhoneOS.ANDROID` in `PhoneConfig`.

Teleoperation example:

-```36:43:examples/phone_so100_teleop.py
+```python
from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS

teleop_config = PhoneConfig(phone_os=PhoneOS.IOS)  # or PhoneOS.ANDROID
@@ -103,7 +103,7 @@ Additionally you can customize mapping or safety limits by editing the processor

- Kinematics are used in multiple steps. We use [Placo](https://github.com/Rhoban/placo) which is a wrapper around Pinocchio for handling our kinematics. We construct the kinematics object by passing the robot's URDF and target frame. We set `target_frame_name` to the gripper frame.

-  ```examples/phone_to_so100/teleoperate.py
+  ```python
  kinematics_solver = RobotKinematics(
      urdf_path="./SO101/so101_new_calib.urdf",
      target_frame_name="gripper_frame_link",
@@ -114,7 +114,7 @@ Additionally you can customize mapping or safety limits by editing the processor

- The `MapPhoneActionToRobotAction` step converts the calibrated phone pose and inputs into target deltas and gripper commands, below is shown what the step outputs.

-  ```src/lerobot/teleoperators/phone/phone_processor.py
+  ```python
  action["enabled"] = enabled
  action["target_x"] = -pos[1] if enabled else 0.0
  action["target_y"] = pos[0] if enabled else 0.0
@@ -127,7 +127,7 @@ Additionally you can customize mapping or safety limits by editing the processor

- The `EEReferenceAndDelta` step converts target deltas to an absolute desired EE pose, storing a reference on enable, the `end_effector_step_sizes` are the step sizes for the EE pose and can be modified to change the motion speed.

-  ```examples/phone_to_so100/teleoperate.py
+  ```python
  EEReferenceAndDelta(
      kinematics=kinematics_solver,
      end_effector_step_sizes={"x": 0.5, "y": 0.5, "z": 0.5},
@@ -138,7 +138,7 @@ Additionally you can customize mapping or safety limits by editing the processor

- The `EEBoundsAndSafety` step clamps EE motion to a workspace and checks for large ee step jumps to ensure safety.
The `end_effector_bounds` are the bounds for the EE pose and can be modified to change the workspace. The `max_ee_step_m` are the step limits for the EE pose and can be modified to change the safety limits. - ```examples/phone_to_so100/teleoperate.py + ```python EEBoundsAndSafety( end_effector_bounds={"min": [-1.0, -1.0, -1.0], "max": [1.0, 1.0, 1.0]}, max_ee_step_m=0.10, @@ -147,7 +147,7 @@ Additionally you can customize mapping or safety limits by editing the processor - The `GripperVelocityToJoint` step turns a velocity‑like gripper input into absolute gripper position using the current measured state. The `speed_factor` is the factor by which the velocity is multiplied. - ```examples/phone_to_so100/teleoperate.py + ```python GripperVelocityToJoint(speed_factor=20.0) ``` @@ -157,7 +157,7 @@ We use different IK initial guesses in the kinematic steps. As initial guess eit - Closed loop (used in record/eval): sets `initial_guess_current_joints=True` so IK starts from the measured joints each frame. - ```examples/phone_to_so100/record.py + ```python InverseKinematicsEEToJoints( kinematics=kinematics_solver, motor_names=list(robot.bus.motors.keys()), @@ -167,7 +167,7 @@ We use different IK initial guesses in the kinematic steps. As initial guess eit - Open loop (used in replay): sets `initial_guess_current_joints=False` so IK continues from the previous IK solution rather than the measured state. This preserves action stability when we replay without feedback. - ```examples/phone_to_so100/replay.py + ```python InverseKinematicsEEToJoints( kinematics=kinematics_solver, motor_names=list(robot.bus.motors.keys()), diff --git a/docs/source/pi0.mdx b/docs/source/pi0.mdx index d15f7e91f..93e0b4c88 100644 --- a/docs/source/pi0.mdx +++ b/docs/source/pi0.mdx @@ -6,6 +6,12 @@ π₀ represents a breakthrough in robotics as the first general-purpose robot foundation model developed by [Physical Intelligence](https://www.physicalintelligence.company/blog/pi0). Unlike traditional robot programs that are narrow specialists programmed for repetitive motions, π₀ is designed to be a generalist policy that can understand visual inputs, interpret natural language instructions, and control a variety of different robots across diverse tasks. +An overview of Pi0 + ### The Vision for Physical Intelligence As described by Physical Intelligence, while AI has achieved remarkable success in digital domains, from chess-playing to drug discovery, human intelligence still dramatically outpaces AI in the physical world. To paraphrase Moravec's paradox, winning a game of chess represents an "easy" problem for AI, but folding a shirt or cleaning up a table requires solving some of the most difficult engineering problems ever conceived. π₀ represents a first step toward developing artificial physical intelligence that enables users to simply ask robots to perform any task they want, just like they can with large language models. 
@@ -64,6 +70,8 @@ python src/lerobot/scripts/lerobot_train.py \ --policy.compile_model=true \ --policy.gradient_checkpointing=true \ --policy.dtype=bfloat16 \ + --policy.freeze_vision_encoder=false \ + --policy.train_expert_only=false \ --steps=3000 \ --policy.device=cuda \ --batch_size=32 @@ -79,6 +87,15 @@ python src/lerobot/scripts/lerobot_train.py \ - [lerobot/pi0_base](https://huggingface.co/lerobot/pi0_base) - [lerobot/pi0_libero](https://huggingface.co/lerobot/pi0_libero) (specifically trained on the Libero dataset) +### Training Parameters Explained + +| Parameter | Default | Description | +| ----------------------- | ------- | ------------------------------------------- | +| `freeze_vision_encoder` | `false` | Do not freeze the vision encoder | +| `train_expert_only` | `false` | Do not freeze the VLM, train all parameters | + +**💡 Tip**: Setting `train_expert_only=true` freezes the VLM and trains only the action expert and projections, allowing finetuning with reduced memory usage. + ## License This model follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi). diff --git a/docs/source/pi05.mdx b/docs/source/pi05.mdx index 29b797935..dbf118aa3 100644 --- a/docs/source/pi05.mdx +++ b/docs/source/pi05.mdx @@ -67,6 +67,8 @@ python src/lerobot/scripts/lerobot_train.py\ --policy.gradient_checkpointing=true \ --wandb.enable=true \ --policy.dtype=bfloat16 \ + --policy.freeze_vision_encoder=false \ + --policy.train_expert_only=false \ --steps=3000 \ --policy.device=cuda \ --batch_size=32 @@ -82,6 +84,15 @@ python src/lerobot/scripts/lerobot_train.py\ - [lerobot/pi05_base](https://huggingface.co/lerobot/pi05_base) - [lerobot/pi05_libero](https://huggingface.co/lerobot/pi05_libero) (specifically trained on the Libero dataset) +### Training Parameters Explained + +| Parameter | Default | Description | +| ----------------------- | ------- | ------------------------------------------- | +| `freeze_vision_encoder` | `false` | Do not freeze the vision encoder | +| `train_expert_only` | `false` | Do not freeze the VLM, train all parameters | + +**💡 Tip**: Setting `train_expert_only=true` freezes the VLM and trains only the action expert and projections, allowing finetuning with reduced memory usage. + If your dataset is not converted with `quantiles`, you can convert it with the following command: ```bash diff --git a/docs/source/pi0fast.mdx b/docs/source/pi0fast.mdx new file mode 100644 index 000000000..c4230fa79 --- /dev/null +++ b/docs/source/pi0fast.mdx @@ -0,0 +1,246 @@ +# π₀-FAST (Pi0-FAST) + +π₀-FAST is a **Vision-Language-Action model for general robot control** that uses autoregressive next-token prediction to model continuous robot actions. + +## Model Overview + +π₀-FAST combines the power of Vision-Language Models with a novel action tokenization approach called **FAST (Frequency-space Action Sequence Tokenization)**. This enables training autoregressive VLAs on highly dexterous tasks that are impossible with standard binning-based discretization, while training **up to 5x faster** than diffusion-based approaches like π₀. + +An overview of Pi0-FAST + +### Why FAST? + +Standard approaches for robot action tokenization use simple per-dimension, per-timestep binning schemes. While passable for simple behaviors, this rapidly breaks down for complex and dexterous skills that require precision and high-frequency control. 
+
+FAST solves this by compressing action sequences using signal processing techniques, resulting in a dense sequence of action tokens that can be predicted autoregressively, just like language tokens.
+
+### How FAST Tokenization Works
+
+The FAST tokenizer compresses action sequences through the following steps (see the sketch after this section for a worked example):
+
+1. **Normalize**: Take a continuous action chunk of shape `(H, D)` where `H` is the horizon and `D` is the action dimension. Normalize using one of the supported normalization methods (`QUANTILES` is recommended, as it handles outliers).
+
+2. **Discrete Cosine Transform (DCT)**: Apply DCT (via scipy) to each action dimension separately. DCT is a transform commonly used in image and audio codecs (JPEG, MP3).
+
+3. **Quantization**: Round and remove insignificant coefficients for each action dimension, producing a sparse frequency matrix.
+
+4. **Flatten**: Flatten the matrix into a 1D vector, with low-frequency components first.
+
+5. **Byte Pair Encoding (BPE)**: Train a BPE tokenizer to compress the DCT coefficients into dense action tokens, typically achieving **10x compression** over prior tokenization approaches.
+
+This approach can transform **any existing VLM** into a VLA by training it to predict these FAST tokens.
+
+## Installation Requirements
+
+1. Install LeRobot by following our [Installation Guide](./installation).
+2. Install π₀-FAST dependencies by running:
+
+   ```bash
+   pip install -e ".[pi]"
+   ```
+
+   > [!NOTE]
+   > For lerobot 0.4.0, if you want to install the `pi` extra, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
+   >
+   > This will be solved in the next patch release.
+
+## Training a Custom FAST Tokenizer
+
+You have two options for the FAST tokenizer:
+
+1. **Use the pre-trained tokenizer**: The `physical-intelligence/fast` tokenizer was trained on 1M+ real robot action sequences and works as a general-purpose tokenizer.
+
+2. **Train your own tokenizer**: For maximum performance on your specific dataset, you can finetune the tokenizer on your own data.
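+
+To make steps 1-4 above concrete, here is a minimal NumPy/SciPy sketch of the compression stage
+(the BPE stage is omitted, and the names here are illustrative rather than LeRobot's internals):
+
+```python
+import numpy as np
+from scipy.fft import dct
+
+
+def fast_compress(chunk: np.ndarray, scale: float = 10.0) -> np.ndarray:
+    """Sketch of FAST steps 2-4 on an already-normalized (H, D) action chunk."""
+    coeffs = dct(chunk, axis=0, norm="ortho")  # DCT over time, per action dimension
+    quantized = np.round(coeffs * scale).astype(np.int32)  # small coefficients round to 0
+    # Row 0 holds the lowest frequency, so a row-major flatten puts
+    # low-frequency components first; BPE would then compress this sequence.
+    return quantized.flatten()
+
+
+tokens = fast_compress(np.random.randn(10, 6))  # e.g., horizon 10, 6 action dims
+```
+
+The `scale` factor plays the same role as the `--scale` parameter below: larger values keep more
+high-frequency detail, smaller values discard more of it.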
+ +### Training Your Own Tokenizer + +```bash +lerobot-train-tokenizer \ + --repo_id "user/my-lerobot-dataset" \ + --action_horizon 10 \ + --encoded_dims "0:6" \ + --vocab_size 1024 \ + --scale 10.0 \ + --normalization_mode QUANTILES \ + --output_dir "./my_fast_tokenizer" \ + --push_to_hub \ + --hub_repo_id "username/my-action-tokenizer" +``` + +### Key Tokenizer Parameters + +| Parameter | Description | Default | +| ---------------------- | --------------------------------------------------------------------------------- | ------------ | +| `--repo_id` | LeRobot dataset repository ID | Required | +| `--action_horizon` | Number of future actions in each chunk | `10` | +| `--encoded_dims` | Comma-separated dimension ranges to encode (e.g., `"0:6,7:23"`) | `"0:6,7:23"` | +| `--vocab_size` | BPE vocabulary size | `1024` | +| `--scale` | DCT scaling factor for quantization | `10.0` | +| `--normalization_mode` | Normalization mode (`MEAN_STD`, `MIN_MAX`, `QUANTILES`, `QUANTILE10`, `IDENTITY`) | `QUANTILES` | +| `--sample_fraction` | Fraction of chunks to sample per episode | `0.1` | + +## Usage + +To use π₀-FAST in LeRobot, specify the policy type as: + +```python +policy.type=pi0_fast +``` + +## Training + +For training π₀-FAST, you can use the LeRobot training script: + +```bash +lerobot-train \ + --dataset.repo_id=your_dataset \ + --policy.type=pi0_fast \ + --output_dir=./outputs/pi0fast_training \ + --job_name=pi0fast_training \ + --policy.pretrained_path=lerobot/pi0_fast_base \ + --policy.dtype=bfloat16 \ + --policy.gradient_checkpointing=true \ + --policy.chunk_size=10 \ + --policy.n_action_steps=10 \ + --policy.max_action_tokens=256 \ + --steps=100000 \ + --batch_size=4 \ + --policy.device=cuda +``` + +### Key Training Parameters + +| Parameter | Description | Default | +| -------------------------------------- | -------------------------------------------------- | ---------------------------- | +| `--policy.gradient_checkpointing=true` | Reduces memory usage significantly during training | `false` | +| `--policy.dtype=bfloat16` | Use mixed precision training for efficiency | `float32` | +| `--policy.chunk_size` | Number of action steps to predict (action horizon) | `50` | +| `--policy.n_action_steps` | Number of action steps to execute | `50` | +| `--policy.max_action_tokens` | Maximum number of FAST tokens per action chunk | `256` | +| `--policy.action_tokenizer_name` | FAST tokenizer to use | `physical-intelligence/fast` | +| `--policy.compile_model=true` | Enable torch.compile for faster training | `false` | + +## Inference + +### KV-Caching for Fast Inference + +π₀-FAST supports **KV-caching**, a widely used optimization in LLM inference. This caches the key-value pairs from the attention mechanism, avoiding redundant computation during autoregressive decoding. + +```python +# KV-caching is enabled by default +policy.use_kv_cache=true +``` + +### Inference Example + +```python +from lerobot.policies.pi0_fast import PI0FastPolicy, PI0FastConfig + +# Load the policy +policy = PI0FastPolicy.from_pretrained("your-model-path") + +# During inference +actions = policy.predict_action_chunk(batch) +``` + +## Model Architecture + +π₀-FAST uses a PaliGemma-based architecture: + +- **Vision Encoder**: SigLIP vision tower for image understanding +- **Language Model**: Gemma 2B for processing language instructions and predicting action tokens + +The model takes images, text instructions, and robot state as input, and outputs discrete FAST tokens that are decoded back to continuous actions. 
+
+## Configuration Options
+
+| Parameter            | Description                                      | Default    |
+| -------------------- | ------------------------------------------------ | ---------- |
+| `paligemma_variant`  | VLM backbone variant (`gemma_300m`, `gemma_2b`)  | `gemma_2b` |
+| `max_state_dim`      | Maximum state vector dimension (padded)          | `32`       |
+| `max_action_dim`     | Maximum action vector dimension (padded)         | `32`       |
+| `temperature`        | Sampling temperature (0.0 for greedy)            | `0.0`      |
+| `max_decoding_steps` | Maximum decoding steps                           | `256`      |
+| `use_kv_cache`       | Enable KV caching for faster inference           | `true`     |
+
+## Comparison with π₀
+
+| Feature               | π₀                        | π₀-FAST                      |
+| --------------------- | ------------------------- | ---------------------------- |
+| Action Representation | Flow Matching (Diffusion) | Autoregressive Tokens (FAST) |
+| Training Speed        | 1x                        | **5x faster**                |
+| Dexterity             | High                      | High                         |
+| Inference Method      | Iterative Denoising       | Autoregressive Decoding      |
+| KV-Caching            | N/A                       | Supported                    |
+
+## Reproducing π₀-FAST results
+
+We reproduce the results of π₀-FAST on the LIBERO benchmark using the LeRobot implementation. We take the LeRobot π₀-FAST base model [lerobot/pi0fast-base](https://huggingface.co/lerobot/pi0fast-base) and finetune it for an additional 40k steps in bfloat16, with a batch size of 256 on 8 H100 GPUs, using the [HuggingFace LIBERO dataset](https://huggingface.co/datasets/HuggingFaceVLA/libero).
+
+The finetuned model can be found here:
+
+- **π₀-FAST LIBERO**: [lerobot/pi0fast-libero](https://huggingface.co/lerobot/pi0fast-libero)
+
+With the following training command:
+
+```bash
+lerobot-train \
+  --dataset.repo_id=lerobot/libero \
+  --output_dir=outputs/libero_pi0fast \
+  --job_name=libero_pi0fast \
+  --policy.path=lerobot/pi0fast_base \
+  --policy.dtype=bfloat16 \
+  --steps=100000 \
+  --save_freq=20000 \
+  --batch_size=4 \
+  --policy.device=cuda \
+  --policy.scheduler_warmup_steps=4000 \
+  --policy.scheduler_decay_steps=100000 \
+  --policy.scheduler_decay_lr=1e-5 \
+  --policy.gradient_checkpointing=true \
+  --policy.chunk_size=10 \
+  --policy.n_action_steps=10 \
+  --policy.max_action_tokens=256 \
+  --policy.empty_cameras=1
+```
+
+We then evaluate the finetuned model using the LeRobot LIBERO implementation by running the following command:
+
+```bash
+tasks="libero_object,libero_spatial,libero_goal,libero_10"
+lerobot-eval \
+  --policy.path=lerobot/pi0fast-libero \
+  --policy.max_action_tokens=256 \
+  --env.type=libero \
+  --policy.gradient_checkpointing=false \
+  --env.task=${tasks} \
+  --eval.batch_size=1 \
+  --eval.n_episodes=1 \
+  --rename_map='{"observation.images.image":"observation.images.base_0_rgb","observation.images.image2":"observation.images.left_wrist_0_rgb"}'
+```
+
+**Note:** We set `n_action_steps=10`, similar to the original OpenPI implementation.
+
+### Results
+
+We obtain the following results on the LIBERO benchmark:
+
+| Model       | LIBERO Spatial | LIBERO Object | LIBERO Goal | LIBERO 10 | Average  |
+| ----------- | -------------- | ------------- | ----------- | --------- | -------- |
+| **π₀-FAST** | 70.0           | 100.0         | 100.0       | 60.0      | **82.5** |
+
+The full evaluation output folder, including videos, is available [here](https://drive.google.com/drive/folders/1HXpwPTRm4hx6g1sF2P7OOqGG0TwPU7LQ?usp=sharing).
+
+## License
+
+This model follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).
+ +## References + +- [FAST: Efficient Robot Action Tokenization](https://www.physicalintelligence.company/research/fast) - Physical Intelligence Blog +- [OpenPI Repository](https://github.com/Physical-Intelligence/openpi) - Original implementation +- [FAST Tokenizer on Hugging Face](https://huggingface.co/physical-intelligence/fast) - Pre-trained tokenizer diff --git a/docs/source/policy_walloss_README.md b/docs/source/policy_walloss_README.md index 78548bd8d..93c0ad392 100644 --- a/docs/source/policy_walloss_README.md +++ b/docs/source/policy_walloss_README.md @@ -1,20 +1,30 @@ # WALL-OSS -This repository contains the Hugging Face port of **WALL-OSS**, a Vision-Language-Action model for cross-embodiment robotic control based on Qwen2.5-VL with flow matching/FAST action prediction. +This repository contains the Hugging Face port of [**WALL-OSS**](https://x2robot.com/en/research/68bc2cde8497d7f238dde690), a Vision-Language-Action model for cross-embodiment robotic control based on Qwen2.5-VL with flow matching/FAST action prediction. --- ## Model Overview | Feature | Description | -| ------------------ | ----------------------------------------------------- | --- | +| ------------------ | ----------------------------------------------------- | | Base Model | Qwen2.5-VL (Vision-Language Model) | | Action Prediction | Flow Matching (diffusion) or FAST (discrete tokens) | -| Architecture | Mixture of Experts (MoE) with action-specific routing | | +| Architecture | Mixture of Experts (MoE) with action-specific routing | | Multi-Modal Inputs | Vision (images/videos), Language, Proprioception | --- +## Additional Resources + +Paper: https://arxiv.org/pdf/2509.11766 + +Official Repository: https://github.com/X-Square-Robot/wall-x + +Hugging Face: https://huggingface.co/x-square-robot + +--- + ## Citation If you use this work, please cite: @@ -32,4 +42,4 @@ If you use this work, please cite: ## License -This port follows the **Apache 2.0 License**. +This model follows the **Apache 2.0 License**, consistent with the original [WallX repository](https://github.com/X-Square-Robot/wall-x). diff --git a/docs/source/processors_robots_teleop.mdx b/docs/source/processors_robots_teleop.mdx index 3d8dcb409..093a8e0e3 100644 --- a/docs/source/processors_robots_teleop.mdx +++ b/docs/source/processors_robots_teleop.mdx @@ -30,7 +30,7 @@ Each of these pipelines handle different conversions between different action an Below is an example of the three pipelines that we use in the phone to SO-100 follower examples: -```69:90:examples/phone_so100_record.py +```python phone_to_robot_ee_pose_processor = RobotProcessorPipeline[RobotAction, RobotAction]( # teleop -> dataset action steps=[ MapPhoneActionToRobotAction(platform=teleop_config.phone_os), @@ -84,7 +84,7 @@ Dataset features are determined by the keys saved in the dataset. 
Each step can Below is and example of how we declare features with the `transform_features` method in the phone to SO-100 follower examples: -```src/lerobot/robots/so100_follower/robot_kinematic_processor.py +```python def transform_features( self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]] ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]: @@ -103,7 +103,7 @@ Here we declare what PolicyFeatures we modify in this step, so we know what feat Below is an example of how we aggregate and merge features in the phone to SO-100 record example: -```121:145:examples/phone_so100_record.py +```python features=combine_feature_dicts( # Run the feature contract of the pipelines # This tells you how the features would look like after the pipeline steps diff --git a/docs/source/reachy2.mdx b/docs/source/reachy2.mdx index 7d3dc1b60..51b09acd2 100644 --- a/docs/source/reachy2.mdx +++ b/docs/source/reachy2.mdx @@ -38,6 +38,7 @@ docker run --rm -it \ start_rviz:=true start_sdk_server:=true mujoco:=true ``` +> [!NOTE] > If MuJoCo runs slowly (low simulation frequency), append `-e LD_LIBRARY_PATH="/opt/host-libs:$LD_LIBRARY_PATH" \` to the previous command to improve performance: > > ``` @@ -141,7 +142,7 @@ If you choose this option but still want to use the VR teleoperation application First add reachy2 and reachy2_teleoperator to the imports of the record script. Then you can use the following command: ```bash -python -m lerobot.record \ +lerobot-record \ --robot.type=reachy2 \ --robot.ip_address=192.168.0.200 \ --robot.id=r2-0000 \ @@ -150,6 +151,7 @@ python -m lerobot.record \ --teleop.type=reachy2_teleoperator \ --teleop.ip_address=192.168.0.200 \ --teleop.with_mobile_base=false \ + --robot.with_torso_camera=true \ --dataset.repo_id=pollen_robotics/record_test \ --dataset.single_task="Reachy 2 recording test" \ --dataset.num_episodes=1 \ @@ -165,7 +167,7 @@ python -m lerobot.record \ **Extended setup overview (all options included):** ```bash -python -m lerobot.record \ +lerobot-record \ --robot.type=reachy2 \ --robot.ip_address=192.168.0.200 \ --robot.use_external_commands=true \ @@ -177,6 +179,8 @@ python -m lerobot.record \ --robot.with_left_teleop_camera=true \ --robot.with_right_teleop_camera=true \ --robot.with_torso_camera=false \ + --robot.camera_width=640 \ + --robot.camera_height=480 \ --robot.disable_torque_on_disconnect=false \ --robot.max_relative_target=5.0 \ --teleop.type=reachy2_teleoperator \ @@ -212,9 +216,10 @@ Must be set to true if a compliant Reachy 2 is used to control another one. From our initial tests, recording **all** joints when only some are moving can reduce model quality with certain policies. To avoid this, you can exclude specific parts from recording and replay using: -```` +```bash --robot.with_=false -```, +``` + with `` being one of : `mobile_base`, `l_arm`, `r_arm", `neck`, `antennas`. It determine whether the corresponding part is recorded in the observations. True if not set. @@ -222,49 +227,60 @@ By default, **all parts are recorded**. The same per-part mechanism is available in `reachy2_teleoperator` as well. -```` - +```bash --teleop.with\_ - ``` + with `` being one of : `mobile_base`, `l_arm`, `r_arm", `neck`, `antennas`. Determine whether the corresponding part is recorded in the actions. True if not set. > **Important:** In a given session, the **enabled parts must match** on both the robot and the teleoperator. 
-For example, if the robot runs with `--robot.with_mobile_base=false`, the teleoperator must disable the same part `--teleoperator.with_mobile_base=false`.
+> For example, if the robot runs with `--robot.with_mobile_base=false`, the teleoperator must disable the same part `--teleoperator.with_mobile_base=false`.

##### Use the relevant cameras

-You can do the same for **cameras**. By default, only the **teleoperation cameras** are recorded (both `left_teleop_camera` and `right_teleop_camera`). Enable or disable each camera with:
+You can do the same for **cameras**. Enable or disable each camera with default parameters using:

+```bash
+--robot.with_left_teleop_camera= \
+--robot.with_right_teleop_camera= \
+--robot.with_torso_camera=
```

---robot.with_left_teleop_camera=
---robot.with_right_teleop_camera=
---robot.with_torso_camera=
+By default, no camera is recorded: all camera arguments are set to `false`.

+If you want to, you can use custom `width` and `height` parameters for Reachy 2's cameras using the `--robot.camera_width` and `--robot.camera_height` arguments:

-````
+```bash
+--robot.camera_width=1920 \
+--robot.camera_height=1080
+```
+
+This will change the resolution of all 3 default robot cameras (enabled by the boolean arguments above).
+
+You can also add cameras beyond the robot's built-in ones, as usual, with:
+
+```bash
+--robot.cameras="{ extra: {type: opencv, index_or_path: 42, width: 640, height: 480, fps: 30}}"
+```

## Step 2: Replay

Make sure the robot is configured with the same parts as the dataset:

```bash
-python -m lerobot.replay \
+lerobot-replay \
  --robot.type=reachy2 \
  --robot.ip_address=192.168.0.200 \
  --robot.use_external_commands=false \
  --robot.with_mobile_base=false \
  --dataset.repo_id=pollen_robotics/record_test \
  --dataset.episode=0
-  --display_data=true
-````
+```

## Step 3: Train

```bash
-python -m lerobot.scripts.train \
+lerobot-train \
  --dataset.repo_id=pollen_robotics/record_test \
  --policy.type=act \
  --output_dir=outputs/train/reachy2_test \
@@ -277,10 +293,9 @@ python -m lerobot.scripts.train \

## Step 4: Evaluate

```bash
-python -m lerobot.record \
+lerobot-eval \
  --robot.type=reachy2 \
  --robot.ip_address=192.168.0.200 \
-  --display_data=false \
  --dataset.repo_id=pollen_robotics/eval_record_test \
  --dataset.single_task="Evaluate reachy2 policy" \
  --dataset.num_episodes=10 \
diff --git a/docs/source/sarm.mdx b/docs/source/sarm.mdx
index 321097692..65e49792b 100644
--- a/docs/source/sarm.mdx
+++ b/docs/source/sarm.mdx
@@ -4,6 +4,12 @@ SARM (Stage-Aware Reward Modeling) is a video-based reward modeling framework fo

**Paper**: [SARM: Stage-Aware Reward Modeling for Long Horizon Robot Manipulation](https://arxiv.org/abs/2509.25358)

+An overview of SARM
+
## Why Reward Models?

Standard behavior cloning treats all demonstration frames equally, but real-world robot datasets are messy. They contain hesitations, corrections, and variable-quality trajectories. Reward models solve this by learning a generalizable notion of **task progress** from demonstrations: given video frames and a task description, they predict how close the robot is to completing the task (0→1). This learned "progress signal" can be used in multiple ways, two promising applications are: (1) **weighted imitation learning** (RA-BC), where high-progress frames receive more weight during policy training, and (2) **reinforcement learning**, where the reward model provides dense rewards for online or offline policy improvement.
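+
+To illustrate the weighted imitation learning idea, here is one plausible weighting scheme (the exact
+RA-BC formulation in the paper may differ); `progress` below stands in for a reward model's output:
+
+```python
+import torch
+
+
+def rabc_weights(progress: torch.Tensor, beta: float = 5.0) -> torch.Tensor:
+    """Map per-frame task progress in [0, 1] to non-negative imitation weights."""
+    delta = torch.diff(progress, prepend=progress[:1])  # per-frame progress gain
+    return torch.exp(beta * delta.clamp(min=0.0))  # frames that advance the task weigh more
+
+
+# Weighted behavior-cloning loss over a toy 8-frame trajectory with 6D actions
+pred, target = torch.randn(8, 6), torch.randn(8, 6)
+weights = rabc_weights(torch.linspace(0.0, 1.0, 8))
+loss = (weights * ((pred - target) ** 2).mean(dim=-1)).mean()
+```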
diff --git a/docs/source/so100.mdx b/docs/source/so100.mdx index 3c73ae801..399781ef4 100644 --- a/docs/source/so100.mdx +++ b/docs/source/so100.mdx @@ -103,7 +103,7 @@ lerobot-setup-motors \ ```python -from lerobot.robots.so100_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig config = SO100FollowerConfig( port="/dev/tty.usbmodem585A0076841", @@ -177,7 +177,7 @@ lerobot-setup-motors \ ```python -from lerobot.teleoperators.so100_leader import SO100Leader, SO100LeaderConfig +from lerobot.teleoperators.so_leader import SO100Leader, SO100LeaderConfig config = SO100LeaderConfig( port="/dev/tty.usbmodem585A0076841", @@ -579,7 +579,7 @@ lerobot-calibrate \ ```python -from lerobot.robots.so100_follower import SO100FollowerConfig, SO100Follower +from lerobot.robots.so_follower import SO100FollowerConfig, SO100Follower config = SO100FollowerConfig( port="/dev/tty.usbmodem585A0076891", @@ -617,7 +617,7 @@ lerobot-calibrate \ ```python -from lerobot.teleoperators.so100_leader import SO100LeaderConfig, SO100Leader +from lerobot.teleoperators.so_leader import SO100LeaderConfig, SO100Leader config = SO100LeaderConfig( port="/dev/tty.usbmodem58760431551", diff --git a/docs/source/so101.mdx b/docs/source/so101.mdx index 57e8d691d..7c9df588a 100644 --- a/docs/source/so101.mdx +++ b/docs/source/so101.mdx @@ -1,5 +1,18 @@ # SO-101 +
+  SO-101
+
+ In the steps below, we explain how to assemble our flagship robot, the SO-101. ## Source the parts @@ -125,7 +138,7 @@ lerobot-setup-motors \ ```python -from lerobot.robots.so101_follower import SO101Follower, SO101FollowerConfig +from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig config = SO101FollowerConfig( port="/dev/tty.usbmodem585A0076841", @@ -201,7 +214,7 @@ lerobot-setup-motors \ ```python -from lerobot.teleoperators.so101_leader import SO101Leader, SO101LeaderConfig +from lerobot.teleoperators.so_leader import SO101Leader, SO101LeaderConfig config = SO101LeaderConfig( port="/dev/tty.usbmodem585A0076841", @@ -364,7 +377,7 @@ lerobot-calibrate \ ```python -from lerobot.robots.so101_follower import SO101FollowerConfig, SO101Follower +from lerobot.robots.so_follower import SO101FollowerConfig, SO101Follower config = SO101FollowerConfig( port="/dev/tty.usbmodem585A0076891", @@ -413,7 +426,7 @@ lerobot-calibrate \ ```python -from lerobot.teleoperators.so101_leader import SO101LeaderConfig, SO101Leader +from lerobot.teleoperators.so_leader import SO101LeaderConfig, SO101Leader config = SO101LeaderConfig( port="/dev/tty.usbmodem58760431551", diff --git a/docs/source/training_time_rtc.mdx b/docs/source/training_time_rtc.mdx new file mode 100644 index 000000000..7e7e64fac --- /dev/null +++ b/docs/source/training_time_rtc.mdx @@ -0,0 +1,86 @@ +# Training-Time RTC + +Training-Time RTC teaches the model to handle inference delay during training. +It feeds the **ground-truth action prefix** to the model and trains only on the remaining postfix actions. +This keeps chunk transitions smooth without doing any inference-time inpainting. + +Based on: [Training-Time Action Conditioning for Efficient Real-Time Chunking](https://arxiv.org/abs/2512.05964). + +LeRobot supports this for `pi0`, `pi05` and `smolvla` without changing model parameters. + +--- + +## How It Works + +### At Training Time + +- Sample a delay `d` per batch element. +- Keep the first `d` action steps as **ground truth** (no noise). +- Add noise only to the postfix actions. +- Set the flow-matching timestep to **1.0** for prefix tokens and normal timesteps for postfix tokens. +- Mask the loss to only train on the postfix. + +### At Inference Time + +When `rtc_training_config.enabled=true`, the model uses training-time RTC inference: + +- Replace prefix positions in `x_t` with previous chunk's leftover actions. +- Set timestep to **1.0** for prefix positions. + +--- + +## Quick Start (CLI) + +```bash +lerobot-train \ + --policy.type=pi0 \ + --dataset.repo_id=your/dataset \ + --policy.rtc_training_config.enabled=true \ + --policy.rtc_training_config.min_delay=0 \ + --policy.rtc_training_config.max_delay=6 \ + --policy.rtc_training_config.delay_distribution=UNIFORM +``` + +--- + +## Inference with Training-Time RTC + +After training with `rtc_training_config`, use the same config at inference. The model will automatically use training-time RTC inference: + +```python +policy = PI0Policy.from_pretrained("path/to/trained/model") +# rtc_training_config is loaded from the saved config + +actions = policy.predict_action_chunk( + batch, + inference_delay=5, # estimated delay in timesteps + prev_chunk_left_over=previous_actions, # from previous chunk +) +``` + +--- + +## Key Parameters + +`RTCTrainingConfig` is available on the policy config (`pi0`, `pi05`, `smolvla`, `xvla`): + +- **`enabled`**: Toggle training-time RTC (both training and inference). +- **`min_delay` / `max_delay`**: Delay range (inclusive). 
+- **`delay_distribution`**:
+  - `UNIFORM`: uniform in `[min_delay, max_delay]`
+  - `EXP`: exponentially decayed distribution over delays
+- **`exp_decay`**: Exponential decay factor for `EXP` sampling.
+
+---
+
+## Notes and Recommendations
+
+- Start with `min_delay=0` and `max_delay` around your expected worst-case inference delay.
+- Use `EXP` if you want more supervision on smaller delays.
+
+---
+
+## Related Docs
+
+- [Real-Time Chunking (Inference-Time RTC)](./rtc)
+- [Pi0](./pi0), [Pi0.5](./pi05), [SmolVLA](./smolvla)
diff --git a/docs/source/unitree_g1.mdx b/docs/source/unitree_g1.mdx
index af06fd742..e6bffdf1b 100644
--- a/docs/source/unitree_g1.mdx
+++ b/docs/source/unitree_g1.mdx
@@ -1,21 +1,21 @@
-# Unitree G1 Robot Setup and Control
+# Unitree G1

This guide covers the complete setup process for the Unitree G1 humanoid, from initial connection to running gr00t_wbc locomotion.

-## About the Unitree G1
+## About

-We offer support for both 29 and 23 DOF G1. We introduce:
+We support both the 29 and 23 DOF G1 EDU versions. We introduce:

-- **`unitree g1` robot class, handling low level communication with the humanoid**
-- **ZMQ socket bridge** for remote communication over WiFi, allowing one to deploy policies remotely instead of over ethernet or directly on the Orin
-- **GR00T locomotion policy** for bipedal walking and balance
-- **MuJoCo simulation mode** for testing policies without the physical robot
+- **`unitree g1` robot class, handling low-level read/write from/to the humanoid**
+- **ZMQ socket bridge** for remote communication and camera streaming, allowing for remote policy deployment over WLAN, Ethernet, or directly on the robot
+- **Locomotion policies** from NVIDIA GR00T and Amazon FAR Holosoma
+- **Simulation mode** for testing policies in MuJoCo without the physical robot

---

-## Part 1: Connect to Robot over Ethernet
+## Connection guide

-### Step 1: Configure Your Computer's Ethernet Interface
+### Step 1: Configure Ethernet Interface

Set a static IP on the same subnet as the robot:

@@ -26,7 +26,7 @@ sudo ip addr add 192.168.123.200/24 dev enp131s0
sudo ip link set enp131s0 up
```

-**Note**: The robot's Ethernet IP is fixed at `192.168.123.164`. Your computer must use `192.168.123.x` where x ≠ 164.
+**Note**: The G1's Ethernet IP is fixed at `192.168.123.164`. Your computer must use `192.168.123.x` with x ≠ 164.

### Step 2: SSH into the Robot

@@ -35,25 +35,24 @@ ssh unitree@192.168.123.164
# Password: 123
```

-You should now be connected to the robot's onboard computer.
+You should now be connected to the G1's Orin.

---

## Part 2: Enable WiFi on the Robot

-Once connected via Ethernet, follow these steps to enable WiFi:
+`wlan0` is disabled by default on the G1. To enable it:

### Step 1: Enable WiFi Hardware

```bash
-# Unblock WiFi radio
sudo rfkill unblock wifi
sudo rfkill unblock all

-# Bring up WiFi interface
+# Bring up wlan0
sudo ip link set wlan0 up

-# Enable NetworkManager control
+# Enable NetworkManager control of wlan0
sudo nmcli radio wifi on
sudo nmcli device set wlan0 managed yes
sudo systemctl restart NetworkManager
@@ -73,7 +72,7 @@ sudo iptables -A FORWARD -i wlp132s0f0 -o enp131s0 -m state --state RELATED,ESTA
sudo iptables -A FORWARD -i enp131s0 -o wlp132s0f0 -j ACCEPT
```

-**On the robot:**
+**On the G1:**

```bash
# Add laptop as default gateway
@@ -111,7 +110,7 @@ ssh unitree@
# Password: 123
```

-Replace `` with your robot's actual WiFi IP address (e.g., `172.18.129.215`).
+Replace `` with your robot's actual WiFi IP address.
---

@@ -147,9 +146,9 @@ python src/lerobot/robots/unitree_g1/run_g1_server.py

---

-## Part 4: Running GR00T Locomotion
+## Part 4: Controlling the robot

-With the robot server running, you can now control the robot from your laptop.
+With the robot server running, you can now control the robot remotely. Let's launch a locomotion policy.

### Step 1: Install LeRobot on your machine

@@ -172,34 +171,30 @@ Edit the config file to match your robot's WiFi IP:
robot_ip: str = ""  # Replace with your robot's WiFi IP.
```

-**Note**: When running directly on the G1 (not remotely), set `robot_ip: str = "127.0.0.1"` instead.
-
### Step 3: Run the Locomotion Policy

```bash
# Run GR00T locomotion controller
python examples/unitree_g1/gr00t_locomotion.py --repo-id "nepyope/GR00T-WholeBodyControl_g1"
+
+# Run Holosoma locomotion controller
+python examples/unitree_g1/holosoma_locomotion.py
+
```

-### Step 4: Control with Remote
-
-- **Left stick**: Forward/backward and left/right movement
-- **Right stick**: Rotation
-- **R1 button**: Raise waist height
-- **R2 button**: Lower waist height
-
Press `Ctrl+C` to stop the policy.

---

-## Extra: Running in Simulation Mode (MuJoCo)
+## Running in Simulation Mode (MuJoCo)

-You can now test and develop policies without a physical robot using MuJoCo. to do so set `is_simulation=True` in config.
+You can now test policies in MuJoCo before unleashing them on the physical robot. To do so, simply set `is_simulation=True` in the config.

## Additional Resources

- [Unitree SDK Documentation](https://github.com/unitreerobotics/unitree_sdk2_python)
-- [GR00T Policy Repository](https://huggingface.co/nepyope/GR00T-WholeBodyControl_g1)
+- [GR00T-WholeBodyControl](https://github.com/NVlabs/GR00T-WholeBodyControl)
+- [Holosoma](https://github.com/amazon-far/holosoma)
- [LeRobot Documentation](https://github.com/huggingface/lerobot)
- [Unitree_IL_Lerobot](https://github.com/unitreerobotics/unitree_IL_lerobot)
diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx
index 29e16ea0a..9e662604e 100644
--- a/docs/source/using_dataset_tools.mdx
+++ b/docs/source/using_dataset_tools.mdx
@@ -95,26 +95,26 @@ Convert an image-based dataset to video format, creating a new LeRobotDataset wh

# Local-only: Save to a custom output directory (no hub push)
lerobot-edit-dataset \
  --repo_id lerobot/pusht_image \
-  --operation.type convert_to_video \
+  --operation.type convert_image_to_video \
  --operation.output_dir /path/to/output/pusht_video

# Save with new repo_id (local storage)
lerobot-edit-dataset \
  --repo_id lerobot/pusht_image \
  --new_repo_id lerobot/pusht_video \
-  --operation.type convert_to_video
+  --operation.type convert_image_to_video

# Convert and push to Hugging Face Hub
lerobot-edit-dataset \
  --repo_id lerobot/pusht_image \
  --new_repo_id lerobot/pusht_video \
-  --operation.type convert_to_video \
+  --operation.type convert_image_to_video \
  --push_to_hub true

# Convert with custom video codec and quality settings
lerobot-edit-dataset \
  --repo_id lerobot/pusht_image \
-  --operation.type convert_to_video \
+  --operation.type convert_image_to_video \
  --operation.output_dir outputs/pusht_video \
  --operation.vcodec libsvtav1 \
  --operation.pix_fmt yuv420p \
@@ -124,16 +124,23 @@ lerobot-edit-dataset \

# Convert only specific episodes
lerobot-edit-dataset \
  --repo_id lerobot/pusht_image \
-  --operation.type convert_to_video \
+  --operation.type convert_image_to_video \
  --operation.output_dir outputs/pusht_video \
  --operation.episode_indices "[0, 1, 2, 5, 10]"

# 
Convert with multiple workers for parallel processing
lerobot-edit-dataset \
  --repo_id lerobot/pusht_image \
-  --operation.type convert_to_video \
+  --operation.type convert_image_to_video \
  --operation.output_dir outputs/pusht_video \
  --operation.num_workers 8
+
+# For memory-constrained systems, you can specify limits:
+lerobot-edit-dataset \
+  --repo_id lerobot/pusht_image \
+  --operation.type convert_image_to_video \
+  --operation.max_episodes_per_batch 50 \
+  --operation.max_frames_per_batch 10000
```

**Parameters:**
diff --git a/docs/source/walloss.mdx b/docs/source/walloss.mdx
index 12e9b1fc7..c0756c087 100644
--- a/docs/source/walloss.mdx
+++ b/docs/source/walloss.mdx
@@ -8,6 +8,12 @@ X Square Robot’s WALL-OSS is now integrated into Hugging Face’s LeRobot ecos

The WALL-OSS team is building the embodied foundation model to capture and compress the world's most valuable data: the continuous, high-fidelity stream of physical interaction. By creating a direct feedback loop between the model's decisions and the body's lived experience, the emergence of a truly generalizable intelligence is enabled—one that understands not just how the world works, but how to act effectively within it.

+An overview of WALL-OSS
+
Technically, WALL-OSS introduces a tightly coupled multimodal architecture (tightly-coupled MoE structure) that integrates both discrete and continuous action modeling strategies. Through a two-stage training pipeline (Inspiration → Integration), the model gradually unifies semantic reasoning and high-frequency action generation. Its core innovations include:

- **Embodied perception–enhanced multimodal pretraining**: Large-scale training on unified vision–language–action data to strengthen spatial, causal, and manipulation understanding.
diff --git a/examples/backward_compatibility/replay.py b/examples/backward_compatibility/replay.py
index ed52a24c9..ed78d016f 100644
--- a/examples/backward_compatibility/replay.py
+++ b/examples/backward_compatibility/replay.py
@@ -41,8 +41,7 @@ from lerobot.robots import (  # noqa: F401
    RobotConfig,
    koch_follower,
    make_robot_from_config,
-    so100_follower,
-    so101_follower,
+    so_follower,
)
from lerobot.utils.constants import ACTION
from lerobot.utils.robot_utils import precise_sleep
@@ -97,7 +96,7 @@ def replay(cfg: ReplayConfig):
        robot.send_action(action)

        dt_s = time.perf_counter() - start_episode_t
-        precise_sleep(1 / dataset.fps - dt_s)
+        precise_sleep(max(1 / dataset.fps - dt_s, 0.0))

    robot.disconnect()
diff --git a/examples/lekiwi/record.py b/examples/lekiwi/record.py
index 67d826ccb..18b9f857e 100644
--- a/examples/lekiwi/record.py
+++ b/examples/lekiwi/record.py
@@ -21,7 +21,7 @@ from lerobot.robots.lekiwi.config_lekiwi import LeKiwiClientConfig
from lerobot.robots.lekiwi.lekiwi_client import LeKiwiClient
from lerobot.scripts.lerobot_record import record_loop
from lerobot.teleoperators.keyboard import KeyboardTeleop, KeyboardTeleopConfig
-from lerobot.teleoperators.so100_leader import SO100Leader, SO100LeaderConfig
+from lerobot.teleoperators.so_leader import SO100Leader, SO100LeaderConfig
from lerobot.utils.constants import ACTION, OBS_STR
from lerobot.utils.control_utils import init_keyboard_listener
from lerobot.utils.utils import log_say
diff --git a/examples/lekiwi/teleoperate.py b/examples/lekiwi/teleoperate.py
index c4d20ebbe..feb3cbb01 100644
--- a/examples/lekiwi/teleoperate.py
+++ b/examples/lekiwi/teleoperate.py
@@ -18,7 +18,7 @@ import time

from lerobot.robots.lekiwi import LeKiwiClient, LeKiwiClientConfig
from
lerobot.teleoperators.keyboard.teleop_keyboard import KeyboardTeleop, KeyboardTeleopConfig -from lerobot.teleoperators.so100_leader import SO100Leader, SO100LeaderConfig +from lerobot.teleoperators.so_leader import SO100Leader, SO100LeaderConfig from lerobot.utils.robot_utils import precise_sleep from lerobot.utils.visualization_utils import init_rerun, log_rerun_data diff --git a/examples/phone_to_so100/evaluate.py b/examples/phone_to_so100/evaluate.py index 5a47b8ffa..246c923aa 100644 --- a/examples/phone_to_so100/evaluate.py +++ b/examples/phone_to_so100/evaluate.py @@ -34,12 +34,11 @@ from lerobot.processor.converters import ( transition_to_observation, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( ForwardKinematicsJointsToEE, InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower from lerobot.scripts.lerobot_record import record_loop from lerobot.utils.control_utils import init_keyboard_listener from lerobot.utils.utils import log_say diff --git a/examples/phone_to_so100/record.py b/examples/phone_to_so100/record.py index e563d8eb3..7b5b704e2 100644 --- a/examples/phone_to_so100/record.py +++ b/examples/phone_to_so100/record.py @@ -26,15 +26,14 @@ from lerobot.processor.converters import ( transition_to_observation, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( EEBoundsAndSafety, EEReferenceAndDelta, ForwardKinematicsJointsToEE, GripperVelocityToJoint, InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower from lerobot.scripts.lerobot_record import record_loop from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS from lerobot.teleoperators.phone.phone_processor import MapPhoneActionToRobotAction diff --git a/examples/phone_to_so100/replay.py b/examples/phone_to_so100/replay.py index a7b18a53c..875025dfc 100644 --- a/examples/phone_to_so100/replay.py +++ b/examples/phone_to_so100/replay.py @@ -23,11 +23,10 @@ from lerobot.processor.converters import ( robot_action_observation_to_transition, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower from lerobot.utils.constants import ACTION from lerobot.utils.robot_utils import precise_sleep from lerobot.utils.utils import log_say @@ -96,7 +95,7 @@ def main(): # Send action to robot _ = robot.send_action(joint_action) - precise_sleep(1.0 / dataset.fps - (time.perf_counter() - t0)) + precise_sleep(max(1.0 / dataset.fps - (time.perf_counter() - t0), 0.0)) # Clean up robot.disconnect() diff --git a/examples/phone_to_so100/teleoperate.py b/examples/phone_to_so100/teleoperate.py index 2ac8b3cce..6eaaec806 
100644 --- a/examples/phone_to_so100/teleoperate.py +++ b/examples/phone_to_so100/teleoperate.py @@ -21,14 +21,13 @@ from lerobot.processor.converters import ( robot_action_observation_to_transition, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( EEBoundsAndSafety, EEReferenceAndDelta, GripperVelocityToJoint, InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS from lerobot.teleoperators.phone.phone_processor import MapPhoneActionToRobotAction from lerobot.teleoperators.phone.teleop_phone import Phone diff --git a/examples/rtc/eval_with_real_robot.py b/examples/rtc/eval_with_real_robot.py index 6f051485a..1470899d9 100644 --- a/examples/rtc/eval_with_real_robot.py +++ b/examples/rtc/eval_with_real_robot.py @@ -94,9 +94,9 @@ from lerobot.rl.process import ProcessSignalHandler from lerobot.robots import ( # noqa: F401 Robot, RobotConfig, + bi_so_follower, koch_follower, - so100_follower, - so101_follower, + so_follower, ) from lerobot.robots.utils import make_robot_from_config from lerobot.utils.constants import OBS_IMAGES @@ -455,7 +455,18 @@ def demo_cli(cfg: RTCDemoConfig): if cfg.policy.type == "pi05" or cfg.policy.type == "pi0": config.compile_model = cfg.use_torch_compile - policy = policy_class.from_pretrained(cfg.policy.pretrained_path, config=config) + if config.use_peft: + from peft import PeftConfig, PeftModel + + peft_pretrained_path = cfg.policy.pretrained_path + peft_config = PeftConfig.from_pretrained(peft_pretrained_path) + + policy = policy_class.from_pretrained( + pretrained_name_or_path=peft_config.base_model_name_or_path, config=config + ) + policy = PeftModel.from_pretrained(policy, peft_pretrained_path, config=peft_config) + else: + policy = policy_class.from_pretrained(cfg.policy.pretrained_path, config=config) # Turn on RTC policy.config.rtc_config = cfg.rtc diff --git a/examples/so100_to_so100_EE/evaluate.py b/examples/so100_to_so100_EE/evaluate.py index 90973d373..87d188f99 100644 --- a/examples/so100_to_so100_EE/evaluate.py +++ b/examples/so100_to_so100_EE/evaluate.py @@ -34,12 +34,11 @@ from lerobot.processor.converters import ( transition_to_observation, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( ForwardKinematicsJointsToEE, InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower from lerobot.scripts.lerobot_record import record_loop from lerobot.utils.control_utils import init_keyboard_listener from lerobot.utils.utils import log_say diff --git a/examples/so100_to_so100_EE/record.py b/examples/so100_to_so100_EE/record.py index 6bfdfe32d..eead7a9a8 100644 --- a/examples/so100_to_so100_EE/record.py +++ b/examples/so100_to_so100_EE/record.py @@ -27,16 +27,14 @@ from lerobot.processor.converters import ( transition_to_observation, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import 
SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( EEBoundsAndSafety, ForwardKinematicsJointsToEE, InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower from lerobot.scripts.lerobot_record import record_loop -from lerobot.teleoperators.so100_leader.config_so100_leader import SO100LeaderConfig -from lerobot.teleoperators.so100_leader.so100_leader import SO100Leader +from lerobot.teleoperators.so_leader import SO100Leader, SO100LeaderConfig from lerobot.utils.control_utils import init_keyboard_listener from lerobot.utils.utils import log_say from lerobot.utils.visualization_utils import init_rerun diff --git a/examples/so100_to_so100_EE/replay.py b/examples/so100_to_so100_EE/replay.py index 9951b139d..7d35a7b44 100644 --- a/examples/so100_to_so100_EE/replay.py +++ b/examples/so100_to_so100_EE/replay.py @@ -24,11 +24,10 @@ from lerobot.processor.converters import ( robot_action_observation_to_transition, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower from lerobot.utils.constants import ACTION from lerobot.utils.robot_utils import precise_sleep from lerobot.utils.utils import log_say @@ -97,7 +96,7 @@ def main(): # Send action to robot _ = robot.send_action(joint_action) - precise_sleep(1.0 / dataset.fps - (time.perf_counter() - t0)) + precise_sleep(max(1.0 / dataset.fps - (time.perf_counter() - t0), 0.0)) # Clean up robot.disconnect() diff --git a/examples/so100_to_so100_EE/teleoperate.py b/examples/so100_to_so100_EE/teleoperate.py index 21299103b..71d2899de 100644 --- a/examples/so100_to_so100_EE/teleoperate.py +++ b/examples/so100_to_so100_EE/teleoperate.py @@ -23,15 +23,13 @@ from lerobot.processor.converters import ( robot_action_to_transition, transition_to_robot_action, ) -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( EEBoundsAndSafety, ForwardKinematicsJointsToEE, InverseKinematicsEEToJoints, ) -from lerobot.robots.so100_follower.so100_follower import SO100Follower -from lerobot.teleoperators.so100_leader.config_so100_leader import SO100LeaderConfig -from lerobot.teleoperators.so100_leader.so100_leader import SO100Leader +from lerobot.teleoperators.so_leader import SO100Leader, SO100LeaderConfig from lerobot.utils.robot_utils import precise_sleep from lerobot.utils.visualization_utils import init_rerun, log_rerun_data diff --git a/examples/tutorial/act/act_using_example.py b/examples/tutorial/act/act_using_example.py index b268e8790..60bc802d8 100644 --- a/examples/tutorial/act/act_using_example.py +++ b/examples/tutorial/act/act_using_example.py @@ -5,8 +5,7 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata from lerobot.policies.act.modeling_act import ACTPolicy from lerobot.policies.factory import 
make_pre_post_processors from lerobot.policies.utils import build_inference_frame, make_robot_action -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.so100_follower import SO100Follower +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig MAX_EPISODES = 5 MAX_STEPS_PER_EPISODE = 20 diff --git a/examples/tutorial/async-inf/robot_client.py b/examples/tutorial/async-inf/robot_client.py index fff7b15b3..db6ead3fe 100644 --- a/examples/tutorial/async-inf/robot_client.py +++ b/examples/tutorial/async-inf/robot_client.py @@ -4,7 +4,7 @@ from lerobot.async_inference.configs import RobotClientConfig from lerobot.async_inference.helpers import visualize_action_queue_size from lerobot.async_inference.robot_client import RobotClient from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig -from lerobot.robots.so100_follower import SO100FollowerConfig +from lerobot.robots.so_follower import SO100FollowerConfig def main(): @@ -30,6 +30,7 @@ def main(): robot=robot_cfg, server_address=server_address, policy_device="mps", + client_device="cpu", policy_type="act", pretrained_name_or_path="/robot_learning_tutorial_act", chunk_size_threshold=0.5, # g diff --git a/examples/tutorial/diffusion/diffusion_using_example.py b/examples/tutorial/diffusion/diffusion_using_example.py index 96cc607b6..d8ac75cfe 100644 --- a/examples/tutorial/diffusion/diffusion_using_example.py +++ b/examples/tutorial/diffusion/diffusion_using_example.py @@ -5,8 +5,7 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata from lerobot.policies.diffusion.modeling_diffusion import DiffusionPolicy from lerobot.policies.factory import make_pre_post_processors from lerobot.policies.utils import build_inference_frame, make_robot_action -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.so100_follower import SO100Follower +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig MAX_EPISODES = 5 MAX_STEPS_PER_EPISODE = 20 diff --git a/examples/tutorial/pi0/using_pi0_example.py b/examples/tutorial/pi0/using_pi0_example.py index 362092ccf..056c3d81a 100644 --- a/examples/tutorial/pi0/using_pi0_example.py +++ b/examples/tutorial/pi0/using_pi0_example.py @@ -5,8 +5,7 @@ from lerobot.datasets.utils import hw_to_dataset_features from lerobot.policies.factory import make_pre_post_processors from lerobot.policies.pi0.modeling_pi0 import PI0Policy from lerobot.policies.utils import build_inference_frame, make_robot_action -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.so100_follower import SO100Follower +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig MAX_EPISODES = 5 MAX_STEPS_PER_EPISODE = 20 diff --git a/examples/tutorial/rl/hilserl_example.py b/examples/tutorial/rl/hilserl_example.py index c49233ebb..980ac7985 100644 --- a/examples/tutorial/rl/hilserl_example.py +++ b/examples/tutorial/rl/hilserl_example.py @@ -14,8 +14,8 @@ from lerobot.policies.sac.modeling_sac import SACPolicy from lerobot.policies.sac.reward_model.modeling_classifier import Classifier from lerobot.rl.buffer import ReplayBuffer from lerobot.rl.gym_manipulator import make_robot_env -from lerobot.robots.so100_follower import SO100FollowerConfig -from lerobot.teleoperators.so100_leader import SO100LeaderConfig +from lerobot.robots.so_follower import SO100FollowerConfig 
+from lerobot.teleoperators.so_leader import SO100LeaderConfig from lerobot.teleoperators.utils import TeleopEvents LOG_EVERY = 10 diff --git a/examples/tutorial/smolvla/using_smolvla_example.py b/examples/tutorial/smolvla/using_smolvla_example.py index d4219f316..ce3aa7bca 100644 --- a/examples/tutorial/smolvla/using_smolvla_example.py +++ b/examples/tutorial/smolvla/using_smolvla_example.py @@ -5,8 +5,7 @@ from lerobot.datasets.utils import hw_to_dataset_features from lerobot.policies.factory import make_pre_post_processors from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy from lerobot.policies.utils import build_inference_frame, make_robot_action -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig -from lerobot.robots.so100_follower.so100_follower import SO100Follower +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig MAX_EPISODES = 5 MAX_STEPS_PER_EPISODE = 20 diff --git a/examples/unitree_g1/gr00t_locomotion.py b/examples/unitree_g1/gr00t_locomotion.py index 7cc4e03be..0123b5206 100644 --- a/examples/unitree_g1/gr00t_locomotion.py +++ b/examples/unitree_g1/gr00t_locomotion.py @@ -13,16 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Example: GR00T Locomotion with Pre-loaded Policies - -This example demonstrates the NEW pattern for loading GR00T policies externally -and passing them to the robot class. -""" import argparse import logging -import threading import time from collections import deque @@ -31,24 +24,26 @@ import onnxruntime as ort from huggingface_hub import hf_hub_download from lerobot.robots.unitree_g1.config_unitree_g1 import UnitreeG1Config +from lerobot.robots.unitree_g1.g1_utils import G1_29_JointIndex from lerobot.robots.unitree_g1.unitree_g1 import UnitreeG1 +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) + GROOT_DEFAULT_ANGLES = np.zeros(29, dtype=np.float32) -GROOT_DEFAULT_ANGLES[[0, 6]] = -0.1 # hip pitch -GROOT_DEFAULT_ANGLES[[3, 9]] = 0.3 # knee -GROOT_DEFAULT_ANGLES[[4, 10]] = -0.2 # ankle pitch +GROOT_DEFAULT_ANGLES[[0, 6]] = -0.1 # Hip pitch +GROOT_DEFAULT_ANGLES[[3, 9]] = 0.3 # Knee +GROOT_DEFAULT_ANGLES[[4, 10]] = -0.2 # Ankle pitch MISSING_JOINTS = [] -G1_MODEL = "g1_23" # or "g1_29" +G1_MODEL = "g1_23" # Or "g1_29" if G1_MODEL == "g1_23": - MISSING_JOINTS = [12, 14, 20, 21, 27, 28] # waist yaw/pitch, wrist pitch/yaw - -LOCOMOTION_ACTION_SCALE = 0.25 - -LOCOMOTION_CONTROL_DT = 0.02 + MISSING_JOINTS = [12, 14, 20, 21, 27, 28] # Waist yaw/pitch, wrist pitch/yaw +# Control parameters +ACTION_SCALE = 0.25 +CONTROL_DT = 0.02 # 50Hz ANG_VEL_SCALE: float = 0.25 DOF_POS_SCALE: float = 1.0 DOF_VEL_SCALE: float = 0.05 @@ -61,12 +56,12 @@ DEFAULT_GROOT_REPO_ID = "nepyope/GR00T-WholeBodyControl_g1" def load_groot_policies( repo_id: str = DEFAULT_GROOT_REPO_ID, ) -> tuple[ort.InferenceSession, ort.InferenceSession]: - """Load GR00T dual-policy system (Balance + Walk) from Hugging Face Hub. + """Load GR00T dual-policy system (Balance + Walk) from the hub. Args: repo_id: Hugging Face Hub repository ID containing the ONNX policies. 
""" - logger.info(f"Loading GR00T dual-policy system from Hugging Face Hub ({repo_id})...") + logger.info(f"Loading GR00T dual-policy system from the hub ({repo_id})...") # Download ONNX policies from Hugging Face Hub balance_path = hf_hub_download( @@ -88,15 +83,7 @@ def load_groot_policies( class GrootLocomotionController: - """ - Handles GR00T-style locomotion control for the Unitree G1 robot. - - This controller manages: - - Dual-policy system (Balance + Walk) - - 29-joint observation processing - - 15D action output (legs + waist) - - Policy inference and motor command generation - """ + """GR00T lower-body locomotion controller for the Unitree G1.""" def __init__(self, policy_balance, policy_walk, robot, config): self.policy_balance = policy_balance @@ -104,9 +91,9 @@ class GrootLocomotionController: self.robot = robot self.config = config - self.locomotion_cmd = np.array([0.0, 0.0, 0.0], dtype=np.float32) # vx, vy, theta_dot + self.cmd = np.array([0.0, 0.0, 0.0], dtype=np.float32) # vx, vy, theta_dot - # GR00T-specific state + # Robot state self.groot_qj_all = np.zeros(29, dtype=np.float32) self.groot_dqj_all = np.zeros(29, dtype=np.float32) self.groot_action = np.zeros(15, dtype=np.float32) @@ -116,47 +103,39 @@ class GrootLocomotionController: self.groot_height_cmd = 0.74 # Default base height self.groot_orientation_cmd = np.array([0.0, 0.0, 0.0], dtype=np.float32) - # input to gr00t is 6 frames (6*86D=516) + # Input to GR00T is 6 frames (6*86D=516) for _ in range(6): self.groot_obs_history.append(np.zeros(86, dtype=np.float32)) - # Thread management - self.locomotion_running = False - self.locomotion_thread = None - logger.info("GrootLocomotionController initialized") - def groot_locomotion_run(self): - # get current observation - robot_state = self.robot.get_observation() + def run_step(self): + # Get current observation + obs = self.robot.get_observation() - if robot_state is None: + if not obs: return - # get command from remote controller - if robot_state.wireless_remote is not None: - self.robot.remote_controller.set(robot_state.wireless_remote) - if self.robot.remote_controller.button[0]: # R1 - raise waist - self.groot_height_cmd += 0.001 - self.groot_height_cmd = np.clip(self.groot_height_cmd, 0.50, 1.00) - if self.robot.remote_controller.button[4]: # R2 - lower waist - self.groot_height_cmd -= 0.001 - self.groot_height_cmd = np.clip(self.groot_height_cmd, 0.50, 1.00) - else: - self.robot.remote_controller.lx = 0.0 - self.robot.remote_controller.ly = 0.0 - self.robot.remote_controller.rx = 0.0 - self.robot.remote_controller.ry = 0.0 + # Get command from remote controller + if obs["remote.buttons"][0]: # R1 - raise waist + self.groot_height_cmd += 0.001 + self.groot_height_cmd = np.clip(self.groot_height_cmd, 0.50, 1.00) + if obs["remote.buttons"][4]: # R2 - lower waist + self.groot_height_cmd -= 0.001 + self.groot_height_cmd = np.clip(self.groot_height_cmd, 0.50, 1.00) - self.locomotion_cmd[0] = self.robot.remote_controller.ly # forward/backward - self.locomotion_cmd[1] = self.robot.remote_controller.lx * -1 # left/right - self.locomotion_cmd[2] = self.robot.remote_controller.rx * -1 # rotation rate + self.cmd[0] = obs["remote.ly"] # Forward/backward + self.cmd[1] = obs["remote.lx"] * -1 # Left/right + self.cmd[2] = obs["remote.rx"] * -1 # Rotation rate - for i in range(29): - self.groot_qj_all[i] = robot_state.motor_state[i].q - self.groot_dqj_all[i] = robot_state.motor_state[i].dq + # Get joint positions and velocities from flat dict + for motor in G1_29_JointIndex: + 
name = motor.name + idx = motor.value + self.groot_qj_all[idx] = obs[f"{name}.q"] + self.groot_dqj_all[idx] = obs[f"{name}.dq"] - # adapt observation for g1_23dof + # Adapt observation for g1_23dof for idx in MISSING_JOINTS: self.groot_qj_all[idx] = 0.0 self.groot_dqj_all[idx] = 0.0 @@ -165,18 +144,18 @@ class GrootLocomotionController: qj_obs = self.groot_qj_all.copy() dqj_obs = self.groot_dqj_all.copy() - # express imu data in gravity frame of reference - quat = robot_state.imu_state.quaternion - ang_vel = np.array(robot_state.imu_state.gyroscope, dtype=np.float32) + # Express IMU data in gravity frame of reference + quat = [obs["imu.quat.w"], obs["imu.quat.x"], obs["imu.quat.y"], obs["imu.quat.z"]] + ang_vel = np.array([obs["imu.gyro.x"], obs["imu.gyro.y"], obs["imu.gyro.z"]], dtype=np.float32) gravity_orientation = self.robot.get_gravity_orientation(quat) - # scale joint positions and velocities before policy inference + # Scale joint positions and velocities before policy inference qj_obs = (qj_obs - GROOT_DEFAULT_ANGLES) * DOF_POS_SCALE dqj_obs = dqj_obs * DOF_VEL_SCALE ang_vel_scaled = ang_vel * ANG_VEL_SCALE - # build single frame observation - self.groot_obs_single[:3] = self.locomotion_cmd * np.array(CMD_SCALE) + # Build single frame observation + self.groot_obs_single[:3] = self.cmd * np.array(CMD_SCALE) self.groot_obs_single[3] = self.groot_height_cmd self.groot_obs_single[4:7] = self.groot_orientation_cmd self.groot_obs_single[7:10] = ang_vel_scaled @@ -194,113 +173,76 @@ class GrootLocomotionController: end_idx = start_idx + 86 self.groot_obs_stacked[start_idx:end_idx] = obs_frame - # Run policy inference (ONNX) with 516D stacked observation - - cmd_magnitude = np.linalg.norm(self.locomotion_cmd) - + cmd_magnitude = np.linalg.norm(self.cmd) selected_policy = ( self.policy_balance if cmd_magnitude < 0.05 else self.policy_walk - ) # balance/standing policy for small commands, walking policy for movement commands + ) # Balance/standing policy for small commands, walking policy for movement commands - # run policy inference + # Run policy inference ort_inputs = {selected_policy.get_inputs()[0].name: np.expand_dims(self.groot_obs_stacked, axis=0)} ort_outs = selected_policy.run(None, ort_inputs) self.groot_action = ort_outs[0].squeeze() - # transform action back to target joint positions - target_dof_pos_15 = GROOT_DEFAULT_ANGLES[:15] + self.groot_action * LOCOMOTION_ACTION_SCALE + # Transform action back to target joint positions + target_dof_pos_15 = GROOT_DEFAULT_ANGLES[:15] + self.groot_action * ACTION_SCALE - # command motors + # Build action dict (only first 15 joints for GR00T) + action_dict = {} for i in range(15): - motor_idx = i - self.robot.msg.motor_cmd[motor_idx].q = target_dof_pos_15[i] - self.robot.msg.motor_cmd[motor_idx].qd = 0 - self.robot.msg.motor_cmd[motor_idx].kp = self.robot.kp[motor_idx] - self.robot.msg.motor_cmd[motor_idx].kd = self.robot.kd[motor_idx] - self.robot.msg.motor_cmd[motor_idx].tau = 0 + motor_name = G1_29_JointIndex(i).name + action_dict[f"{motor_name}.q"] = float(target_dof_pos_15[i]) - # adapt action for g1_23dof + # Zero out missing joints for g1_23dof for joint_idx in MISSING_JOINTS: - self.robot.msg.motor_cmd[joint_idx].q = 0.0 - self.robot.msg.motor_cmd[joint_idx].qd = 0 - self.robot.msg.motor_cmd[joint_idx].kp = self.robot.kp[joint_idx] - self.robot.msg.motor_cmd[joint_idx].kd = self.robot.kd[joint_idx] - self.robot.msg.motor_cmd[joint_idx].tau = 0 + motor_name = G1_29_JointIndex(joint_idx).name + action_dict[f"{motor_name}.q"] = 0.0 - 
# send action to robot
-        self.robot.send_action(self.robot.msg)
+        # Send action to robot
+        self.robot.send_action(action_dict)

-    def _locomotion_thread_loop(self):
-        """Background thread that runs the locomotion policy at specified rate."""
-        logger.info("Locomotion thread started")
-        while self.locomotion_running:
+
+def run(repo_id: str = DEFAULT_GROOT_REPO_ID) -> None:
+    """Main function to run the GR00T locomotion controller.
+
+    Args:
+        repo_id: Hugging Face Hub repository ID for GR00T policies.
+    """
+    # Load policies
+    policy_balance, policy_walk = load_groot_policies(repo_id=repo_id)
+
+    # Initialize robot
+    config = UnitreeG1Config()
+    robot = UnitreeG1(config)
+
+    robot.connect()
+
+    # Initialize GR00T locomotion controller
+    groot_controller = GrootLocomotionController(
+        policy_balance=policy_balance,
+        policy_walk=policy_walk,
+        robot=robot,
+        config=config,
+    )
+
+    try:
+        robot.reset(CONTROL_DT, GROOT_DEFAULT_ANGLES)
+
+        logger.info("Use joystick: LY=fwd/back, LX=left/right, RX=rotate, R1=raise waist, R2=lower waist")
+        logger.info("Press Ctrl+C to stop")
+
+        # Run step
+        while not robot._shutdown_event.is_set():
             start_time = time.time()
-            try:
-                self.groot_locomotion_run()
-            except Exception as e:
-                logger.error(f"Error in locomotion loop: {e}")
-
-            # Sleep to maintain control rate
+            groot_controller.run_step()
             elapsed = time.time() - start_time
-            sleep_time = max(0, LOCOMOTION_CONTROL_DT - elapsed)
+            sleep_time = max(0, CONTROL_DT - elapsed)
             time.sleep(sleep_time)
-        logger.info("Locomotion thread stopped")
-
-    def start_locomotion_thread(self):
-        if self.locomotion_running:
-            logger.warning("Locomotion thread already running")
-            return
-
-        logger.info("Starting locomotion control thread...")
-        self.locomotion_running = True
-        self.locomotion_thread = threading.Thread(target=self._locomotion_thread_loop, daemon=True)
-        self.locomotion_thread.start()
-
-        logger.info("Locomotion control thread started!")
-
-    def stop_locomotion_thread(self):
-        if not self.locomotion_running:
-            return
-
-        logger.info("Stopping locomotion control thread...")
-        self.locomotion_running = False
-        if self.locomotion_thread:
-            self.locomotion_thread.join(timeout=2.0)
-        logger.info("Locomotion control thread stopped")
-
-    def reset_robot(self):
-        """Move robot legs to default standing position over 2 seconds (arms are not moved)."""
-        total_time = 3.0
-        num_step = int(total_time / self.robot.control_dt)
-
-        # Only control legs, not arms (first 12 joints)
-        default_pos = GROOT_DEFAULT_ANGLES  # First 12 values are leg angles
-        dof_size = len(default_pos)
-
-        # Get current lowstate
-        robot_state = self.robot.get_observation()
-
-        # Record the current leg positions
-        init_dof_pos = np.zeros(dof_size, dtype=np.float32)
-        for i in range(dof_size):
-            init_dof_pos[i] = robot_state.motor_state[i].q
-
-        # Move legs to default pos
-        for i in range(num_step):
-            alpha = i / num_step
-            for motor_idx in range(dof_size):
-                target_pos = default_pos[motor_idx]
-                self.robot.msg.motor_cmd[motor_idx].q = (
-                    init_dof_pos[motor_idx] * (1 - alpha) + target_pos * alpha
-                )
-                self.robot.msg.motor_cmd[motor_idx].qd = 0
-                self.robot.msg.motor_cmd[motor_idx].kp = self.robot.kp[motor_idx]
-                self.robot.msg.motor_cmd[motor_idx].kd = self.robot.kd[motor_idx]
-                self.robot.msg.motor_cmd[motor_idx].tau = 0
-            self.robot.msg.crc = self.robot.crc.Crc(self.robot.msg)
-            self.robot.lowcmd_publisher.Write(self.robot.msg)
-            time.sleep(self.robot.control_dt)
-        logger.info("Reached default position (legs only)")
+    except KeyboardInterrupt:
+
logger.info("Stopping locomotion...") + finally: + if robot.is_connected: + robot.disconnect() + logger.info("Done!") if __name__ == "__main__": @@ -313,35 +255,4 @@ if __name__ == "__main__": ) args = parser.parse_args() - # load policies - policy_balance, policy_walk = load_groot_policies(repo_id=args.repo_id) - - # initialize robot - config = UnitreeG1Config() - robot = UnitreeG1(config) - - # initialize gr00t locomotion controller - groot_controller = GrootLocomotionController( - policy_balance=policy_balance, - policy_walk=policy_walk, - robot=robot, - config=config, - ) - - # reset legs and start locomotion thread - try: - groot_controller.reset_robot() - groot_controller.start_locomotion_thread() - - # log status - logger.info("Robot initialized with GR00T locomotion policies") - logger.info("Locomotion controller running in background thread") - logger.info("Press Ctrl+C to stop") - - # keep robot alive - while True: - time.sleep(1.0) - except KeyboardInterrupt: - print("\nStopping locomotion...") - groot_controller.stop_locomotion_thread() - print("Done!") + run(repo_id=args.repo_id) diff --git a/examples/unitree_g1/holosoma_locomotion.py b/examples/unitree_g1/holosoma_locomotion.py new file mode 100644 index 000000000..3a07023de --- /dev/null +++ b/examples/unitree_g1/holosoma_locomotion.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import logging +import time + +import numpy as np +import onnx +import onnxruntime as ort +from huggingface_hub import hf_hub_download + +from lerobot.robots.unitree_g1.config_unitree_g1 import UnitreeG1Config +from lerobot.robots.unitree_g1.g1_utils import G1_29_JointIndex +from lerobot.robots.unitree_g1.unitree_g1 import UnitreeG1 + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +DEFAULT_ANGLES = np.zeros(29, dtype=np.float32) +DEFAULT_ANGLES[[0, 6]] = -0.312 # Hip pitch +DEFAULT_ANGLES[[3, 9]] = 0.669 # Knee +DEFAULT_ANGLES[[4, 10]] = -0.363 # Ankle pitch +DEFAULT_ANGLES[[15, 22]] = 0.2 # Shoulder pitch +DEFAULT_ANGLES[16] = 0.2 # Left shoulder roll +DEFAULT_ANGLES[23] = -0.2 # Right shoulder roll +DEFAULT_ANGLES[[18, 25]] = 0.6 # Elbow + +MISSING_JOINTS = [] +G1_MODEL = "g1_23" # Or "g1_29" +if G1_MODEL == "g1_23": + MISSING_JOINTS = [12, 14, 20, 21, 27, 28] # Waist yaw/pitch, wrist pitch/yaw + +# Control parameters +ACTION_SCALE = 0.25 +CONTROL_DT = 0.02 # 50Hz +ANG_VEL_SCALE = 0.25 +DOF_POS_SCALE = 1.0 +DOF_VEL_SCALE = 0.05 +GAIT_PERIOD = 1.0 + + +DEFAULT_HOLOSOMA_REPO_ID = "nepyope/holosoma_locomotion" + +# Policy filename mapping +POLICY_FILES = { + "fastsac": "fastsac_g1_29dof.onnx", + "ppo": "ppo_g1_29dof.onnx", +} + + +def load_policy( + repo_id: str = DEFAULT_HOLOSOMA_REPO_ID, + policy_type: str = "fastsac", +) -> tuple[ort.InferenceSession, np.ndarray, np.ndarray]: + """Load Holosoma locomotion policy and extract KP/KD from metadata. 
+ + Args: + repo_id: Hugging Face Hub repo ID + policy_type: Either "fastsac" (default) or "ppo" + + Returns: + (policy, kp, kd) tuple + """ + if policy_type not in POLICY_FILES: + raise ValueError(f"Unknown policy type: {policy_type}. Choose from: {list(POLICY_FILES.keys())}") + + filename = POLICY_FILES[policy_type] + logger.info(f"Loading {policy_type.upper()} policy from: {repo_id}/{filename}") + policy_path = hf_hub_download(repo_id=repo_id, filename=filename) + + policy = ort.InferenceSession(policy_path) + logger.info(f"Policy loaded: {policy.get_inputs()[0].shape} → {policy.get_outputs()[0].shape}") + + # Extract KP/KD from ONNX metadata + model = onnx.load(policy_path) + metadata = {prop.key: prop.value for prop in model.metadata_props} + + if "kp" not in metadata or "kd" not in metadata: + raise ValueError("ONNX model must contain 'kp' and 'kd' in metadata") + + kp = np.array(json.loads(metadata["kp"]), dtype=np.float32) + kd = np.array(json.loads(metadata["kd"]), dtype=np.float32) + logger.info(f"Loaded KP/KD from ONNX ({len(kp)} joints)") + + return policy, kp, kd + + +class HolosomaLocomotionController: + """Holosoma whole-body locomotion controller for Unitree G1.""" + + def __init__(self, policy, robot, kp: np.ndarray, kd: np.ndarray): + self.policy = policy + self.robot = robot + + # Override robot's PD gains with policy gains + self.robot.kp = kp + self.robot.kd = kd + + self.cmd = np.zeros(3, dtype=np.float32) + + # Robot state + self.qj = np.zeros(29, dtype=np.float32) + self.dqj = np.zeros(29, dtype=np.float32) + self.obs = np.zeros(100, dtype=np.float32) + self.last_action = np.zeros(29, dtype=np.float32) + + # Gait phase + self.phase = np.array([[0.0, np.pi]], dtype=np.float32) + self.phase_dt = 2 * np.pi / ((1.0 / CONTROL_DT) * GAIT_PERIOD) + self.is_standing = True + + def run_step(self): + # Get current observation + obs = self.robot.get_observation() + + if not obs: + return + + # Get command from remote controller + ly = obs["remote.ly"] if abs(obs["remote.ly"]) > 0.1 else 0.0 + lx = obs["remote.lx"] if abs(obs["remote.lx"]) > 0.1 else 0.0 + rx = obs["remote.rx"] if abs(obs["remote.rx"]) > 0.1 else 0.0 + self.cmd[:] = [ly, -lx, -rx] + + # Get joint positions and velocities + for motor in G1_29_JointIndex: + name = motor.name + idx = motor.value + self.qj[idx] = obs[f"{name}.q"] + self.dqj[idx] = obs[f"{name}.dq"] + + # Adapt observation for g1_23dof + for idx in MISSING_JOINTS: + self.qj[idx] = 0.0 + self.dqj[idx] = 0.0 + + # Express IMU data in gravity frame of reference + quat = [obs["imu.quat.w"], obs["imu.quat.x"], obs["imu.quat.y"], obs["imu.quat.z"]] + ang_vel = np.array([obs["imu.gyro.x"], obs["imu.gyro.y"], obs["imu.gyro.z"]], dtype=np.float32) + gravity = self.robot.get_gravity_orientation(quat) + + # Scale joint positions and velocities before policy inference + qj_obs = (self.qj - DEFAULT_ANGLES) * DOF_POS_SCALE + dqj_obs = self.dqj * DOF_VEL_SCALE + ang_vel_s = ang_vel * ANG_VEL_SCALE + + # Update gait phase + if np.linalg.norm(self.cmd[:2]) < 0.01 and abs(self.cmd[2]) < 0.01: + self.phase[0, :] = np.pi + self.is_standing = True + elif self.is_standing: + self.phase = np.array([[0.0, np.pi]], dtype=np.float32) + self.is_standing = False + else: + self.phase = np.fmod(self.phase + self.phase_dt + np.pi, 2 * np.pi) - np.pi + + sin_ph = np.sin(self.phase[0]) + cos_ph = np.cos(self.phase[0]) + + # Build observations + self.obs[0:29] = self.last_action + self.obs[29:32] = ang_vel_s + self.obs[32] = self.cmd[2] + self.obs[33:35] = self.cmd[:2] + 
self.obs[35:37] = cos_ph + self.obs[37:66] = qj_obs + self.obs[66:95] = dqj_obs + self.obs[95:98] = gravity + self.obs[98:100] = sin_ph + + # Run policy inference + ort_in = {self.policy.get_inputs()[0].name: self.obs.reshape(1, -1).astype(np.float32)} + raw_action = self.policy.run(None, ort_in)[0].squeeze() + action = np.clip(raw_action, -100.0, 100.0) + self.last_action = action.copy() + + # Transform action back to target joint positions + target = DEFAULT_ANGLES + action * ACTION_SCALE + + # Build action dict + action_dict = {} + for motor in G1_29_JointIndex: + action_dict[f"{motor.name}.q"] = float(target[motor.value]) + + # Zero out missing joints for g1_23dof + for joint_idx in MISSING_JOINTS: + motor_name = G1_29_JointIndex(joint_idx).name + action_dict[f"{motor_name}.q"] = 0.0 + + # Send action to robot + self.robot.send_action(action_dict) + + +def run(repo_id: str = DEFAULT_HOLOSOMA_REPO_ID, policy_type: str = "fastsac") -> None: + """Main function to run the Holosoma locomotion controller. + + Args: + repo_id: Hugging Face Hub repository ID for Holosoma policies. + policy_type: Policy type to use ('fastsac' or 'ppo'). + """ + # Load policy and gains + policy, kp, kd = load_policy(repo_id=repo_id, policy_type=policy_type) + + # Initialize robot + config = UnitreeG1Config() + robot = UnitreeG1(config) + robot.connect() + + holosoma_controller = HolosomaLocomotionController(policy, robot, kp, kd) + + try: + robot.reset(CONTROL_DT, DEFAULT_ANGLES) + + logger.info("Use joystick: LY=fwd/back, LX=left/right, RX=rotate") + logger.info("Press Ctrl+C to stop") + + # Run step + while not robot._shutdown_event.is_set(): + start_time = time.time() + holosoma_controller.run_step() + elapsed = time.time() - start_time + sleep_time = max(0, CONTROL_DT - elapsed) + time.sleep(sleep_time) + except KeyboardInterrupt: + logger.info("Stopping locomotion...") + finally: + if robot.is_connected: + robot.disconnect() + logger.info("Done!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Holosoma Locomotion Controller for Unitree G1") + parser.add_argument( + "--repo-id", + type=str, + default=DEFAULT_HOLOSOMA_REPO_ID, + help=f"Hugging Face Hub repo ID for Holosoma policies (default: {DEFAULT_HOLOSOMA_REPO_ID})", + ) + parser.add_argument( + "--policy", + type=str, + choices=["fastsac", "ppo"], + default="fastsac", + help="Policy type to use: 'fastsac' (default) or 'ppo'", + ) + args = parser.parse_args() + + run(repo_id=args.repo_id, policy_type=args.policy) diff --git a/pyproject.toml b/pyproject.toml index 61b802bc5..27126f855 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,9 +25,9 @@ discord = "https://discord.gg/s3KuuzsPFb" [project] name = "lerobot" -version = "0.4.3" +version = "0.4.4" description = "🤗 LeRobot: State-of-the-art Machine Learning for Real-World Robotics in Pytorch" -readme = "README.md" +dynamic = ["readme"] license = { text = "Apache-2.0" } requires-python = ">=3.10" authors = [ @@ -74,7 +74,7 @@ dependencies = [ "packaging>=24.2,<26.0", "pynput>=1.7.7,<1.9.0", "pyserial>=3.5,<4.0", - "wandb>=0.20.0,<0.22.0", # TODO: Bumb dependency (compatible with protobuf) + "wandb>=0.24.0,<0.25.0", "torch>=2.2.1,<2.8.0", # TODO: Bumb dependency "torchcodec>=0.2.1,<0.6.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bumb dependency @@ -97,11 +97,12 @@ dependencies = [ 
pygame-dep = ["pygame>=2.5.1,<2.7.0"] placo-dep = ["placo>=0.9.6,<0.10.0"] transformers-dep = ["transformers>=4.57.1,<5.0.0"] -grpcio-dep = ["grpcio==1.73.1", "protobuf==6.31.0"] # TODO: Bumb dependency (compatible with wandb) +grpcio-dep = ["grpcio==1.73.1", "protobuf>=6.31.1,<6.32.0"] # Motors feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0"] dynamixel = ["dynamixel-sdk>=3.7.31,<3.9.0"] +damiao = ["python-can>=4.2.0,<5.0.0"] # Robots gamepad = ["lerobot[pygame-dep]", "hidapi>=0.14.0,<0.15.0"] @@ -109,9 +110,9 @@ hopejr = ["lerobot[feetech]", "lerobot[pygame-dep]"] lekiwi = ["lerobot[feetech]", "pyzmq>=26.2.1,<28.0.0"] unitree_g1 = [ "pyzmq>=26.2.1,<28.0.0", - "onnxruntime>=1.16.0" + "onnxruntime>=1.16.0,<2.0.0" ] -reachy2 = ["reachy2_sdk>=1.0.14,<1.1.0"] +reachy2 = ["reachy2_sdk>=1.0.15,<1.1.0"] kinematics = ["lerobot[placo-dep]"] intelrealsense = [ "pyrealsense2>=2.55.1.6486,<2.57.0 ; sys_platform != 'darwin'", @@ -127,7 +128,7 @@ wallx = [ "torchdiffeq==0.2.5", "qwen_vl_utils==0.0.11" ] -pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi"] +pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi", "scipy>=1.10.1,<1.15"] smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"] groot = [ "lerobot[transformers-dep]", @@ -140,12 +141,13 @@ groot = [ "ninja>=1.11.1,<2.0.0", "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'" ] -sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "matplotlib>=3.10.3,<4.0.0", "qwen-vl-utils>=0.0.14"] +sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "matplotlib>=3.10.3,<4.0.0", "qwen-vl-utils>=0.0.14,<0.1.0"] xvla = ["lerobot[transformers-dep]"] hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] # Features async = ["lerobot[grpcio-dep]", "matplotlib>=3.10.3,<4.0.0"] +peft = ["lerobot[transformers-dep]", "peft>=0.18.0,<1.0.0"] # Development dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools==1.73.1", "mypy>=1.19.1"] @@ -182,7 +184,8 @@ all = [ "lerobot[phone]", "lerobot[libero]", "lerobot[metaworld]", - "lerobot[sarm]" + "lerobot[sarm]", + "lerobot[peft]", ] [project.scripts] @@ -195,11 +198,13 @@ lerobot-setup-motors="lerobot.scripts.lerobot_setup_motors:main" lerobot-teleoperate="lerobot.scripts.lerobot_teleoperate:main" lerobot-eval="lerobot.scripts.lerobot_eval:main" lerobot-train="lerobot.scripts.lerobot_train:main" +lerobot-train-tokenizer="lerobot.scripts.lerobot_train_tokenizer:main" lerobot-dataset-viz="lerobot.scripts.lerobot_dataset_viz:main" lerobot-info="lerobot.scripts.lerobot_info:main" lerobot-find-joint-limits="lerobot.scripts.lerobot_find_joint_limits:main" lerobot-imgtransform-viz="lerobot.scripts.lerobot_imgtransform_viz:main" lerobot-edit-dataset="lerobot.scripts.lerobot_edit_dataset:main" +lerobot-setup-can="lerobot.scripts.lerobot_setup_can:main" # ---------------- Tool Configurations ---------------- [tool.setuptools.packages.find] @@ -275,6 +280,7 @@ default.extend-ignore-identifiers-re = [ "thw", "inpt", "ROBOTIS", + "OT_VALUE" ] # TODO: Uncomment when ready to use @@ -417,6 +423,10 @@ conflicts = [ { extra = "wallx" }, { extra = "libero" }, ], + [ + { extra = "wallx" }, + { extra = "peft" }, + ], [ { extra = "wallx" }, { extra = "all" }, @@ -450,6 +460,10 @@ conflicts = [ { extra = "pi" }, { extra = "libero" }, ], + [ + { extra = "pi" }, + { extra = "peft" }, + ], 
[ { extra = "pi" }, { extra = "all" }, diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..d97b6f835 --- /dev/null +++ b/setup.py @@ -0,0 +1,72 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from setuptools import setup + + +def get_version_from_toml() -> str: + """Return the project's version string parsed from `pyproject.toml`. + + The function scans `pyproject.toml` line-by-line looking for a line + that starts with ``version`` (for example: ``version = "1.2.3"``) + and returns the value without surrounding quotes. If no such line is + found a :class:`ValueError` is raised. + + Returns: + The version string from `pyproject.toml` (e.g. ``"1.2.3"`` -> + ``1.2.3``). + """ + + version = None + with open("pyproject.toml", encoding="utf-8") as f: + for line in f: + if line.strip().startswith("version"): + version = line.split("=")[1].strip().strip('"') + break + if version is None: + raise ValueError("Version not found in pyproject.toml") + return version + + +def read_long_description() -> str: + """Read and return the project's long description for setup. + + This function reads `README.md` and replaces image links that point + to the local `./media/` directory with absolute raw GitHub URLs that + reference the release tag corresponding to the version parsed from + `pyproject.toml` (for example, ``v1.2.3``). The modified README + content is returned as a string suitable for passing to + ``setuptools.setup(long_description=...)``. + + Returns: + The README content with rewritten media links. 
+ """ + + with open("README.md", encoding="utf-8") as f: + content = f.read() + + version = get_version_from_toml() + git_tag = f"v{version}" + + base_raw_url = f"https://raw.githubusercontent.com/huggingface/lerobot/{git_tag}/" + content = content.replace('src="./media/', f'src="{base_raw_url}media/') + + return content + + +setup( + long_description=read_long_description(), + long_description_content_type="text/markdown", +) diff --git a/src/lerobot/async_inference/configs.py b/src/lerobot/async_inference/configs.py index d1768a323..2e3fe576d 100644 --- a/src/lerobot/async_inference/configs.py +++ b/src/lerobot/async_inference/configs.py @@ -126,6 +126,12 @@ class RobotClientConfig: # Device configuration policy_device: str = field(default="cpu", metadata={"help": "Device for policy inference"}) + client_device: str = field( + default="cpu", + metadata={ + "help": "Device to move actions to after receiving from server (e.g., for downstream planners)" + }, + ) # Control behavior configuration chunk_size_threshold: float = field(default=0.5, metadata={"help": "Threshold for chunk size control"}) @@ -161,6 +167,9 @@ class RobotClientConfig: if not self.policy_device: raise ValueError("policy_device cannot be empty") + if not self.client_device: + raise ValueError("client_device cannot be empty") + if self.chunk_size_threshold < 0 or self.chunk_size_threshold > 1: raise ValueError(f"chunk_size_threshold must be between 0 and 1, got {self.chunk_size_threshold}") @@ -184,6 +193,7 @@ class RobotClientConfig: "policy_type": self.policy_type, "pretrained_name_or_path": self.pretrained_name_or_path, "policy_device": self.policy_device, + "client_device": self.client_device, "chunk_size_threshold": self.chunk_size_threshold, "fps": self.fps, "actions_per_chunk": self.actions_per_chunk, diff --git a/src/lerobot/async_inference/constants.py b/src/lerobot/async_inference/constants.py index f8b6d7bb3..56910e67f 100644 --- a/src/lerobot/async_inference/constants.py +++ b/src/lerobot/async_inference/constants.py @@ -23,7 +23,7 @@ DEFAULT_INFERENCE_LATENCY = 1 / DEFAULT_FPS DEFAULT_OBS_QUEUE_TIMEOUT = 2 # All action chunking policies -SUPPORTED_POLICIES = ["act", "smolvla", "diffusion", "tdmpc", "vqbet", "pi0", "pi05"] +SUPPORTED_POLICIES = ["act", "smolvla", "diffusion", "tdmpc", "vqbet", "pi0", "pi05", "groot"] # TODO: Add all other robots -SUPPORTED_ROBOTS = ["so100_follower", "so101_follower", "bi_so100_follower", "omx_follower"] +SUPPORTED_ROBOTS = ["so100_follower", "so101_follower", "bi_so_follower", "omx_follower"] diff --git a/src/lerobot/async_inference/helpers.py b/src/lerobot/async_inference/helpers.py index 2158f51ac..8b12920d9 100644 --- a/src/lerobot/async_inference/helpers.py +++ b/src/lerobot/async_inference/helpers.py @@ -18,6 +18,7 @@ import os import time from dataclasses import dataclass, field from pathlib import Path +from typing import Any import torch @@ -39,8 +40,8 @@ from lerobot.utils.utils import init_logging Action = torch.Tensor -# observation as received from the robot -RawObservation = dict[str, torch.Tensor] +# observation as received from the robot (can be numpy arrays, floats, etc.) 
+RawObservation = dict[str, Any] # observation as those recorded in LeRobot dataset (keys are different) LeRobotObservation = dict[str, torch.Tensor] diff --git a/src/lerobot/async_inference/policy_server.py b/src/lerobot/async_inference/policy_server.py index ab2e6bcd8..aedce2a74 100644 --- a/src/lerobot/async_inference/policy_server.py +++ b/src/lerobot/async_inference/policy_server.py @@ -381,6 +381,8 @@ class PolicyServer(services_pb2_grpc.AsyncInferenceServicer): action_tensor = torch.stack(processed_actions, dim=1).squeeze(0) self.logger.debug(f"Postprocessed action shape: {action_tensor.shape}") + action_tensor = action_tensor.detach().cpu() + """5. Convert to TimedAction list""" action_chunk = self._time_action_chunk( observation_t.get_timestamp(), list(action_tensor), observation_t.get_timestep() diff --git a/src/lerobot/async_inference/robot_client.py b/src/lerobot/async_inference/robot_client.py index d32aa6a21..e4d21652a 100644 --- a/src/lerobot/async_inference/robot_client.py +++ b/src/lerobot/async_inference/robot_client.py @@ -25,6 +25,7 @@ python src/lerobot/async_inference/robot_client.py \ --policy_type=act \ --pretrained_name_or_path=user/model \ --policy_device=mps \ + --client_device=cpu \ --actions_per_chunk=50 \ --chunk_size_threshold=0.5 \ --aggregate_fn_name=weighted_average \ @@ -51,12 +52,11 @@ from lerobot.cameras.realsense.configuration_realsense import RealSenseCameraCon from lerobot.robots import ( # noqa: F401 Robot, RobotConfig, - bi_so100_follower, + bi_so_follower, koch_follower, make_robot_from_config, omx_follower, - so100_follower, - so101_follower, + so_follower, ) from lerobot.transport import ( services_pb2, # type: ignore @@ -286,6 +286,21 @@ class RobotClient: timed_actions = pickle.loads(actions_chunk.data) # nosec deserialize_time = time.perf_counter() - deserialize_start + # Log device type of received actions + if len(timed_actions) > 0: + received_device = timed_actions[0].get_action().device.type + self.logger.debug(f"Received actions on device: {received_device}") + + # Move actions to client_device (e.g., for downstream planners that need GPU) + client_device = self.config.client_device + if client_device != "cpu": + for timed_action in timed_actions: + if timed_action.get_action().device.type != client_device: + timed_action.action = timed_action.get_action().to(client_device) + self.logger.debug(f"Converted actions to device: {client_device}") + else: + self.logger.debug(f"Actions kept on device: {client_device}") + self.action_chunk_size = max(self.action_chunk_size, len(timed_actions)) # Calculate network latency if we have matching observations diff --git a/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py b/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py index f26cf2ad1..ca6db4f03 100644 --- a/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py +++ b/src/lerobot/cameras/reachy2_camera/configuration_reachy2_camera.py @@ -35,18 +35,19 @@ class Reachy2CameraConfig(CameraConfig): name="teleop", image_type="left", ip_address="192.168.0.200", # IP address of the robot - fps=15, + port=50065, # Port of the camera server width=640, height=480, + fps=30, # Not configurable for Reachy 2 cameras color_mode=ColorMode.RGB, - ) # Left teleop camera, 640x480 @ 15FPS + ) # Left teleop camera, 640x480 @ 30FPS ``` Attributes: name: Name of the camera device. Can be "teleop" or "depth". image_type: Type of image stream. For "teleop" camera, can be "left" or "right". 
For "depth" camera, can be "rgb" or "depth". (depth is not supported yet) - fps: Requested frames per second for the color stream. + fps: Requested frames per second for the color stream. Not configurable for Reachy 2 cameras. width: Requested frame width in pixels for the color stream. height: Requested frame height in pixels for the color stream. color_mode: Color mode for image output (RGB or BGR). Defaults to RGB. @@ -62,7 +63,6 @@ class Reachy2CameraConfig(CameraConfig): color_mode: ColorMode = ColorMode.RGB ip_address: str | None = "localhost" port: int = 50065 - # use_depth: bool = False def __post_init__(self) -> None: if self.name not in ["teleop", "depth"]: diff --git a/src/lerobot/cameras/reachy2_camera/reachy2_camera.py b/src/lerobot/cameras/reachy2_camera/reachy2_camera.py index 30e096767..c8916c5ee 100644 --- a/src/lerobot/cameras/reachy2_camera/reachy2_camera.py +++ b/src/lerobot/cameras/reachy2_camera/reachy2_camera.py @@ -16,12 +16,13 @@ Provides the Reachy2Camera class for capturing frames from Reachy 2 cameras using Reachy 2's CameraManager. """ +from __future__ import annotations + import logging import os import platform import time -from threading import Event, Lock, Thread -from typing import Any +from typing import TYPE_CHECKING, Any from numpy.typing import NDArray # type: ignore # TODO: add type stubs for numpy.typing @@ -30,10 +31,19 @@ if platform.system() == "Windows" and "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS" os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0" import cv2 # type: ignore # TODO: add type stubs for OpenCV import numpy as np # type: ignore # TODO: add type stubs for numpy -from reachy2_sdk.media.camera import CameraView # type: ignore # TODO: add type stubs for reachy2_sdk -from reachy2_sdk.media.camera_manager import ( # type: ignore # TODO: add type stubs for reachy2_sdk - CameraManager, -) + +from lerobot.utils.import_utils import _reachy2_sdk_available + +if TYPE_CHECKING or _reachy2_sdk_available: + from reachy2_sdk.media.camera import CameraView + from reachy2_sdk.media.camera_manager import CameraManager +else: + CameraManager = None + + class CameraView: + LEFT = 0 + RIGHT = 1 + from lerobot.utils.errors import DeviceNotConnectedError @@ -69,17 +79,10 @@ class Reachy2Camera(Camera): self.config = config - self.fps = config.fps self.color_mode = config.color_mode self.cam_manager: CameraManager | None = None - self.thread: Thread | None = None - self.stop_event: Event | None = None - self.frame_lock: Lock = Lock() - self.latest_frame: NDArray[Any] | None = None - self.new_frame_event: Event = Event() - def __str__(self) -> str: return f"{self.__class__.__name__}({self.config.name}, {self.config.image_type})" @@ -100,44 +103,23 @@ class Reachy2Camera(Camera): def connect(self, warmup: bool = True) -> None: """ Connects to the Reachy2 CameraManager as specified in the configuration. + + Raises: + DeviceNotConnectedError: If the camera is not connected. """ self.cam_manager = CameraManager(host=self.config.ip_address, port=self.config.port) + if self.cam_manager is None: + raise DeviceNotConnectedError(f"Could not connect to {self}.") self.cam_manager.initialize_cameras() logger.info(f"{self} connected.") @staticmethod - def find_cameras(ip_address: str = "localhost", port: int = 50065) -> list[dict[str, Any]]: + def find_cameras() -> list[dict[str, Any]]: """ - Detects available Reachy 2 cameras. 
- - Returns: - List[Dict[str, Any]]: A list of dictionaries, - where each dictionary contains 'name', 'stereo', - and the default profile properties (width, height, fps). + Detection not implemented for Reachy2 cameras. """ - initialized_cameras = [] - camera_manager = CameraManager(host=ip_address, port=port) - - for camera in [camera_manager.teleop, camera_manager.depth]: - if camera is None: - continue - - height, width, _, _, _, _, _ = camera.get_parameters() - - camera_info = { - "name": camera._cam_info.name, - "stereo": camera._cam_info.stereo, - "default_profile": { - "width": width, - "height": height, - "fps": 30, - }, - } - initialized_cameras.append(camera_info) - - camera_manager.disconnect() - return initialized_cameras + raise NotImplementedError("Camera detection is not implemented for Reachy2 cameras.") def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]: """ @@ -155,95 +137,49 @@ class Reachy2Camera(Camera): (height, width, channels), using the specified or default color mode and applying any configured rotation. """ + start_time = time.perf_counter() + if not self.is_connected: raise DeviceNotConnectedError(f"{self} is not connected.") - start_time = time.perf_counter() + if self.cam_manager is None: + raise DeviceNotConnectedError(f"{self} is not connected.") frame: NDArray[Any] = np.empty((0, 0, 3), dtype=np.uint8) - if self.cam_manager is None: - raise DeviceNotConnectedError(f"{self} is not connected.") + if self.config.name == "teleop" and hasattr(self.cam_manager, "teleop"): + if self.config.image_type == "left": + frame = self.cam_manager.teleop.get_frame( + CameraView.LEFT, size=(self.config.width, self.config.height) + )[0] + elif self.config.image_type == "right": + frame = self.cam_manager.teleop.get_frame( + CameraView.RIGHT, size=(self.config.width, self.config.height) + )[0] + elif self.config.name == "depth" and hasattr(self.cam_manager, "depth"): + if self.config.image_type == "depth": + frame = self.cam_manager.depth.get_depth_frame()[0] + elif self.config.image_type == "rgb": + frame = self.cam_manager.depth.get_frame(size=(self.config.width, self.config.height))[0] else: - if self.config.name == "teleop" and hasattr(self.cam_manager, "teleop"): - if self.config.image_type == "left": - frame = self.cam_manager.teleop.get_frame(CameraView.LEFT, size=(640, 480))[0] - elif self.config.image_type == "right": - frame = self.cam_manager.teleop.get_frame(CameraView.RIGHT, size=(640, 480))[0] - elif self.config.name == "depth" and hasattr(self.cam_manager, "depth"): - if self.config.image_type == "depth": - frame = self.cam_manager.depth.get_depth_frame()[0] - elif self.config.image_type == "rgb": - frame = self.cam_manager.depth.get_frame(size=(640, 480))[0] + raise ValueError(f"Invalid camera name '{self.config.name}'. Expected 'teleop' or 'depth'.") - if frame is None: - return np.empty((0, 0, 3), dtype=np.uint8) + if frame is None: + return np.empty((0, 0, 3), dtype=np.uint8) - if self.config.color_mode == "rgb": - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + if self.config.color_mode == "rgb": + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) read_duration_ms = (time.perf_counter() - start_time) * 1e3 logger.debug(f"{self} read took: {read_duration_ms:.1f}ms") return frame - def _read_loop(self) -> None: - """ - Internal loop run by the background thread for asynchronous reading. - - On each iteration: - 1. Reads a color frame - 2. Stores result in latest_frame (thread-safe) - 3. 
Sets new_frame_event to notify listeners - - Stops on DeviceNotConnectedError, logs other errors and continues. - """ - if self.stop_event is None: - raise RuntimeError(f"{self}: stop_event is not initialized before starting read loop.") - - while not self.stop_event.is_set(): - try: - color_image = self.read() - - with self.frame_lock: - self.latest_frame = color_image - self.new_frame_event.set() - - except DeviceNotConnectedError: - break - except Exception as e: - logger.warning(f"Error reading frame in background thread for {self}: {e}") - - def _start_read_thread(self) -> None: - """Starts or restarts the background read thread if it's not running.""" - if self.thread is not None and self.thread.is_alive(): - self.thread.join(timeout=0.1) - if self.stop_event is not None: - self.stop_event.set() - - self.stop_event = Event() - self.thread = Thread(target=self._read_loop, args=(), name=f"{self}_read_loop") - self.thread.daemon = True - self.thread.start() - - def _stop_read_thread(self) -> None: - """Signals the background read thread to stop and waits for it to join.""" - if self.stop_event is not None: - self.stop_event.set() - - if self.thread is not None and self.thread.is_alive(): - self.thread.join(timeout=2.0) - - self.thread = None - self.stop_event = None - def async_read(self, timeout_ms: float = 200) -> NDArray[Any]: """ - Reads the latest available frame asynchronously. + Reads the latest available frame. - This method retrieves the most recent frame captured by the background - read thread. It does not block waiting for the camera hardware directly, - but may wait up to timeout_ms for the background thread to provide a frame. + This method retrieves the most recent frame available in Reachy 2's low-level software. Args: timeout_ms (float): Maximum time in milliseconds to wait for a frame @@ -261,22 +197,10 @@ class Reachy2Camera(Camera): if not self.is_connected: raise DeviceNotConnectedError(f"{self} is not connected.") - if self.thread is None or not self.thread.is_alive(): - self._start_read_thread() - - if not self.new_frame_event.wait(timeout=timeout_ms / 1000.0): - thread_alive = self.thread is not None and self.thread.is_alive() - raise TimeoutError( - f"Timed out waiting for frame from camera {self} after {timeout_ms} ms. " - f"Read thread alive: {thread_alive}." - ) - - with self.frame_lock: - frame = self.latest_frame - self.new_frame_event.clear() + frame = self.read() if frame is None: - raise RuntimeError(f"Internal error: Event set but no frame available for {self}.") + raise RuntimeError(f"Internal error: No frame available for {self}.") return frame @@ -287,12 +211,9 @@ class Reachy2Camera(Camera): Raises: DeviceNotConnectedError: If the camera is already disconnected. 
""" - if not self.is_connected and self.thread is None: + if not self.is_connected: raise DeviceNotConnectedError(f"{self} not connected.") - if self.thread is not None: - self._stop_read_thread() - if self.cam_manager is not None: self.cam_manager.disconnect() diff --git a/src/lerobot/cameras/utils.py b/src/lerobot/cameras/utils.py index 1b2d386d6..c0e7b6284 100644 --- a/src/lerobot/cameras/utils.py +++ b/src/lerobot/cameras/utils.py @@ -43,6 +43,11 @@ def make_cameras_from_configs(camera_configs: dict[str, CameraConfig]) -> dict[s cameras[key] = Reachy2Camera(cfg) + elif cfg.type == "zmq": + from .zmq.camera_zmq import ZMQCamera + + cameras[key] = ZMQCamera(cfg) + else: try: cameras[key] = cast(Camera, make_device_from_device_class(cfg)) diff --git a/src/lerobot/robots/bi_so100_follower/__init__.py b/src/lerobot/cameras/zmq/__init__.py similarity index 75% rename from src/lerobot/robots/bi_so100_follower/__init__.py rename to src/lerobot/cameras/zmq/__init__.py index 90f56516b..d760c5325 100644 --- a/src/lerobot/robots/bi_so100_follower/__init__.py +++ b/src/lerobot/cameras/zmq/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,5 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .bi_so100_follower import BiSO100Follower -from .config_bi_so100_follower import BiSO100FollowerConfig +from .camera_zmq import ZMQCamera +from .configuration_zmq import ZMQCameraConfig + +__all__ = ["ZMQCamera", "ZMQCameraConfig"] diff --git a/src/lerobot/cameras/zmq/camera_zmq.py b/src/lerobot/cameras/zmq/camera_zmq.py new file mode 100644 index 000000000..1a4155f4b --- /dev/null +++ b/src/lerobot/cameras/zmq/camera_zmq.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +ZMQCamera - Captures frames from remote cameras via ZeroMQ using JSON protocol in the +following format: + { + "timestamps": {"camera_name": float}, + "images": {"camera_name": ""} + } +""" + +import base64 +import json +import logging +import time +from threading import Event, Lock, Thread +from typing import Any + +import cv2 +import numpy as np +from numpy.typing import NDArray + +from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError + +from ..camera import Camera +from ..configs import ColorMode +from .configuration_zmq import ZMQCameraConfig + +logger = logging.getLogger(__name__) + + +class ZMQCamera(Camera): + """ + Example usage: + ```python + from lerobot.cameras.zmq import ZMQCamera, ZMQCameraConfig + + config = ZMQCameraConfig(server_address="192.168.123.164", port=5555, camera_name="head_camera") + camera = ZMQCamera(config) + camera.connect() + frame = camera.read() + camera.disconnect() + ``` + """ + + def __init__(self, config: ZMQCameraConfig): + super().__init__(config) + import zmq + + self.config = config + self.server_address = config.server_address + self.port = config.port + self.camera_name = config.camera_name + self.color_mode = config.color_mode + self.timeout_ms = config.timeout_ms + + self.context: zmq.Context | None = None + self.socket: zmq.Socket | None = None + self._connected = False + + self.thread: Thread | None = None + self.stop_event: Event | None = None + self.frame_lock: Lock = Lock() + self.latest_frame: NDArray[Any] | None = None + self.new_frame_event: Event = Event() + + def __str__(self) -> str: + return f"ZMQCamera({self.camera_name}@{self.server_address}:{self.port})" + + @property + def is_connected(self) -> bool: + return self._connected and self.context is not None and self.socket is not None + + def connect(self, warmup: bool = True) -> None: + """Connect to ZMQ camera server.""" + if self.is_connected: + raise DeviceAlreadyConnectedError(f"{self} is already connected.") + + logger.info(f"Connecting to {self}...") + + try: + import zmq + + self.context = zmq.Context() + self.socket = self.context.socket(zmq.SUB) + self.socket.setsockopt_string(zmq.SUBSCRIBE, "") + self.socket.setsockopt(zmq.RCVTIMEO, self.timeout_ms) + self.socket.setsockopt(zmq.CONFLATE, True) + self.socket.connect(f"tcp://{self.server_address}:{self.port}") + self._connected = True + + # Auto-detect resolution + if self.width is None or self.height is None: + h, w = self.read().shape[:2] + self.height = h + self.width = w + logger.info(f"{self} resolution: {w}x{h}") + + logger.info(f"{self} connected.") + + if warmup: + time.sleep(0.1) + + except Exception as e: + self._cleanup() + raise RuntimeError(f"Failed to connect to {self}: {e}") from e + + def _cleanup(self): + """Clean up ZMQ resources.""" + self._connected = False + if self.socket: + self.socket.close() + self.socket = None + if self.context: + self.context.term() + self.context = None + + @staticmethod + def find_cameras() -> list[dict[str, Any]]: + """ZMQ cameras require manual configuration (server address/port).""" + return [] + + def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]: + """ + Read a single frame from the ZMQ camera. 
+ + Returns: + np.ndarray: Decoded frame (height, width, 3) + """ + if not self.is_connected or self.socket is None: + raise DeviceNotConnectedError(f"{self} is not connected.") + + try: + message = self.socket.recv_string() + except Exception as e: + if type(e).__name__ == "Again": + raise TimeoutError(f"{self} timeout after {self.timeout_ms}ms") from e + raise + + # Decode JSON message + data = json.loads(message) + + if "images" not in data: + raise RuntimeError(f"{self} invalid message: missing 'images' key") + + images = data["images"] + + # Get image by camera name or first available + if self.camera_name in images: + img_b64 = images[self.camera_name] + elif images: + img_b64 = next(iter(images.values())) + else: + raise RuntimeError(f"{self} no images in message") + + # Decode base64 JPEG + img_bytes = base64.b64decode(img_b64) + frame = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR) + + if frame is None: + raise RuntimeError(f"{self} failed to decode image") + + return frame + + def _read_loop(self) -> None: + while self.stop_event and not self.stop_event.is_set(): + try: + frame = self.read() + with self.frame_lock: + self.latest_frame = frame + self.new_frame_event.set() + except DeviceNotConnectedError: + break + except TimeoutError: + pass + except Exception as e: + logger.warning(f"Read error: {e}") + + def _start_read_thread(self) -> None: + if self.thread and self.thread.is_alive(): + return + self.stop_event = Event() + self.thread = Thread(target=self._read_loop, daemon=True) + self.thread.start() + + def _stop_read_thread(self) -> None: + if self.stop_event: + self.stop_event.set() + if self.thread and self.thread.is_alive(): + self.thread.join(timeout=2.0) + self.thread = None + self.stop_event = None + + def async_read(self, timeout_ms: float = 10000) -> NDArray[Any]: + """Read latest frame asynchronously (non-blocking).""" + if not self.is_connected: + raise DeviceNotConnectedError(f"{self} is not connected.") + + if not self.thread or not self.thread.is_alive(): + self._start_read_thread() + + if not self.new_frame_event.wait(timeout=timeout_ms / 1000.0): + raise TimeoutError(f"{self} async_read timeout after {timeout_ms}ms") + + with self.frame_lock: + frame = self.latest_frame + self.new_frame_event.clear() + + if frame is None: + raise RuntimeError(f"{self} no frame available") + + return frame + + def disconnect(self) -> None: + """Disconnect from ZMQ camera.""" + if not self.is_connected and not self.thread: + raise DeviceNotConnectedError(f"{self} not connected.") + + self._stop_read_thread() + self._cleanup() + logger.info(f"{self} disconnected.") diff --git a/src/lerobot/cameras/zmq/configuration_zmq.py b/src/lerobot/cameras/zmq/configuration_zmq.py new file mode 100644 index 000000000..027ae12b5 --- /dev/null +++ b/src/lerobot/cameras/zmq/configuration_zmq.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass + +from ..configs import CameraConfig, ColorMode + +__all__ = ["ZMQCameraConfig", "ColorMode"] + + +@CameraConfig.register_subclass("zmq") +@dataclass +class ZMQCameraConfig(CameraConfig): + server_address: str + port: int = 5555 + camera_name: str = "zmq_camera" + color_mode: ColorMode = ColorMode.RGB + timeout_ms: int = 5000 + + def __post_init__(self) -> None: + if self.color_mode not in (ColorMode.RGB, ColorMode.BGR): + raise ValueError( + f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided." + ) + + if self.timeout_ms <= 0: + raise ValueError(f"`timeout_ms` must be positive, but {self.timeout_ms} is provided.") + + if not self.server_address: + raise ValueError("`server_address` cannot be empty.") + + if self.port <= 0 or self.port > 65535: + raise ValueError(f"`port` must be between 1 and 65535, but {self.port} is provided.") diff --git a/src/lerobot/cameras/zmq/image_server.py b/src/lerobot/cameras/zmq/image_server.py new file mode 100644 index 000000000..2da366cef --- /dev/null +++ b/src/lerobot/cameras/zmq/image_server.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Streams camera images over ZMQ. +Uses lerobot's OpenCVCamera for capture, encodes images to base64 and sends them over ZMQ. 
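+
+Intended to be consumed by ZMQCamera on the client side; the __main__ block at
+the bottom shows a minimal single-camera configuration.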
+""" + +import base64 +import contextlib +import json +import logging +import time +from collections import deque + +import cv2 +import numpy as np +import zmq + +from lerobot.cameras.configs import ColorMode +from lerobot.cameras.opencv import OpenCVCamera, OpenCVCameraConfig + +logger = logging.getLogger(__name__) + + +def encode_image(image: np.ndarray, quality: int = 80) -> str: + """Encode RGB image to base64 JPEG string.""" + _, buffer = cv2.imencode(".jpg", image, [int(cv2.IMWRITE_JPEG_QUALITY), quality]) + return base64.b64encode(buffer).decode("utf-8") + + +class ImageServer: + def __init__(self, config: dict, port: int = 5555): + self.fps = config.get("fps", 30) + self.cameras: dict[str, OpenCVCamera] = {} + + for name, cfg in config.get("cameras", {}).items(): + shape = cfg.get("shape", [480, 640]) + cam_config = OpenCVCameraConfig( + index_or_path=cfg.get("device_id", 0), + fps=self.fps, + width=shape[1], + height=shape[0], + color_mode=ColorMode.RGB, + ) + camera = OpenCVCamera(cam_config) + camera.connect() + self.cameras[name] = camera + logger.info(f"Camera {name}: {shape[1]}x{shape[0]}") + + # ZMQ PUB socket + self.context = zmq.Context() + self.socket = self.context.socket(zmq.PUB) + self.socket.setsockopt(zmq.SNDHWM, 20) + self.socket.setsockopt(zmq.LINGER, 0) + self.socket.bind(f"tcp://*:{port}") + + logger.info(f"ImageServer running on port {port}") + + def run(self): + frame_count = 0 + frame_times = deque(maxlen=60) + + try: + while True: + t0 = time.time() + + # Build message + message = {"timestamps": {}, "images": {}} + for name, cam in self.cameras.items(): + frame = cam.read() # Returns RGB + message["timestamps"][name] = time.time() + message["images"][name] = encode_image(frame) + + # Send as JSON string (suppress if buffer full) + with contextlib.suppress(zmq.Again): + self.socket.send_string(json.dumps(message), zmq.NOBLOCK) + + frame_count += 1 + frame_times.append(time.time() - t0) + + if frame_count % 60 == 0: + logger.debug(f"FPS: {len(frame_times) / sum(frame_times):.1f}") + + sleep = (1.0 / self.fps) - (time.time() - t0) + if sleep > 0: + time.sleep(sleep) + + except KeyboardInterrupt: + pass + finally: + for cam in self.cameras.values(): + cam.disconnect() + self.socket.close() + self.context.term() + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + config = {"fps": 30, "cameras": {"head_camera": {"device_id": 4, "shape": [480, 640]}}} + ImageServer(config, port=5555).run() diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index 630d63f1b..f613b5251 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -67,3 +67,31 @@ class EvalConfig: f"to increase the number of episodes to match the batch size (e.g. `eval.n_episodes={self.batch_size}`), " f"or lower the batch size (e.g. `eval.batch_size={self.n_episodes}`)." ) + + +@dataclass +class PeftConfig: + # PEFT offers many fine-tuning methods, layer adapters being the most common and currently also the most + # effective methods so we'll focus on those in this high-level config interface. + + # Either a string (module name suffix or 'all-linear'), a list of module name suffixes or a regular expression + # describing module names to target with the configured PEFT method. Some policies have a default value for this + # so that you don't *have* to choose which layers to adapt but it might still be worthwhile depending on your case. 
+    target_modules: list[str] | str | None = None
+
+    # Names/suffixes of modules to fully fine-tune and store alongside adapter weights. Useful for layers
+    # that are not part of the pre-trained model (e.g., action state projections). Depending on the policy,
+    # this defaults to the layers that are newly created on top of the pre-trained policy. If you're
+    # fine-tuning an already trained policy you might want to set this to `[]`. Corresponds to PEFT's
+    # `modules_to_save`.
+    full_training_modules: list[str] | None = None
+
+    # The PEFT (adapter) method to apply to the policy. Needs to be a valid PEFT type.
+    method_type: str = "LORA"
+
+    # Adapter initialization method. See the specific PEFT adapter documentation for defaults.
+    init_type: str | None = None
+
+    # We expect all PEFT adapters to perform some form of rank decomposition, so this parameter specifies
+    # the rank used for the adapter. In general, a higher rank means more trainable parameters and behavior
+    # closer to full fine-tuning.
+    r: int = 16
diff --git a/src/lerobot/configs/eval.py b/src/lerobot/configs/eval.py
index 2f085da56..da8bee6b2 100644
--- a/src/lerobot/configs/eval.py
+++ b/src/lerobot/configs/eval.py
@@ -38,6 +38,8 @@ class EvalPipelineConfig:
     seed: int | None = 1000
     # Rename map for the observation to override the image and state keys
     rename_map: dict[str, str] = field(default_factory=dict)
+    # Explicit consent to execute remote code from the Hub (required for hub environments).
+    trust_remote_code: bool = False

     def __post_init__(self) -> None:
         # HACK: We parse again the cli args here to get the pretrained path if there was one.
diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py
index 0ecfa169b..7f326b70b 100644
--- a/src/lerobot/configs/policies.py
+++ b/src/lerobot/configs/policies.py
@@ -55,14 +55,18 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):  # type: igno

     n_obs_steps: int = 1

-    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
-    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    # `input_features` can be set to None/null in order to infer those values from the dataset.
+    input_features: dict[str, PolicyFeature] | None = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] | None = field(default_factory=dict)

     device: str | None = None  # e.g. "cuda", "cuda:0", "cpu", or "mps"

     # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
     # automatic gradient scaling is used.
     use_amp: bool = False
+    # Whether the policy was trained with PEFT.
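+    # When this is true, `make_policy` loads the checkpoint as a PEFT adapter on top of the
+    # base policy recorded in the adapter config (see `lerobot/policies/factory.py`).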
+ use_peft: bool = False + push_to_hub: bool = True # type: ignore[assignment] # TODO: use a different name to avoid override repo_id: str | None = None @@ -125,6 +129,8 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # type: igno @property def robot_state_feature(self) -> PolicyFeature | None: + if not self.input_features: + return None for ft_name, ft in self.input_features.items(): if ft.type is FeatureType.STATE and ft_name == OBS_STATE: return ft @@ -132,6 +138,8 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # type: igno @property def env_state_feature(self) -> PolicyFeature | None: + if not self.input_features: + return None for _, ft in self.input_features.items(): if ft.type is FeatureType.ENV: return ft @@ -139,10 +147,14 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # type: igno @property def image_features(self) -> dict[str, PolicyFeature]: + if not self.input_features: + return {} return {key: ft for key, ft in self.input_features.items() if ft.type is FeatureType.VISUAL} @property def action_feature(self) -> PolicyFeature | None: + if not self.output_features: + return None for ft_name, ft in self.output_features.items(): if ft.type is FeatureType.ACTION and ft_name == ACTION: return ft diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index cee9dfdf9..7a5eee77d 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -24,7 +24,7 @@ from huggingface_hub.errors import HfHubHTTPError from lerobot import envs from lerobot.configs import parser -from lerobot.configs.default import DatasetConfig, EvalConfig, WandBConfig +from lerobot.configs.default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig from lerobot.configs.policies import PreTrainedConfig from lerobot.optim import OptimizerConfig from lerobot.optim.schedulers import LRSchedulerConfig @@ -65,6 +65,7 @@ class TrainPipelineConfig(HubMixin): scheduler: LRSchedulerConfig | None = None eval: EvalConfig = field(default_factory=EvalConfig) wandb: WandBConfig = field(default_factory=WandBConfig) + peft: PeftConfig | None = None # RA-BC (Reward-Aligned Behavior Cloning) parameters use_rabc: bool = False # Enable reward-weighted training diff --git a/src/lerobot/configs/types.py b/src/lerobot/configs/types.py index 18359ef05..8426afe55 100644 --- a/src/lerobot/configs/types.py +++ b/src/lerobot/configs/types.py @@ -50,3 +50,8 @@ class RTCAttentionSchedule(str, Enum): ONES = "ONES" LINEAR = "LINEAR" EXP = "EXP" + + +class RTCTrainingDelayDistribution(str, Enum): + UNIFORM = "UNIFORM" + EXP = "EXP" diff --git a/src/lerobot/datasets/aggregate.py b/src/lerobot/datasets/aggregate.py index 455caf0fe..94ffe602e 100644 --- a/src/lerobot/datasets/aggregate.py +++ b/src/lerobot/datasets/aggregate.py @@ -19,6 +19,7 @@ import logging import shutil from pathlib import Path +import datasets import pandas as pd import tqdm @@ -32,6 +33,7 @@ from lerobot.datasets.utils import ( DEFAULT_VIDEO_FILE_SIZE_IN_MB, DEFAULT_VIDEO_PATH, get_file_size_in_mb, + get_hf_features_from_features, get_parquet_file_size_in_mb, to_parquet_with_hf_images, update_chunk_file_indices, @@ -402,12 +404,21 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si } unique_chunk_file_ids = sorted(unique_chunk_file_ids) + contains_images = len(dst_meta.image_keys) > 0 + + # retrieve features schema for proper image typing in parquet + hf_features = get_hf_features_from_features(dst_meta.features) if contains_images else None 
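+    # Without an explicit schema, a pandas read/write round-trip drops the HF `Image()`
+    # feature type, leaving raw bytes that HF datasets can no longer decode; the branches
+    # below therefore go through `datasets` whenever image columns are present.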
for src_chunk_idx, src_file_idx in unique_chunk_file_ids: src_path = src_meta.root / DEFAULT_DATA_PATH.format( chunk_index=src_chunk_idx, file_index=src_file_idx ) - df = pd.read_parquet(src_path) + if contains_images: + # Use HuggingFace datasets to read source data to preserve image format + src_ds = datasets.Dataset.from_parquet(str(src_path)) + df = src_ds.to_pandas() + else: + df = pd.read_parquet(src_path) df = update_data_df(df, src_meta, dst_meta) data_idx = append_or_create_parquet_file( @@ -417,8 +428,9 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si data_files_size_in_mb, chunk_size, DEFAULT_DATA_PATH, - contains_images=len(dst_meta.image_keys) > 0, + contains_images=contains_images, aggr_root=dst_meta.root, + hf_features=hf_features, ) return data_idx @@ -488,6 +500,7 @@ def append_or_create_parquet_file( default_path: str, contains_images: bool = False, aggr_root: Path = None, + hf_features: datasets.Features | None = None, ): """Appends data to an existing parquet file or creates a new one based on size constraints. @@ -503,6 +516,7 @@ def append_or_create_parquet_file( default_path: Format string for generating file paths. contains_images: Whether the data contains images requiring special handling. aggr_root: Root path for the aggregated dataset. + hf_features: Optional HuggingFace Features schema for proper image typing. Returns: dict: Updated index dictionary with current chunk and file indices. @@ -512,7 +526,7 @@ def append_or_create_parquet_file( if not dst_path.exists(): dst_path.parent.mkdir(parents=True, exist_ok=True) if contains_images: - to_parquet_with_hf_images(df, dst_path) + to_parquet_with_hf_images(df, dst_path, features=hf_features) else: df.to_parquet(dst_path) return idx @@ -527,12 +541,17 @@ def append_or_create_parquet_file( final_df = df target_path = new_path else: - existing_df = pd.read_parquet(dst_path) + if contains_images: + # Use HuggingFace datasets to read existing data to preserve image format + existing_ds = datasets.Dataset.from_parquet(str(dst_path)) + existing_df = existing_ds.to_pandas() + else: + existing_df = pd.read_parquet(dst_path) final_df = pd.concat([existing_df, df], ignore_index=True) target_path = dst_path if contains_images: - to_parquet_with_hf_images(final_df, target_path) + to_parquet_with_hf_images(final_df, target_path, features=hf_features) else: final_df.to_parquet(target_path) diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index 2fb68dca1..e2928e2a6 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -26,6 +26,7 @@ This module provides utilities for: import logging import shutil from collections.abc import Callable +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import datasets @@ -51,7 +52,8 @@ from lerobot.datasets.utils import ( write_stats, write_tasks, ) -from lerobot.utils.constants import HF_LEROBOT_HOME +from lerobot.datasets.video_utils import encode_video_frames, get_video_info +from lerobot.utils.constants import HF_LEROBOT_HOME, OBS_IMAGE def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict: @@ -1083,3 +1085,561 @@ def _copy_episodes_metadata_and_stats( else: if src_dataset.meta.stats: write_stats(src_dataset.meta.stats, dst_meta.root) + + +def _save_episode_images_for_video( + dataset: LeRobotDataset, + imgs_dir: Path, + img_key: str, + episode_index: int, + num_workers: int = 4, +) -> None: + """Save images 
from a specific episode and camera to disk for video encoding. + + Args: + dataset: The LeRobot dataset to extract images from + imgs_dir: Directory to save images to + img_key: The image key (camera) to extract + episode_index: Index of the episode to save + num_workers: Number of threads for parallel image saving + """ + # Create directory + imgs_dir.mkdir(parents=True, exist_ok=True) + + # Get dataset without torch format for PIL image access + hf_dataset = dataset.hf_dataset.with_format(None) + + # Select only this camera's images + imgs_dataset = hf_dataset.select_columns(img_key) + + # Get episode start and end indices + from_idx = dataset.meta.episodes["dataset_from_index"][episode_index] + to_idx = dataset.meta.episodes["dataset_to_index"][episode_index] + + # Get all items for this episode + episode_dataset = imgs_dataset.select(range(from_idx, to_idx)) + + # Define function to save a single image + def save_single_image(i_item_tuple): + i, item = i_item_tuple + img = item[img_key] + # Use frame-XXXXXX.png format to match encode_video_frames expectations + img.save(str(imgs_dir / f"frame-{i:06d}.png"), quality=100) + return i + + # Save images with proper naming convention for encode_video_frames (frame-XXXXXX.png) + items = list(enumerate(episode_dataset)) + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(save_single_image, item) for item in items] + for future in as_completed(futures): + future.result() # This will raise any exceptions that occurred + + +def _save_batch_episodes_images( + dataset: LeRobotDataset, + imgs_dir: Path, + img_key: str, + episode_indices: list[int], + num_workers: int = 4, +) -> list[float]: + """Save images from multiple episodes to disk for batch video encoding. + + Args: + dataset: The LeRobot dataset to extract images from + imgs_dir: Directory to save images to + img_key: The image key (camera) to extract + episode_indices: List of episode indices to save + num_workers: Number of threads for parallel image saving + + Returns: + List of episode durations in seconds + """ + imgs_dir.mkdir(parents=True, exist_ok=True) + hf_dataset = dataset.hf_dataset.with_format(None) + imgs_dataset = hf_dataset.select_columns(img_key) + + # Define function to save a single image with global frame index + # Defined once outside the loop to avoid repeated closure creation + def save_single_image(i_item_tuple, base_frame_idx, img_key_param): + i, item = i_item_tuple + img = item[img_key_param] + # Use global frame index for naming + img.save(str(imgs_dir / f"frame-{base_frame_idx + i:06d}.png"), quality=100) + return i + + episode_durations = [] + frame_idx = 0 + + for ep_idx in episode_indices: + # Get episode range + from_idx = dataset.meta.episodes["dataset_from_index"][ep_idx] + to_idx = dataset.meta.episodes["dataset_to_index"][ep_idx] + episode_length = to_idx - from_idx + episode_durations.append(episode_length / dataset.fps) + + # Get episode images + episode_dataset = imgs_dataset.select(range(from_idx, to_idx)) + + # Save images + items = list(enumerate(episode_dataset)) + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(save_single_image, item, frame_idx, img_key) for item in items] + for future in as_completed(futures): + future.result() + + frame_idx += episode_length + + return episode_durations + + +def _iter_episode_batches( + episode_indices: list[int], + episode_lengths: dict[int, int], + size_per_frame_mb: float, + video_file_size_limit: float, + max_episodes: int 
| None, + max_frames: int | None, +): + """Generator that yields batches of episode indices for video encoding. + + Groups episodes into batches that respect size and memory constraints: + - Stays under video file size limit + - Respects maximum episodes per batch (if specified) + - Respects maximum frames per batch (if specified) + + Args: + episode_indices: List of episode indices to batch + episode_lengths: Dictionary mapping episode index to episode length + size_per_frame_mb: Estimated size per frame in MB + video_file_size_limit: Maximum video file size in MB + max_episodes: Maximum number of episodes per batch (None = no limit) + max_frames: Maximum number of frames per batch (None = no limit) + + Yields: + List of episode indices for each batch + """ + batch_episodes = [] + estimated_size = 0.0 + total_frames = 0 + + for ep_idx in episode_indices: + ep_length = episode_lengths[ep_idx] + ep_estimated_size = ep_length * size_per_frame_mb + + # we check if adding this episode would exceed any constraint + would_exceed_size = estimated_size > 0 and estimated_size + ep_estimated_size >= video_file_size_limit + would_exceed_episodes = max_episodes is not None and len(batch_episodes) >= max_episodes + would_exceed_frames = max_frames is not None and total_frames + ep_length > max_frames + + if batch_episodes and (would_exceed_size or would_exceed_episodes or would_exceed_frames): + # yield current batch before adding this episode + yield batch_episodes + # start a new batch with current episode + batch_episodes = [ep_idx] + estimated_size = ep_estimated_size + total_frames = ep_length + else: + # add to current batch + batch_episodes.append(ep_idx) + estimated_size += ep_estimated_size + total_frames += ep_length + + # yield final batch if not empty + if batch_episodes: + yield batch_episodes + + +def _estimate_frame_size_via_calibration( + dataset: LeRobotDataset, + img_key: str, + episode_indices: list[int], + temp_dir: Path, + fps: int, + vcodec: str, + pix_fmt: str, + g: int, + crf: int, + fast_decode: int, + num_calibration_frames: int = 30, +) -> float: + """Estimate MB per frame by encoding a small calibration sample. + + Encodes a representative sample of frames using the exact codec parameters + to measure actual compression ratio, which is more accurate than heuristics. + + Args: + dataset: Source dataset with images. + img_key: Image key to calibrate (e.g., "observation.images.top"). + episode_indices: List of episode indices being processed. + temp_dir: Temporary directory for calibration files. + fps: Frames per second for video encoding. + vcodec: Video codec (libsvtav1, h264, hevc). + pix_fmt: Pixel format (yuv420p, etc.). + g: GOP size (group of pictures). + crf: Constant Rate Factor (quality). + fast_decode: Fast decode tuning parameter. + num_calibration_frames: Number of frames to use for calibration (default: 30). + + Returns: + Estimated size in MB per frame based on actual encoding. 
+ """ + calibration_dir = temp_dir / "calibration" / img_key + calibration_dir.mkdir(parents=True, exist_ok=True) + + try: + # Select a representative episode (prefer middle episode if available) + calibration_ep_idx = episode_indices[len(episode_indices) // 2] + + # Get episode range + from_idx = dataset.meta.episodes["dataset_from_index"][calibration_ep_idx] + to_idx = dataset.meta.episodes["dataset_to_index"][calibration_ep_idx] + episode_length = to_idx - from_idx + + # Use up to num_calibration_frames from this episode + num_frames = min(num_calibration_frames, episode_length) + + # Get frames from dataset + hf_dataset = dataset.hf_dataset.with_format(None) + sample_indices = range(from_idx, from_idx + num_frames) + + # Save calibration frames + for i, idx in enumerate(sample_indices): + img = hf_dataset[idx][img_key] + img.save(str(calibration_dir / f"frame-{i:06d}.png"), quality=100) + + # Encode calibration video + calibration_video_path = calibration_dir / "calibration.mp4" + encode_video_frames( + imgs_dir=calibration_dir, + video_path=calibration_video_path, + fps=fps, + vcodec=vcodec, + pix_fmt=pix_fmt, + g=g, + crf=crf, + fast_decode=fast_decode, + overwrite=True, + ) + + # Measure actual compressed size + video_size_bytes = calibration_video_path.stat().st_size + video_size_mb = video_size_bytes / BYTES_PER_MIB + size_per_frame_mb = video_size_mb / num_frames + + logging.info( + f" Calibration: {num_frames} frames -> {video_size_mb:.2f} MB " + f"= {size_per_frame_mb:.4f} MB/frame for {img_key}" + ) + + return size_per_frame_mb + + finally: + # Clean up calibration files + if calibration_dir.exists(): + shutil.rmtree(calibration_dir) + + +def _copy_data_without_images( + src_dataset: LeRobotDataset, + dst_meta: LeRobotDatasetMetadata, + episode_indices: list[int], + img_keys: list[str], +) -> None: + """Copy data files without image columns. 
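+
+    The parquet files keep their original chunk/file layout; only rows belonging to the
+    selected episodes are kept, and image columns are dropped since they are re-encoded
+    as videos separately.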
+
+    Args:
+        src_dataset: Source dataset
+        dst_meta: Destination metadata
+        episode_indices: Episodes to include
+        img_keys: Image keys to remove
+    """
+    from lerobot.datasets.utils import DATA_DIR
+
+    data_dir = src_dataset.root / DATA_DIR
+    parquet_files = sorted(data_dir.glob("*/*.parquet"))
+
+    if not parquet_files:
+        raise ValueError(f"No parquet files found in {data_dir}")
+
+    episode_set = set(episode_indices)
+
+    for src_path in tqdm(parquet_files, desc="Processing data files"):
+        df = pd.read_parquet(src_path).reset_index(drop=True)
+
+        # Filter to only include selected episodes
+        df = df[df["episode_index"].isin(episode_set)].copy()
+
+        if len(df) == 0:
+            continue
+
+        # Remove image columns
+        columns_to_drop = [col for col in img_keys if col in df.columns]
+        if columns_to_drop:
+            df = df.drop(columns=columns_to_drop)
+
+        # Get chunk and file indices from path
+        relative_path = src_path.relative_to(src_dataset.root)
+        chunk_dir = relative_path.parts[1]
+        file_name = relative_path.parts[2]
+        chunk_idx = int(chunk_dir.split("-")[1])
+        file_idx = int(file_name.split("-")[1].split(".")[0])
+
+        # Write to destination without pandas index
+        dst_path = dst_meta.root / f"data/chunk-{chunk_idx:03d}/file-{file_idx:03d}.parquet"
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_parquet(dst_path, index=False)
+
+
+# Video conversion constants
+BYTES_PER_KIB = 1024
+BYTES_PER_MIB = BYTES_PER_KIB * BYTES_PER_KIB
+
+
+def convert_image_to_video_dataset(
+    dataset: LeRobotDataset,
+    output_dir: Path,
+    repo_id: str | None = None,
+    vcodec: str = "libsvtav1",
+    pix_fmt: str = "yuv420p",
+    g: int = 2,
+    crf: int = 30,
+    fast_decode: int = 0,
+    episode_indices: list[int] | None = None,
+    num_workers: int = 4,
+    max_episodes_per_batch: int | None = None,
+    max_frames_per_batch: int | None = None,
+) -> LeRobotDataset:
+    """Convert an image dataset into a video dataset.
+
+    Creates a new LeRobotDataset with images encoded as videos, following the proper
+    LeRobot dataset structure with videos stored in chunked MP4 files.
+
+    Args:
+        dataset: The source LeRobot dataset with images
+        output_dir: Directory to save the new video dataset
+        repo_id: Repository ID for the new dataset (default: original_id + "_video")
+        vcodec: Video codec (default: libsvtav1)
+        pix_fmt: Pixel format (default: yuv420p)
+        g: Group of pictures size (default: 2)
+        crf: Constant rate factor (default: 30)
+        fast_decode: Fast decode tuning (default: 0)
+        episode_indices: List of episode indices to convert (None = all episodes)
+        num_workers: Number of threads for parallel processing (default: 4)
+        max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
+        max_frames_per_batch: Maximum frames per video batch to avoid memory issues (None = no limit)
+
+    Returns:
+        New LeRobotDataset with images encoded as videos
+    """
+    # Check that it's an image dataset
+    if len(dataset.meta.video_keys) > 0:
+        raise ValueError(
+            f"This operation is for image datasets only.
Video dataset provided: {dataset.repo_id}" + ) + + # Get all image keys + hf_dataset = dataset.hf_dataset.with_format(None) + img_keys = [key for key in hf_dataset.features if key.startswith(OBS_IMAGE)] + + if len(img_keys) == 0: + raise ValueError(f"No image keys found in dataset {dataset.repo_id}") + + # Determine which episodes to process + if episode_indices is None: + episode_indices = list(range(dataset.meta.total_episodes)) + + if repo_id is None: + repo_id = f"{dataset.repo_id}_video" + + logging.info( + f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}" + ) + logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}") + + # Create new features dict, converting image features to video features + new_features = {} + for key, value in dataset.meta.features.items(): + if key not in img_keys: + new_features[key] = value + else: + # Convert image key to video format + new_features[key] = value.copy() + new_features[key]["dtype"] = "video" # Change dtype from "image" to "video" + # Video info will be updated after episodes are encoded + + # Create new metadata for video dataset + new_meta = LeRobotDatasetMetadata.create( + repo_id=repo_id, + fps=dataset.meta.fps, + features=new_features, + robot_type=dataset.meta.robot_type, + root=output_dir, + use_videos=True, + chunks_size=dataset.meta.chunks_size, + data_files_size_in_mb=dataset.meta.data_files_size_in_mb, + video_files_size_in_mb=dataset.meta.video_files_size_in_mb, + ) + + # Create temporary directory for image extraction + temp_dir = output_dir / "temp_images" + temp_dir.mkdir(parents=True, exist_ok=True) + + # Process all episodes and batch encode videos + # Use dictionary for O(1) episode metadata lookups instead of O(n) linear search + all_episode_metadata = {} + fps = int(dataset.fps) + + try: + # Build episode metadata entries first + logging.info("Building episode metadata...") + cumulative_frame_idx = 0 + for ep_idx in episode_indices: + src_episode = dataset.meta.episodes[ep_idx] + ep_length = src_episode["length"] + ep_meta = { + "episode_index": ep_idx, + "length": ep_length, + "dataset_from_index": cumulative_frame_idx, + "dataset_to_index": cumulative_frame_idx + ep_length, + } + if "data/chunk_index" in src_episode: + ep_meta["data/chunk_index"] = src_episode["data/chunk_index"] + ep_meta["data/file_index"] = src_episode["data/file_index"] + all_episode_metadata[ep_idx] = ep_meta + cumulative_frame_idx += ep_length + + # Process each camera and batch encode multiple episodes together + video_file_size_limit = new_meta.video_files_size_in_mb + + # Pre-compute episode lengths for batching + episode_lengths = {ep_idx: dataset.meta.episodes["length"][ep_idx] for ep_idx in episode_indices} + + for img_key in tqdm(img_keys, desc="Processing cameras"): + # Estimate size per frame by encoding a small calibration sample + # This provides accurate compression ratio for the specific codec parameters + size_per_frame_mb = _estimate_frame_size_via_calibration( + dataset=dataset, + img_key=img_key, + episode_indices=episode_indices, + temp_dir=temp_dir, + fps=fps, + vcodec=vcodec, + pix_fmt=pix_fmt, + g=g, + crf=crf, + fast_decode=fast_decode, + ) + + logging.info(f"Processing camera: {img_key}") + chunk_idx, file_idx = 0, 0 + cumulative_timestamp = 0.0 + + # Process episodes in batches to stay under size limit + for batch_episodes in _iter_episode_batches( + episode_indices=episode_indices, + episode_lengths=episode_lengths, + 
size_per_frame_mb=size_per_frame_mb, + video_file_size_limit=video_file_size_limit, + max_episodes=max_episodes_per_batch, + max_frames=max_frames_per_batch, + ): + total_frames_in_batch = sum(episode_lengths[idx] for idx in batch_episodes) + logging.info( + f" Encoding batch of {len(batch_episodes)} episodes " + f"({batch_episodes[0]}-{batch_episodes[-1]}) = {total_frames_in_batch} frames" + ) + + # Save images for all episodes in this batch + imgs_dir = temp_dir / f"batch_{chunk_idx}_{file_idx}" / img_key + episode_durations = _save_batch_episodes_images( + dataset=dataset, + imgs_dir=imgs_dir, + img_key=img_key, + episode_indices=batch_episodes, + num_workers=num_workers, + ) + + # Encode all batched episodes into single video + video_path = new_meta.root / new_meta.video_path.format( + video_key=img_key, chunk_index=chunk_idx, file_index=file_idx + ) + video_path.parent.mkdir(parents=True, exist_ok=True) + + encode_video_frames( + imgs_dir=imgs_dir, + video_path=video_path, + fps=fps, + vcodec=vcodec, + pix_fmt=pix_fmt, + g=g, + crf=crf, + fast_decode=fast_decode, + overwrite=True, + ) + + # Clean up temporary images + shutil.rmtree(imgs_dir) + + # Update metadata for each episode in the batch + for ep_idx, duration in zip(batch_episodes, episode_durations, strict=True): + from_timestamp = cumulative_timestamp + to_timestamp = cumulative_timestamp + duration + cumulative_timestamp = to_timestamp + + # Find episode metadata entry and add video metadata (O(1) dictionary lookup) + ep_meta = all_episode_metadata[ep_idx] + ep_meta[f"videos/{img_key}/chunk_index"] = chunk_idx + ep_meta[f"videos/{img_key}/file_index"] = file_idx + ep_meta[f"videos/{img_key}/from_timestamp"] = from_timestamp + ep_meta[f"videos/{img_key}/to_timestamp"] = to_timestamp + + # Move to next video file for next batch + chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, new_meta.chunks_size) + cumulative_timestamp = 0.0 + + # Copy and transform data files (removing image columns) + _copy_data_without_images(dataset, new_meta, episode_indices, img_keys) + + # Save episode metadata + episodes_df = pd.DataFrame(list(all_episode_metadata.values())) + episodes_path = new_meta.root / "meta" / "episodes" / "chunk-000" / "file-000.parquet" + episodes_path.parent.mkdir(parents=True, exist_ok=True) + episodes_df.to_parquet(episodes_path, index=False) + + # Update metadata info + new_meta.info["total_episodes"] = len(episode_indices) + new_meta.info["total_frames"] = sum(ep["length"] for ep in all_episode_metadata.values()) + new_meta.info["total_tasks"] = dataset.meta.total_tasks + new_meta.info["splits"] = {"train": f"0:{len(episode_indices)}"} + + # Update video info for all image keys (now videos) + # We need to manually set video info since update_video_info() checks video_keys first + for img_key in img_keys: + if not new_meta.features[img_key].get("info", None): + video_path = new_meta.root / new_meta.video_path.format( + video_key=img_key, chunk_index=0, file_index=0 + ) + new_meta.info["features"][img_key]["info"] = get_video_info(video_path) + + write_info(new_meta.info, new_meta.root) + + # Copy stats and tasks + if dataset.meta.stats is not None: + # Remove image stats + new_stats = {k: v for k, v in dataset.meta.stats.items() if k not in img_keys} + write_stats(new_stats, new_meta.root) + + if dataset.meta.tasks is not None: + write_tasks(dataset.meta.tasks, new_meta.root) + + finally: + # Clean up temporary directory + if temp_dir.exists(): + shutil.rmtree(temp_dir) + + logging.info(f"Completed 
converting {dataset.repo_id} to video format")
    logging.info(f"New dataset saved to: {output_dir}")

    # Return new dataset
    return LeRobotDataset(repo_id=repo_id, root=output_dir)
diff --git a/src/lerobot/datasets/pipeline_features.py b/src/lerobot/datasets/pipeline_features.py
index 4fad7bd20..161633f26 100644
--- a/src/lerobot/datasets/pipeline_features.py
+++ b/src/lerobot/datasets/pipeline_features.py
@@ -18,12 +18,12 @@ from typing import Any

 from lerobot.configs.types import PipelineFeatureType
 from lerobot.datasets.utils import hw_to_dataset_features
-from lerobot.processor import DataProcessorPipeline
+from lerobot.processor import DataProcessorPipeline, RobotAction, RobotObservation
 from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE, OBS_STR


 def create_initial_features(
-    action: dict[str, Any] | None = None, observation: dict[str, Any] | None = None
+    action: RobotAction | None = None, observation: RobotObservation | None = None
 ) -> dict[PipelineFeatureType, dict[str, Any]]:
     """
     Creates the initial features dict for the dataset from action and observation specs.
diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py
index 234736a75..ed678af6e 100644
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -1172,12 +1172,21 @@ def validate_episode_buffer(episode_buffer: dict, total_episodes: int, features:
     )


-def to_parquet_with_hf_images(df: pandas.DataFrame, path: Path) -> None:
+def to_parquet_with_hf_images(
+    df: pandas.DataFrame, path: Path, features: datasets.Features | None = None
+) -> None:
     """This function correctly writes to parquet a pandas DataFrame that contains images encoded by HF dataset.

     This way, it can be loaded by HF dataset and correctly formatted images are returned.
+
+    Args:
+        df: DataFrame to write to parquet.
+        path: Path to write the parquet file.
+        features: Optional HuggingFace Features schema. If provided, ensures image columns
+            are properly typed as Image() in the parquet schema.
     """
     # TODO(qlhoest): replace this weird syntax by `df.to_parquet(path)` only
-    datasets.Dataset.from_dict(df.to_dict(orient="list")).to_parquet(path)
+    ds = datasets.Dataset.from_dict(df.to_dict(orient="list"), features=features)
+    ds.to_parquet(path)


 def item_to_torch(item: dict) -> dict:
diff --git a/src/lerobot/envs/__init__.py b/src/lerobot/envs/__init__.py
index d767b6e8c..183c12325 100644
--- a/src/lerobot/envs/__init__.py
+++ b/src/lerobot/envs/__init__.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .configs import AlohaEnv, EnvConfig, PushtEnv  # noqa: F401
+from .configs import AlohaEnv, EnvConfig, HubEnvConfig, PushtEnv  # noqa: F401
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 4323f3316..cd88b37bc 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 import abc
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
 from typing import Any

 import draccus
@@ -68,6 +68,22 @@ class EnvConfig(draccus.ChoiceRegistry, abc.ABC):
         raise NotImplementedError()


+@dataclass
+class HubEnvConfig(EnvConfig):
+    """Base class for environments that delegate creation to a hub-hosted make_env.
+
+    Hub environments download and execute remote code from the HF Hub.
+    The hub_path points to a repository containing an env.py with a make_env function.
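+
+    Illustrative `hub_path` forms (mirroring the factory's string handling):
+    "username/repo", or "username/repo@branch:env.py" to pin a revision and entry
+    file. The hub's `make_env` is only downloaded and executed when
+    `trust_remote_code=True` is passed explicitly; when an EnvConfig is available,
+    it is forwarded as `make_env(n_envs=..., use_async_envs=..., cfg=...)`.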
+ """ + + hub_path: str | None = None # required: e.g., "username/repo" or "username/repo@branch:file.py" + + @property + def gym_kwargs(self) -> dict: + # Not used for hub environments - the hub's make_env handles everything + return {} + + @EnvConfig.register_subclass("aloha") @dataclass class AlohaEnv(EnvConfig): @@ -244,6 +260,7 @@ class HILSerlRobotEnvConfig(EnvConfig): @dataclass class LiberoEnv(EnvConfig): task: str = "libero_10" # can also choose libero_spatial, libero_object, etc. + task_ids: list[int] | None = None fps: int = 30 episode_length: int | None = None obs_type: str = "pixels_agent_pos" @@ -322,10 +339,10 @@ class LiberoEnv(EnvConfig): @property def gym_kwargs(self) -> dict: - return { - "obs_type": self.obs_type, - "render_mode": self.render_mode, - } + kwargs: dict[str, Any] = {"obs_type": self.obs_type, "render_mode": self.render_mode} + if self.task_ids is not None: + kwargs["task_ids"] = self.task_ids + return kwargs @EnvConfig.register_subclass("metaworld") @@ -368,3 +385,71 @@ class MetaworldEnv(EnvConfig): "obs_type": self.obs_type, "render_mode": self.render_mode, } + + +@EnvConfig.register_subclass("isaaclab_arena") +@dataclass +class IsaaclabArenaEnv(HubEnvConfig): + hub_path: str = "nvidia/isaaclab-arena-envs" + episode_length: int = 300 + num_envs: int = 1 + embodiment: str | None = "gr1_pink" + object: str | None = "power_drill" + mimic: bool = False + teleop_device: str | None = None + seed: int | None = 42 + device: str | None = "cuda:0" + disable_fabric: bool = False + enable_cameras: bool = False + headless: bool = False + enable_pinocchio: bool = True + environment: str | None = "gr1_microwave" + task: str | None = "Reach out to the microwave and open it." + state_dim: int = 54 + action_dim: int = 36 + camera_height: int = 512 + camera_width: int = 512 + video: bool = False + video_length: int = 100 + video_interval: int = 200 + # Comma-separated keys, e.g., "robot_joint_pos,left_eef_pos" + state_keys: str = "robot_joint_pos" + # Comma-separated keys, e.g., "robot_pov_cam_rgb,front_cam_rgb" + # Set to None or "" for environments without cameras + camera_keys: str | None = None + features: dict[str, PolicyFeature] = field(default_factory=dict) + features_map: dict[str, str] = field(default_factory=dict) + kwargs: dict | None = None + + def __post_init__(self): + if self.kwargs: + # dynamically convert kwargs to fields in the dataclass + # NOTE! 
the new fields will not be seen by the dataclass repr
+            field_names = {f.name for f in fields(self)}
+            for key, value in self.kwargs.items():
+                if key not in field_names and key != "kwargs":
+                    setattr(self, key, value)
+            self.kwargs = None
+
+        # Set action feature
+        self.features[ACTION] = PolicyFeature(type=FeatureType.ACTION, shape=(self.action_dim,))
+        self.features_map[ACTION] = ACTION
+
+        # Set state feature
+        self.features[OBS_STATE] = PolicyFeature(type=FeatureType.STATE, shape=(self.state_dim,))
+        self.features_map[OBS_STATE] = OBS_STATE
+
+        # Add camera features for each camera key
+        if self.enable_cameras and self.camera_keys:
+            for cam_key in self.camera_keys.split(","):
+                cam_key = cam_key.strip()
+                if cam_key:
+                    self.features[cam_key] = PolicyFeature(
+                        type=FeatureType.VISUAL,
+                        shape=(self.camera_height, self.camera_width, 3),
+                    )
+                    self.features_map[cam_key] = f"{OBS_IMAGES}.{cam_key}"
+
+    @property
+    def gym_kwargs(self) -> dict:
+        return {}
diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py
index b39cfee71..1c59ccb7d 100644
--- a/src/lerobot/envs/factory.py
+++ b/src/lerobot/envs/factory.py
@@ -20,11 +20,11 @@ import gymnasium as gym
 from gymnasium.envs.registration import registry as gym_registry

 from lerobot.configs.policies import PreTrainedConfig
-from lerobot.envs.configs import AlohaEnv, EnvConfig, LiberoEnv, PushtEnv
+from lerobot.envs.configs import AlohaEnv, EnvConfig, HubEnvConfig, IsaaclabArenaEnv, LiberoEnv, PushtEnv
 from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result
 from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 from lerobot.processor import ProcessorStep
-from lerobot.processor.env_processor import LiberoProcessorStep
+from lerobot.processor.env_processor import IsaaclabArenaProcessorStep, LiberoProcessorStep
 from lerobot.processor.pipeline import PolicyProcessorPipeline


@@ -73,6 +73,26 @@ def make_env_pre_post_processors(
     if isinstance(env_cfg, LiberoEnv) or "libero" in env_cfg.type:
         preprocessor_steps.append(LiberoProcessorStep())

+    # For Isaaclab Arena environments, add the IsaaclabArenaProcessorStep
+    if isinstance(env_cfg, IsaaclabArenaEnv) or "isaaclab_arena" in env_cfg.type:
+        # Parse comma-separated keys (handle None for state-based policies)
+        if env_cfg.state_keys:
+            state_keys = tuple(k.strip() for k in env_cfg.state_keys.split(",") if k.strip())
+        else:
+            state_keys = ()
+        if env_cfg.camera_keys:
+            camera_keys = tuple(k.strip() for k in env_cfg.camera_keys.split(",") if k.strip())
+        else:
+            camera_keys = ()
+        if not state_keys and not camera_keys:
+            raise ValueError("At least one of state_keys or camera_keys must be specified.")
+        preprocessor_steps.append(
+            IsaaclabArenaProcessorStep(
+                state_keys=state_keys,
+                camera_keys=camera_keys,
+            )
+        )
+
     preprocessor = PolicyProcessorPipeline(steps=preprocessor_steps)
     postprocessor = PolicyProcessorPipeline(steps=postprocessor_steps)

@@ -98,7 +118,6 @@ def make_env(
         hub_cache_dir (str | None): Optional cache path for downloaded hub files.
         trust_remote_code (bool): **Explicit consent** to execute remote code from the Hub.
            Default False — must be set to True to import/exec hub `env.py`.
- Raises: ValueError: if n_envs < 1 ModuleNotFoundError: If the requested env package is not installed @@ -112,19 +131,35 @@ def make_env( """ # if user passed a hub id string (e.g., "username/repo", "username/repo@main:env.py") # simplified: only support hub-provided `make_env` + # TODO: (jadechoghari): deprecate string API and remove this check if isinstance(cfg, str): + hub_path: str | None = cfg + elif isinstance(cfg, HubEnvConfig): + hub_path = cfg.hub_path + else: + hub_path = None + + # If hub_path is set, download and call hub-provided `make_env` + if hub_path: # _download_hub_file will raise the same RuntimeError if trust_remote_code is False - repo_id, file_path, local_file, revision = _download_hub_file(cfg, trust_remote_code, hub_cache_dir) + repo_id, file_path, local_file, revision = _download_hub_file( + hub_path, trust_remote_code, hub_cache_dir + ) # import and surface clear import errors module = _import_hub_module(local_file, repo_id) # call the hub-provided make_env - raw_result = _call_make_env(module, n_envs=n_envs, use_async_envs=use_async_envs) + env_cfg = None if isinstance(cfg, str) else cfg + raw_result = _call_make_env(module, n_envs=n_envs, use_async_envs=use_async_envs, cfg=env_cfg) # normalize the return into {suite: {task_id: vec_env}} return _normalize_hub_result(raw_result) + # At this point, cfg must be an EnvConfig (not a string) since hub_path would have been set otherwise + if isinstance(cfg, str): + raise TypeError("cfg should be an EnvConfig at this point") + if n_envs < 1: raise ValueError("`n_envs` must be at least 1") diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index b1eb37377..96c5cf102 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -29,6 +29,8 @@ from gymnasium import spaces from libero.libero import benchmark, get_libero_path from libero.libero.envs import OffScreenRenderEnv +from lerobot.processor import RobotObservation + def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: """Normalize camera_name into a non-empty list of strings.""" @@ -237,7 +239,7 @@ class LiberoEnv(gym.Env): env.reset() return env - def _format_raw_obs(self, raw_obs: dict[str, Any]) -> dict[str, Any]: + def _format_raw_obs(self, raw_obs: RobotObservation) -> RobotObservation: images = {} for camera_name in self.camera_name: image = raw_obs[camera_name] @@ -291,9 +293,9 @@ class LiberoEnv(gym.Env): def reset(self, seed=None, **kwargs): super().reset(seed=seed) self._env.seed(seed) - if self.init_states and self._init_states is not None: - self._env.set_init_state(self._init_states[self._init_state_id]) raw_obs = self._env.reset() + if self.init_states and self._init_states is not None: + raw_obs = self._env.set_init_state(self._init_states[self._init_state_id]) # After reset, objects may be unstable (slightly floating, intersecting, etc.). # Step the simulator with a no-op action for a few frames so everything settles. 
@@ -313,7 +315,7 @@ class LiberoEnv(gym.Env): info = {"is_success": False} return observation, info - def step(self, action: np.ndarray) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]: + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: if action.ndim != 1: raise ValueError( f"Expected action to be 1-D (shape (action_dim,)), " diff --git a/src/lerobot/envs/metaworld.py b/src/lerobot/envs/metaworld.py index 9190f33ad..4d91e002d 100644 --- a/src/lerobot/envs/metaworld.py +++ b/src/lerobot/envs/metaworld.py @@ -25,6 +25,8 @@ import metaworld.policies as policies import numpy as np from gymnasium import spaces +from lerobot.processor import RobotObservation + # ---- Load configuration data from the external JSON file ---- CONFIG_PATH = Path(__file__).parent / "metaworld_config.json" try: @@ -161,7 +163,7 @@ class MetaworldEnv(gym.Env): env._freeze_rand_vec = False # otherwise no randomization return env - def _format_raw_obs(self, raw_obs: np.ndarray) -> dict[str, Any]: + def _format_raw_obs(self, raw_obs: np.ndarray) -> RobotObservation: image = None if self._env is not None: image = self._env.render() @@ -196,7 +198,7 @@ class MetaworldEnv(gym.Env): self, seed: int | None = None, **kwargs, - ) -> tuple[dict[str, Any], dict[str, Any]]: + ) -> tuple[RobotObservation, dict[str, Any]]: """ Reset the environment to its initial state. @@ -204,7 +206,7 @@ class MetaworldEnv(gym.Env): seed (Optional[int]): Random seed for environment initialization. Returns: - observation (Dict[str, Any]): The initial formatted observation. + observation (RobotObservation): The initial formatted observation. info (Dict[str, Any]): Additional info about the reset state. """ super().reset(seed=seed) @@ -216,7 +218,7 @@ class MetaworldEnv(gym.Env): info = {"is_success": False} return observation, info - def step(self, action: np.ndarray) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]: + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: """ Perform one environment step. @@ -224,7 +226,7 @@ class MetaworldEnv(gym.Env): action (np.ndarray): The action to execute, must be 1-D with shape (action_dim,). Returns: - observation (Dict[str, Any]): The formatted observation after the step. + observation (RobotObservation): The formatted observation after the step. reward (float): The scalar reward for this step. terminated (bool): Whether the episode terminated successfully. truncated (bool): Whether the episode was truncated due to a time limit. diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index 8d0f24922..09431a18d 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -29,6 +29,7 @@ from torch import Tensor from lerobot.configs.types import FeatureType, PolicyFeature from lerobot.envs.configs import EnvConfig +from lerobot.processor import RobotObservation from lerobot.utils.constants import OBS_ENV_STATE, OBS_IMAGE, OBS_IMAGES, OBS_STATE, OBS_STR from lerobot.utils.utils import get_channel_first_image_shape @@ -46,7 +47,7 @@ def _convert_nested_dict(d): def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Tensor]: - # TODO(aliberts, rcadene): refactor this to use features from the environment (no hardcoding) + # TODO(jadechoghari, imstevenpmwork): refactor this to use features from the environment (no hardcoding) """Convert environment observation to LeRobot format observation. 
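
    IsaacLab Arena observations arrive with top-level "policy" and "camera_obs" entries;
    these are passed through under the observation namespace unchanged (see the handling
    below) rather than being remapped key-by-key.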
Args: observation: Dictionary of observation batches from a Gym vector environment. @@ -98,11 +99,19 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten if "robot_state" in observations: return_observations[f"{OBS_STR}.robot_state"] = _convert_nested_dict(observations["robot_state"]) + + # Handle IsaacLab Arena format: observations have 'policy' and 'camera_obs' keys + if "policy" in observations: + return_observations[f"{OBS_STR}.policy"] = observations["policy"] + + if "camera_obs" in observations: + return_observations[f"{OBS_STR}.camera_obs"] = observations["camera_obs"] + return return_observations def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]: - # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is + # TODO(jadechoghari, imstevenpmwork): remove this hardcoding of keys and just use the nested keys as is # (need to also refactor preprocess_observation and externalize normalization from policies) policy_features = {} for key, ft in env_cfg.features.items(): @@ -144,7 +153,7 @@ def check_env_attributes_and_types(env: gym.vector.VectorEnv) -> None: ) -def add_envs_task(env: gym.vector.VectorEnv, observation: dict[str, Any]) -> dict[str, Any]: +def add_envs_task(env: gym.vector.VectorEnv, observation: RobotObservation) -> RobotObservation: """Adds task feature to the observation dict with respect to the first environment attribute.""" if hasattr(env.envs[0], "task_description"): task_result = env.call("task_description") @@ -302,7 +311,7 @@ def _import_hub_module(local_file: str, repo_id: str) -> Any: return module -def _call_make_env(module: Any, n_envs: int, use_async_envs: bool) -> Any: +def _call_make_env(module: Any, n_envs: int, use_async_envs: bool, cfg: EnvConfig | None) -> Any: """ Ensure module exposes make_env and call it. """ @@ -311,7 +320,11 @@ def _call_make_env(module: Any, n_envs: int, use_async_envs: bool) -> Any: f"The hub module {getattr(module, '__name__', 'hub_module')} must expose `make_env(n_envs=int, use_async_envs=bool)`." 
) entry_fn = module.make_env - return entry_fn(n_envs=n_envs, use_async_envs=use_async_envs) + # Only pass cfg if it's not None (i.e., when an EnvConfig was provided, not a string hub ID) + if cfg is not None: + return entry_fn(n_envs=n_envs, use_async_envs=use_async_envs, cfg=cfg) + else: + return entry_fn(n_envs=n_envs, use_async_envs=use_async_envs) def _normalize_hub_result(result: Any) -> dict[str, dict[int, gym.vector.VectorEnv]]: diff --git a/src/lerobot/motors/calibration_gui.py b/src/lerobot/motors/calibration_gui.py index 9832a1636..02bba454f 100644 --- a/src/lerobot/motors/calibration_gui.py +++ b/src/lerobot/motors/calibration_gui.py @@ -18,7 +18,7 @@ from dataclasses import dataclass os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" -from lerobot.motors import MotorCalibration, MotorsBus +from .motors_bus import MotorCalibration, MotorsBus BAR_LEN, BAR_THICKNESS = 450, 8 HANDLE_R = 10 diff --git a/src/lerobot/motors/feetech/tables.py b/src/lerobot/motors/feetech/tables.py index 91e844a72..56500e527 100644 --- a/src/lerobot/motors/feetech/tables.py +++ b/src/lerobot/motors/feetech/tables.py @@ -205,6 +205,7 @@ MODEL_BAUDRATE_TABLE = { # Sign-Magnitude encoding bits STS_SMS_SERIES_ENCODINGS_TABLE = { + "Present_Load": 10, "Homing_Offset": 11, "Goal_Position": 15, "Goal_Velocity": 15, diff --git a/src/lerobot/policies/__init__.py b/src/lerobot/policies/__init__.py index 99275e787..c7951f028 100644 --- a/src/lerobot/policies/__init__.py +++ b/src/lerobot/policies/__init__.py @@ -16,6 +16,7 @@ from .act.configuration_act import ACTConfig as ACTConfig from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig from .groot.configuration_groot import GrootConfig as GrootConfig from .pi0.configuration_pi0 import PI0Config as PI0Config +from .pi0_fast.configuration_pi0_fast import PI0FastConfig as PI0FastConfig from .pi05.configuration_pi05 import PI05Config as PI05Config from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig from .smolvla.processor_smolvla import SmolVLANewLineProcessor @@ -29,6 +30,7 @@ __all__ = [ "DiffusionConfig", "PI0Config", "PI05Config", + "PI0FastConfig", "SmolVLAConfig", "SARMConfig", "TDMPCConfig", diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 3e24656fc..a593e5bcb 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -51,7 +51,11 @@ from lerobot.processor.converters import ( transition_to_batch, transition_to_policy_action, ) -from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME +from lerobot.utils.constants import ( + ACTION, + POLICY_POSTPROCESSOR_DEFAULT_NAME, + POLICY_PREPROCESSOR_DEFAULT_NAME, +) def get_policy_class(name: str) -> type[PreTrainedPolicy]: @@ -91,6 +95,10 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]: from lerobot.policies.pi0.modeling_pi0 import PI0Policy return PI0Policy + elif name == "pi0_fast": + from lerobot.policies.pi0_fast.modeling_pi0_fast import PI0FastPolicy + + return PI0FastPolicy elif name == "pi05": from lerobot.policies.pi05.modeling_pi05 import PI05Policy @@ -246,7 +254,7 @@ def make_pre_post_processors( } # Also ensure postprocessing slices to env action dim and unnormalizes with dataset stats - env_action_dim = policy_cfg.output_features["action"].shape[0] + env_action_dim = policy_cfg.output_features[ACTION].shape[0] postprocessor_overrides["groot_action_unpack_unnormalize_v1"] = { "stats": kwargs.get("dataset_stats"), "normalize_min_max": True, 
@@ -471,11 +479,40 @@ def make_policy( if ds_meta is not None: kwargs["dataset_meta"] = ds_meta - if cfg.pretrained_path: + if not cfg.pretrained_path and cfg.use_peft: + raise ValueError( + "Instantiating a policy with `use_peft=True` without a checkpoint is not supported since that requires " + "the PEFT config parameters to be set. For training with PEFT, see `lerobot_train.py` on how to do that." + ) + + if cfg.pretrained_path and not cfg.use_peft: # Load a pretrained policy and override the config if needed (for example, if there are inference-time # hyperparameters that we want to vary). kwargs["pretrained_name_or_path"] = cfg.pretrained_path policy = policy_cls.from_pretrained(**kwargs) + elif cfg.pretrained_path and cfg.use_peft: + # Load a pretrained PEFT model on top of the policy. The pretrained path points to the folder/repo + # of the adapter and the adapter's config contains the path to the base policy. So we need the + # adapter config first, then load the correct policy and then apply PEFT. + from peft import PeftConfig, PeftModel + + logging.info("Loading policy's PEFT adapter.") + + peft_pretrained_path = cfg.pretrained_path + peft_config = PeftConfig.from_pretrained(peft_pretrained_path) + + kwargs["pretrained_name_or_path"] = peft_config.base_model_name_or_path + if not kwargs["pretrained_name_or_path"]: + # This means that there's a bug or we trained a policy from scratch using PEFT. + # It is more likely that this is a bug so we'll raise an error. + raise ValueError( + "No pretrained model name found in adapter config. Can't instantiate the pre-trained policy on which " + "the adapter was trained." + ) + + policy = policy_cls.from_pretrained(**kwargs) + policy = PeftModel.from_pretrained(policy, peft_pretrained_path, config=peft_config) + else: # Make a fresh policy. policy = policy_cls(**kwargs) diff --git a/src/lerobot/policies/groot/configuration_groot.py b/src/lerobot/policies/groot/configuration_groot.py index 8002c69ea..4f3d78222 100644 --- a/src/lerobot/policies/groot/configuration_groot.py +++ b/src/lerobot/policies/groot/configuration_groot.py @@ -20,6 +20,7 @@ from lerobot.configs.policies import PreTrainedConfig from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig +from lerobot.utils.constants import ACTION, OBS_STATE @PreTrainedConfig.register_subclass("groot") @@ -137,14 +138,14 @@ class GrootConfig(PreTrainedConfig): "No features of type FeatureType.VISUAL found in input_features." ) - if "observation.state" not in self.input_features: + if OBS_STATE not in self.input_features: state_feature = PolicyFeature( type=FeatureType.STATE, shape=(self.max_state_dim,), ) - self.input_features["observation.state"] = state_feature + self.input_features[OBS_STATE] = state_feature else: - state_shape = self.input_features["observation.state"].shape + state_shape = self.input_features[OBS_STATE].shape state_dim = state_shape[0] if state_shape else 0 if state_dim > self.max_state_dim: raise ValueError( @@ -152,14 +153,14 @@ class GrootConfig(PreTrainedConfig): f"Either reduce state dimension or increase max_state_dim in config." 
) - if "action" not in self.output_features: + if ACTION not in self.output_features: action_feature = PolicyFeature( type=FeatureType.ACTION, shape=(self.max_action_dim,), ) - self.output_features["action"] = action_feature + self.output_features[ACTION] = action_feature else: - action_shape = self.output_features["action"].shape + action_shape = self.output_features[ACTION].shape action_dim = action_shape[0] if action_shape else 0 if action_dim > self.max_action_dim: raise ValueError( diff --git a/src/lerobot/policies/groot/groot_n1.py b/src/lerobot/policies/groot/groot_n1.py index 0da26874e..06ff5a04d 100644 --- a/src/lerobot/policies/groot/groot_n1.py +++ b/src/lerobot/policies/groot/groot_n1.py @@ -46,7 +46,7 @@ from lerobot.policies.groot.action_head.flow_matching_action_head import ( FlowmatchingActionHeadConfig, ) from lerobot.policies.groot.utils import ensure_eagle_cache_ready -from lerobot.utils.constants import HF_LEROBOT_HOME +from lerobot.utils.constants import ACTION, HF_LEROBOT_HOME DEFAULT_VENDOR_EAGLE_PATH = str((Path(__file__).resolve().parent / "eagle2_hg_model").resolve()) DEFAULT_TOKENIZER_ASSETS_REPO = "lerobot/eagle2hg-processor-groot-n1p5" @@ -227,8 +227,8 @@ class GR00TN15(PreTrainedModel): detected_error = False error_msg = ERROR_MSG - if "action" in inputs: - action = inputs["action"] + if ACTION in inputs: + action = inputs[ACTION] # In inference, action may be omitted or None; validate only when it's a tensor. if action is None: pass # allow None during inference diff --git a/src/lerobot/policies/groot/modeling_groot.py b/src/lerobot/policies/groot/modeling_groot.py index bdaef37b9..9a479b8f9 100644 --- a/src/lerobot/policies/groot/modeling_groot.py +++ b/src/lerobot/policies/groot/modeling_groot.py @@ -32,15 +32,22 @@ Notes: from LeRobot, see `GrootPolicy.finetune_with_groot_runner` below. """ +import builtins import os from collections import deque +from pathlib import Path +from typing import TypeVar import torch from torch import Tensor +from lerobot.configs.types import FeatureType, PolicyFeature from lerobot.policies.groot.configuration_groot import GrootConfig from lerobot.policies.groot.groot_n1 import GR00TN15 from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.utils.constants import ACTION, OBS_IMAGES + +T = TypeVar("T", bound="GrootPolicy") class GrootPolicy(PreTrainedPolicy): @@ -89,6 +96,129 @@ class GrootPolicy(PreTrainedPolicy): """Reset policy state when environment resets.""" self._action_queue = deque([], maxlen=self.config.n_action_steps) + @classmethod + def from_pretrained( + cls: builtins.type[T], + pretrained_name_or_path: str | Path, + *, + config: GrootConfig | None = None, + force_download: bool = False, + resume_download: bool | None = None, + proxies: dict | None = None, + token: str | bool | None = None, + cache_dir: str | Path | None = None, + local_files_only: bool = False, + revision: str | None = None, + strict: bool = True, + **kwargs, + ) -> T: + """Load Groot policy from pretrained model. + + Handles two cases: + 1. Base GR00T models (e.g., 'nvidia/GR00T-N1.5-3B') - loads the raw model + 2. Fine-tuned LeRobot checkpoints - loads config and weights from safetensors + + Args: + pretrained_name_or_path: Path to the GR00T model or fine-tuned checkpoint + config: Optional GrootConfig. 
If None, loads from checkpoint or creates default + force_download: Force download even if cached + resume_download: Resume interrupted download + proxies: Proxy settings + token: HuggingFace authentication token + cache_dir: Cache directory path + local_files_only: Only use local files + revision: Specific model revision + strict: Strict state dict loading + **kwargs: Additional arguments (passed to config) + + Returns: + Initialized GrootPolicy instance with loaded model + """ + from huggingface_hub import hf_hub_download + from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE + from huggingface_hub.errors import HfHubHTTPError + + print( + "The Groot policy is a wrapper around Nvidia's GR00T N1.5 model.\n" + f"Loading pretrained model from: {pretrained_name_or_path}" + ) + + model_id = str(pretrained_name_or_path) + is_finetuned_checkpoint = False + + # Check if this is a fine-tuned LeRobot checkpoint (has model.safetensors) + try: + if os.path.isdir(model_id): + is_finetuned_checkpoint = os.path.exists(os.path.join(model_id, SAFETENSORS_SINGLE_FILE)) + else: + # Try to download the safetensors file to check if it exists + try: + hf_hub_download( + repo_id=model_id, + filename=SAFETENSORS_SINGLE_FILE, + revision=revision, + cache_dir=cache_dir, + force_download=False, # Just check, don't force download + proxies=proxies, + token=token, + local_files_only=local_files_only, + ) + is_finetuned_checkpoint = True + except HfHubHTTPError: + is_finetuned_checkpoint = False + except Exception: + is_finetuned_checkpoint = False + + if is_finetuned_checkpoint: + # This is a fine-tuned LeRobot checkpoint - use parent class loading + print("Detected fine-tuned LeRobot checkpoint, loading with state dict...") + return super().from_pretrained( + pretrained_name_or_path=pretrained_name_or_path, + config=config, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + revision=revision, + strict=strict, + **kwargs, + ) + + # This is a base GR00T model - load it fresh + print("Detected base GR00T model, loading from HuggingFace...") + + if config is None: + # Create default config with the pretrained path + config = GrootConfig(base_model_path=str(pretrained_name_or_path)) + + # Add minimal visual feature required for validation + # validate_features() will automatically add state and action features + # These are placeholders - actual robot features come from the preprocessor + if not config.input_features: + config.input_features = { + f"{OBS_IMAGES}.camera": PolicyFeature( + type=FeatureType.VISUAL, + shape=(3, 224, 224), # Default image size from config + ), + } + else: + # Override the base_model_path with the provided path + config.base_model_path = str(pretrained_name_or_path) + + # Pass through any additional config overrides from kwargs + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + + # Create a fresh policy instance - this will automatically load the GR00T model + # in __init__ via _create_groot_model() + policy = cls(config) + + policy.eval() + return policy + def get_optim_params(self) -> dict: return self.parameters() @@ -147,7 +277,7 @@ class GrootPolicy(PreTrainedPolicy): actions = outputs.get("action_pred") - original_action_dim = self.config.output_features["action"].shape[0] + original_action_dim = self.config.output_features[ACTION].shape[0] actions = actions[:, :, :original_action_dim] return actions diff --git 
a/src/lerobot/policies/groot/processor_groot.py b/src/lerobot/policies/groot/processor_groot.py index d87e43c11..14149cf2f 100644 --- a/src/lerobot/policies/groot/processor_groot.py +++ b/src/lerobot/policies/groot/processor_groot.py @@ -51,7 +51,11 @@ from lerobot.processor.converters import ( ) from lerobot.processor.core import EnvTransition, TransitionKey from lerobot.utils.constants import ( + ACTION, HF_LEROBOT_HOME, + OBS_IMAGE, + OBS_IMAGES, + OBS_STATE, POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME, ) @@ -107,9 +111,9 @@ def make_groot_pre_post_processors( # Define feature specs for optional normalization steps _features: dict[str, PolicyFeature] = { # Observation features (only add those we may normalize) - "observation.state": PolicyFeature(type=FeatureType.STATE, shape=(state_horizon, max_state_dim)), + OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(state_horizon, max_state_dim)), # Action feature - "action": PolicyFeature(type=FeatureType.ACTION, shape=(action_horizon, max_action_dim)), + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_horizon, max_action_dim)), } # Normalize STATE and ACTION with min_max (SO100-like default) @@ -120,7 +124,7 @@ def make_groot_pre_post_processors( # Determine env action dimension from config (simple, object-like PolicyFeature) try: - env_action_dim = int(config.output_features["action"].shape[0]) + env_action_dim = int(config.output_features[ACTION].shape[0]) except Exception: env_action_dim = 0 @@ -268,9 +272,9 @@ class GrootPackInputsStep(ProcessorStep): return torch.where(mask, mapped, torch.zeros_like(mapped)) # 1) Video (B, T=1, V, H, W, C) uint8 - img_keys = sorted([k for k in obs if k.startswith("observation.images.")]) - if not img_keys and "observation.image" in obs: - img_keys = ["observation.image"] + img_keys = sorted([k for k in obs if k.startswith(OBS_IMAGES)]) + if not img_keys and OBS_IMAGE in obs: + img_keys = [OBS_IMAGE] if img_keys: cams = [_to_uint8_np_bhwc(obs[k]) for k in img_keys] video = np.stack(cams, axis=1) # (B, V, H, W, C) @@ -294,14 +298,14 @@ class GrootPackInputsStep(ProcessorStep): comp["language"] = lang # 3) State/state_mask -> (B, 1, max_state_dim) - if "observation.state" in obs: - state = obs["observation.state"] # (B, D) + if OBS_STATE in obs: + state = obs[OBS_STATE] # (B, D) if state.dim() != 2: raise ValueError(f"state must be (B, D), got {tuple(state.shape)}") bsz, d = state.shape # Normalize BEFORE padding if self.normalize_min_max: - state = _min_max_norm(state, "observation.state") + state = _min_max_norm(state, OBS_STATE) state = state.unsqueeze(1) # (B, 1, D) if d > self.max_state_dim: state = state[:, :, : self.max_state_dim] @@ -320,11 +324,11 @@ class GrootPackInputsStep(ProcessorStep): # Normalize BEFORE temporal expansion/padding if self.normalize_min_max: if action.dim() == 2: - action = _min_max_norm(action, "action") + action = _min_max_norm(action, ACTION) elif action.dim() == 3: b, t, d = action.shape flat = action.reshape(b * t, d) - flat = _min_max_norm(flat, "action") + flat = _min_max_norm(flat, ACTION) action = flat.view(b, t, d) if action.dim() == 2: action = action.unsqueeze(1).repeat(1, self.action_horizon, 1) @@ -590,7 +594,7 @@ class GrootActionUnpackUnnormalizeStep(ProcessorStep): # forward: y = 2 * (x - min) / denom - 1, with y=0 when denom==0 # inverse: x = (y+1)/2 * denom + min, and when denom==0 -> x = min if self.normalize_min_max and self.stats is not None: - stats_k = self.stats.get("action", {}) + stats_k = 
self.stats.get(ACTION, {}) d = action.shape[-1] min_v = torch.as_tensor( stats_k.get("min", torch.zeros(d)), dtype=action.dtype, device=action.device diff --git a/src/lerobot/policies/pi0/configuration_pi0.py b/src/lerobot/policies/pi0/configuration_pi0.py index 33753e0b2..a54ec5db0 100644 --- a/src/lerobot/policies/pi0/configuration_pi0.py +++ b/src/lerobot/policies/pi0/configuration_pi0.py @@ -20,8 +20,8 @@ from lerobot.configs.policies import PreTrainedConfig from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig -from lerobot.policies.rtc.configuration_rtc import RTCConfig -from lerobot.utils.constants import OBS_IMAGES +from lerobot.policies.rtc.configuration_rtc import RTCConfig, RTCTrainingConfig +from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE DEFAULT_IMAGE_SIZE = 224 @@ -50,8 +50,9 @@ class PI0Config(PreTrainedConfig): min_period: float = 4e-3 max_period: float = 4.0 - # Real-Time Chunking (RTC) configuration + # Real-Time Chunking (RTC) configurations rtc_config: RTCConfig | None = None + rtc_training_config: RTCTrainingConfig | None = None image_resolution: tuple[int, int] = ( DEFAULT_IMAGE_SIZE, @@ -76,6 +77,10 @@ class PI0Config(PreTrainedConfig): compile_mode: str = "max-autotune" # Torch compile mode device: str | None = None # Device to use for the model (None = auto-detect) + # Finetuning settings + freeze_vision_encoder: bool = False # Freeze only the vision encoder + train_expert_only: bool = False # Freeze entire VLM, train only action expert and projections + # Optimizer settings: see openpi `AdamW`` optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr` optimizer_betas: tuple[float, float] = (0.9, 0.95) @@ -120,19 +125,19 @@ class PI0Config(PreTrainedConfig): ) self.input_features[key] = empty_camera - if "observation.state" not in self.input_features: + if OBS_STATE not in self.input_features: state_feature = PolicyFeature( type=FeatureType.STATE, shape=(self.max_state_dim,), # Padded to max_state_dim ) - self.input_features["observation.state"] = state_feature + self.input_features[OBS_STATE] = state_feature - if "action" not in self.output_features: + if ACTION not in self.output_features: action_feature = PolicyFeature( type=FeatureType.ACTION, shape=(self.max_action_dim,), # Padded to max_action_dim ) - self.output_features["action"] = action_feature + self.output_features[ACTION] = action_feature def get_optimizer_preset(self) -> AdamWConfig: return AdamWConfig( diff --git a/src/lerobot/policies/pi0/modeling_pi0.py b/src/lerobot/policies/pi0/modeling_pi0.py index e4970dcf1..3bdda1ef9 100644 --- a/src/lerobot/policies/pi0/modeling_pi0.py +++ b/src/lerobot/policies/pi0/modeling_pi0.py @@ -44,6 +44,12 @@ from lerobot.configs.policies import PreTrainedConfig from lerobot.policies.pi0.configuration_pi0 import DEFAULT_IMAGE_SIZE, PI0Config from lerobot.policies.pretrained import PreTrainedPolicy, T from lerobot.policies.rtc.modeling_rtc import RTCProcessor +from lerobot.policies.rtc.training_time import ( + apply_rtc_training_time, + apply_training_time_rtc_inference, + masked_mean, + sample_rtc_delay, +) from lerobot.utils.constants import ( ACTION, OBS_LANGUAGE_ATTENTION_MASK, @@ -79,8 +85,8 @@ def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedd if dimension % 2 != 0: raise ValueError(f"dimension ({dimension}) must be divisible by 2") - if time.ndim != 1: - 
raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + if time.ndim not in (1, 2): + raise ValueError("The time tensor is expected to be of shape `(batch_size,)` or `(batch_size, T)`.") dtype = get_safe_dtype(torch.float64, device.type) fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) @@ -88,8 +94,14 @@ def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedd # Compute the outer product scaling_factor = 1.0 / period * 2 * math.pi - sin_input = scaling_factor[None, :] * time[:, None] - return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + if time.ndim == 1: + sin_input = scaling_factor[None, :] * time[:, None] + return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + + time_flat = time.reshape(-1) + sin_input = scaling_factor[None, :] * time_flat[:, None] + pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + return pos_emb.reshape(*time.shape, dimension) def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta` (exact copy) @@ -339,10 +351,14 @@ class PaliGemmaWithExpertModel( use_adarms=None, precision: Literal["bfloat16", "float32"] = "bfloat16", image_size: int = DEFAULT_IMAGE_SIZE, + freeze_vision_encoder: bool = False, + train_expert_only: bool = False, ): if use_adarms is None: use_adarms = [False, False] super().__init__() + self.freeze_vision_encoder = freeze_vision_encoder + self.train_expert_only = train_expert_only vlm_config_hf = CONFIG_MAPPING["paligemma"]() vlm_config_hf._vocab_size = 257152 # noqa: SLF001 @@ -383,6 +399,7 @@ class PaliGemmaWithExpertModel( self.gemma_expert.model.embed_tokens = None self.to_bfloat16_for_selected_params(precision) + self._set_requires_grad() def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"): if precision == "bfloat16": @@ -406,6 +423,23 @@ class PaliGemmaWithExpertModel( if any(selector in name for selector in params_to_keep_float32): param.data = param.data.to(dtype=torch.float32) + def _set_requires_grad(self): + if self.freeze_vision_encoder: + self.paligemma.vision_tower.eval() + for param in self.paligemma.vision_tower.parameters(): + param.requires_grad = False + if self.train_expert_only: + self.paligemma.eval() + for param in self.paligemma.parameters(): + param.requires_grad = False + + def train(self, mode: bool = True): + super().train(mode) + if self.freeze_vision_encoder: + self.paligemma.vision_tower.eval() + if self.train_expert_only: + self.paligemma.eval() + def embed_image(self, image: torch.Tensor): return self.paligemma.model.get_image_features(image) @@ -533,6 +567,8 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch` use_adarms=[False, False], precision=config.dtype, image_size=config.image_resolution[0], + freeze_vision_encoder=config.freeze_vision_encoder, + train_expert_only=config.train_expert_only, ) self.action_in_proj = nn.Linear(config.max_action_dim, action_expert_config.width) @@ -581,6 +617,9 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch` def _rtc_enabled(self): return self.config.rtc_config is not None and self.config.rtc_config.enabled + def _training_time_rtc_inference_enabled(self): + return self.config.rtc_training_config is not None and self.config.rtc_training_config.enabled + def _apply_checkpoint(self, func, *args, **kwargs): """Helper method to apply gradient checkpointing if enabled.""" if self.gradient_checkpointing_enabled and self.training: @@ -690,7 +729,10 @@ class 
PI0Pytorch(nn.Module): # see openpi `PI0Pytorch` action_emb = self._apply_checkpoint(action_proj_func, noisy_actions) - time_emb = time_emb[:, None, :].expand_as(action_emb) + if time_emb.dim() == 2: + time_emb = time_emb[:, None, :].expand_as(action_emb) + elif time_emb.shape[:2] != action_emb.shape[:2]: + raise ValueError(f"Expected time_emb shape {action_emb.shape[:2]}, got {time_emb.shape[:2]}") action_time_emb = torch.cat([action_emb, time_emb], dim=2) def mlp_func(action_time_emb): @@ -726,7 +768,12 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch` if time is None: time = self.sample_time(actions.shape[0], actions.device) - time_expanded = time[:, None, None] + if time.ndim == 1: + time_expanded = time[:, None, None] + elif time.ndim == 2: + time_expanded = time[:, :, None] + else: + raise ValueError(f"Expected time shape (B,) or (B, T), got {time.shape}") x_t = time_expanded * noise + (1 - time_expanded) * actions u_t = noise - actions @@ -822,24 +869,37 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch` dt = -1.0 / num_steps + inference_delay = kwargs.get("inference_delay") + prev_chunk_left_over = kwargs.get("prev_chunk_left_over") + execution_horizon = kwargs.get("execution_horizon") + use_training_time_rtc = self._training_time_rtc_inference_enabled() + x_t = noise for step in range(num_steps): time = 1.0 + step * dt - time_tensor = torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) - def denoise_step_partial_call(input_x_t, current_timestep=time_tensor): - return self.denoise_step( + if use_training_time_rtc: + x_t_cond, time_tensor = apply_training_time_rtc_inference( + x_t, time, inference_delay, prev_chunk_left_over, self.config.chunk_size + ) + v_t = self.denoise_step( state=state, prefix_pad_masks=prefix_pad_masks, past_key_values=past_key_values, - x_t=input_x_t, - timestep=current_timestep, + x_t=x_t_cond, + timestep=time_tensor, ) + elif self._rtc_enabled(): + time_tensor = torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) - if self._rtc_enabled(): - inference_delay = kwargs.get("inference_delay") - prev_chunk_left_over = kwargs.get("prev_chunk_left_over") - execution_horizon = kwargs.get("execution_horizon") + def denoise_step_partial_call(input_x_t, current_timestep=time_tensor): + return self.denoise_step( + state=state, + prefix_pad_masks=prefix_pad_masks, + past_key_values=past_key_values, + x_t=input_x_t, + timestep=current_timestep, + ) v_t = self.rtc_processor.denoise_step( x_t=x_t, @@ -850,7 +910,14 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch` execution_horizon=execution_horizon, ) else: - v_t = denoise_step_partial_call(x_t) + time_tensor = torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) + v_t = self.denoise_step( + state=state, + prefix_pad_masks=prefix_pad_masks, + past_key_values=past_key_values, + x_t=x_t, + timestep=time_tensor, + ) x_t = x_t + dt * v_t @@ -1253,7 +1320,19 @@ class PI0Policy(PreTrainedPolicy): actions = self.prepare_action(batch) # Compute loss - losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions) + postfix_mask = None + rtc_cfg = self.config.rtc_training_config + if rtc_cfg is not None and rtc_cfg.enabled and self.training: + batch_size = actions.shape[0] + time = self.model.sample_time(batch_size, actions.device) + noise = self.model.sample_noise(actions.shape, actions.device) + delay = sample_rtc_delay(rtc_cfg, batch_size, actions.device) + time, postfix_mask = apply_rtc_training_time(time, delay, actions.shape[1]) 
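+            # `time` now has shape (B, T): one flow-matching timestep per action step
+            # (forward() and the sinusoidal embedding both accept 2-D time above), and
+            # `postfix_mask` marks the steps whose loss is kept by masked_mean below.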
+ losses = self.model.forward( + images, img_masks, lang_tokens, lang_masks, state, actions, noise=noise, time=time + ) + else: + losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions) # Truncate losses to actual action dimensions original_action_dim = self.config.output_features[ACTION].shape[0] @@ -1265,11 +1344,22 @@ class PI0Policy(PreTrainedPolicy): if reduction == "none": # Return per-sample losses (B,) by averaging over time and action dims - per_sample_loss = losses.mean(dim=(1, 2)) + per_sample_loss = masked_mean(losses, postfix_mask, reduce_dims=(1, 2)) loss_dict["loss"] = per_sample_loss.mean().item() return per_sample_loss, loss_dict else: # Default: return scalar mean loss - loss = losses.mean() + loss = masked_mean(losses, postfix_mask, reduce_dims=(0, 1, 2)) loss_dict["loss"] = loss.item() return loss, loss_dict + + def _get_default_peft_targets(self) -> dict[str, any]: + """Return default PEFT target modules for PI0 fine-tuning.""" + common_projections = ( + "state_proj|action_in_proj|action_out_proj|action_time_mlp_in|action_time_mlp_out" + ) + target_modules = rf"(.*\.gemma_expert\..*\.self_attn\.(q|v)_proj|model\.({common_projections}))" + return { + "target_modules": target_modules, + "modules_to_save": [], + } diff --git a/src/lerobot/policies/pi05/configuration_pi05.py b/src/lerobot/policies/pi05/configuration_pi05.py index 7bdce70dd..f8be7c8bb 100644 --- a/src/lerobot/policies/pi05/configuration_pi05.py +++ b/src/lerobot/policies/pi05/configuration_pi05.py @@ -20,7 +20,8 @@ from lerobot.configs.policies import PreTrainedConfig from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig -from lerobot.policies.rtc.configuration_rtc import RTCConfig +from lerobot.policies.rtc.configuration_rtc import RTCConfig, RTCTrainingConfig +from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE DEFAULT_IMAGE_SIZE = 224 @@ -51,6 +52,7 @@ class PI05Config(PreTrainedConfig): # Real-Time Chunking (RTC) configuration rtc_config: RTCConfig | None = None + rtc_training_config: RTCTrainingConfig | None = None image_resolution: tuple[int, int] = ( DEFAULT_IMAGE_SIZE, @@ -76,6 +78,10 @@ class PI05Config(PreTrainedConfig): compile_mode: str = "max-autotune" # Torch compile mode device: str | None = None # Device to use for the model (None = auto-detect) + # Finetuning settings + freeze_vision_encoder: bool = False # Freeze only the vision encoder + train_expert_only: bool = False # Freeze entire VLM, train only action expert and projections + # Optimizer settings: see openpi `AdamW` optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr` optimizer_betas: tuple[float, float] = (0.9, 0.95) @@ -113,26 +119,26 @@ class PI05Config(PreTrainedConfig): def validate_features(self) -> None: """Validate and set up input/output features.""" for i in range(self.empty_cameras): - key = f"observation.images.empty_camera_{i}" + key = OBS_IMAGES + f".empty_camera_{i}" empty_camera = PolicyFeature( type=FeatureType.VISUAL, shape=(3, *self.image_resolution), # Use configured image resolution ) self.input_features[key] = empty_camera - if "observation.state" not in self.input_features: + if OBS_STATE not in self.input_features: state_feature = PolicyFeature( type=FeatureType.STATE, shape=(self.max_state_dim,), # Padded to max_state_dim ) - self.input_features["observation.state"] = state_feature + 
self.input_features[OBS_STATE] = state_feature - if "action" not in self.output_features: + if ACTION not in self.output_features: action_feature = PolicyFeature( type=FeatureType.ACTION, shape=(self.max_action_dim,), # Padded to max_action_dim ) - self.output_features["action"] = action_feature + self.output_features[ACTION] = action_feature def get_optimizer_preset(self) -> AdamWConfig: return AdamWConfig( diff --git a/src/lerobot/policies/pi05/modeling_pi05.py b/src/lerobot/policies/pi05/modeling_pi05.py index 2cd142042..cbca282c9 100644 --- a/src/lerobot/policies/pi05/modeling_pi05.py +++ b/src/lerobot/policies/pi05/modeling_pi05.py @@ -44,6 +44,12 @@ from lerobot.configs.policies import PreTrainedConfig from lerobot.policies.pi05.configuration_pi05 import DEFAULT_IMAGE_SIZE, PI05Config from lerobot.policies.pretrained import PreTrainedPolicy, T from lerobot.policies.rtc.modeling_rtc import RTCProcessor +from lerobot.policies.rtc.training_time import ( + apply_rtc_training_time, + apply_training_time_rtc_inference, + masked_mean, + sample_rtc_delay, +) from lerobot.utils.constants import ( ACTION, OBS_LANGUAGE_ATTENTION_MASK, @@ -78,8 +84,8 @@ def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedd if dimension % 2 != 0: raise ValueError(f"dimension ({dimension}) must be divisible by 2") - if time.ndim != 1: - raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + if time.ndim not in (1, 2): + raise ValueError("The time tensor is expected to be of shape `(batch_size,)` or `(batch_size, T)`.") dtype = get_safe_dtype(torch.float64, device.type) fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) @@ -87,8 +93,14 @@ def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedd # Compute the outer product scaling_factor = 1.0 / period * 2 * math.pi - sin_input = scaling_factor[None, :] * time[:, None] - return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + if time.ndim == 1: + sin_input = scaling_factor[None, :] * time[:, None] + return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + + time_flat = time.reshape(-1) + sin_input = scaling_factor[None, :] * time_flat[:, None] + pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + return pos_emb.reshape(*time.shape, dimension) def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta` (exact copy) @@ -337,10 +349,14 @@ class PaliGemmaWithExpertModel( use_adarms=None, precision: Literal["bfloat16", "float32"] = "bfloat16", image_size: int = DEFAULT_IMAGE_SIZE, + freeze_vision_encoder: bool = False, + train_expert_only: bool = False, ): if use_adarms is None: use_adarms = [False, False] super().__init__() + self.freeze_vision_encoder = freeze_vision_encoder + self.train_expert_only = train_expert_only vlm_config_hf = CONFIG_MAPPING["paligemma"]() vlm_config_hf._vocab_size = 257152 # noqa: SLF001 @@ -381,6 +397,7 @@ class PaliGemmaWithExpertModel( self.gemma_expert.model.embed_tokens = None self.to_bfloat16_for_selected_params(precision) + self._set_requires_grad() def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"): if precision == "bfloat16": @@ -404,6 +421,23 @@ class PaliGemmaWithExpertModel( if any(selector in name for selector in params_to_keep_float32): param.data = param.data.to(dtype=torch.float32) + def _set_requires_grad(self): + if self.freeze_vision_encoder: + self.paligemma.vision_tower.eval() + for param in 
self.paligemma.vision_tower.parameters(): + param.requires_grad = False + if self.train_expert_only: + self.paligemma.eval() + for param in self.paligemma.parameters(): + param.requires_grad = False + + def train(self, mode: bool = True): + super().train(mode) + if self.freeze_vision_encoder: + self.paligemma.vision_tower.eval() + if self.train_expert_only: + self.paligemma.eval() + def embed_image(self, image: torch.Tensor): return self.paligemma.model.get_image_features(image) @@ -531,6 +565,8 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch` use_adarms=[False, True], precision=config.dtype, image_size=config.image_resolution[0], + freeze_vision_encoder=config.freeze_vision_encoder, + train_expert_only=config.train_expert_only, ) self.action_in_proj = nn.Linear(config.max_action_dim, action_expert_config.width) @@ -578,6 +614,9 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch` def _rtc_enabled(self): return self.config.rtc_config is not None and self.config.rtc_config.enabled + def _training_time_rtc_inference_enabled(self): + return self.config.rtc_training_config is not None and self.config.rtc_training_config.enabled + def _apply_checkpoint(self, func, *args, **kwargs): """Helper method to apply gradient checkpointing if enabled.""" if self.gradient_checkpointing_enabled and self.training: @@ -705,7 +744,12 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch` if time is None: time = self.sample_time(actions.shape[0], actions.device) - time_expanded = time[:, None, None] + if time.ndim == 1: + time_expanded = time[:, None, None] + elif time.ndim == 2: + time_expanded = time[:, :, None] + else: + raise ValueError(f"Expected time shape (B,) or (B, T), got {time.shape}") x_t = time_expanded * noise + (1 - time_expanded) * actions u_t = noise - actions @@ -796,23 +840,35 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch` dt = -1.0 / num_steps + inference_delay = kwargs.get("inference_delay") + prev_chunk_left_over = kwargs.get("prev_chunk_left_over") + execution_horizon = kwargs.get("execution_horizon") + use_training_time_rtc = self._training_time_rtc_inference_enabled() + x_t = noise for step in range(num_steps): time = 1.0 + step * dt - time_tensor = torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) - def denoise_step_partial_call(input_x_t, current_timestep=time_tensor): - return self.denoise_step( + if use_training_time_rtc: + x_t_cond, time_tensor = apply_training_time_rtc_inference( + x_t, time, inference_delay, prev_chunk_left_over, self.config.chunk_size + ) + v_t = self.denoise_step( prefix_pad_masks=prefix_pad_masks, past_key_values=past_key_values, - x_t=input_x_t, - timestep=current_timestep, + x_t=x_t_cond, + timestep=time_tensor, ) + elif self._rtc_enabled(): + time_tensor = torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) - if self._rtc_enabled(): - inference_delay = kwargs.get("inference_delay") - prev_chunk_left_over = kwargs.get("prev_chunk_left_over") - execution_horizon = kwargs.get("execution_horizon") + def denoise_step_partial_call(input_x_t, current_timestep=time_tensor): + return self.denoise_step( + prefix_pad_masks=prefix_pad_masks, + past_key_values=past_key_values, + x_t=input_x_t, + timestep=current_timestep, + ) v_t = self.rtc_processor.denoise_step( x_t=x_t, @@ -823,7 +879,13 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch` execution_horizon=execution_horizon, ) else: - v_t = denoise_step_partial_call(x_t) + time_tensor = torch.tensor(time, dtype=torch.float32, 
device=device).expand(bsize) + v_t = self.denoise_step( + prefix_pad_masks=prefix_pad_masks, + past_key_values=past_key_values, + x_t=x_t, + timestep=time_tensor, + ) x_t = x_t + dt * v_t @@ -1226,7 +1288,17 @@ class PI05Policy(PreTrainedPolicy): actions = self.prepare_action(batch) # Compute loss (no separate state needed for PI05) - losses = self.model.forward(images, img_masks, tokens, masks, actions) + postfix_mask = None + rtc_cfg = self.config.rtc_training_config + if rtc_cfg is not None and rtc_cfg.enabled and self.training: + batch_size = actions.shape[0] + time = self.model.sample_time(batch_size, actions.device) + noise = self.model.sample_noise(actions.shape, actions.device) + delay = sample_rtc_delay(rtc_cfg, batch_size, actions.device) + time, postfix_mask = apply_rtc_training_time(time, delay, actions.shape[1]) + losses = self.model.forward(images, img_masks, tokens, masks, actions, noise=noise, time=time) + else: + losses = self.model.forward(images, img_masks, tokens, masks, actions) # Truncate losses to actual action dimensions original_action_dim = self.config.output_features[ACTION].shape[0] @@ -1238,11 +1310,22 @@ class PI05Policy(PreTrainedPolicy): if reduction == "none": # Return per-sample losses (B,) by averaging over time and action dims - per_sample_loss = losses.mean(dim=(1, 2)) + per_sample_loss = masked_mean(losses, postfix_mask, reduce_dims=(1, 2)) loss_dict["loss"] = per_sample_loss.mean().item() return per_sample_loss, loss_dict else: # Default: return scalar mean loss - loss = losses.mean() + loss = masked_mean(losses, postfix_mask, reduce_dims=(0, 1, 2)) loss_dict["loss"] = loss.item() return loss, loss_dict + + def _get_default_peft_targets(self) -> dict[str, any]: + """Return default PEFT target modules for PI0.5 fine-tuning.""" + common_projections = ( + "state_proj|action_in_proj|action_out_proj|action_time_mlp_in|action_time_mlp_out" + ) + target_modules = rf"(.*\.gemma_expert\..*\.self_attn\.(q|v)_proj|model\.({common_projections}))" + return { + "target_modules": target_modules, + "modules_to_save": [], + } diff --git a/src/lerobot/robots/so100_follower/__init__.py b/src/lerobot/policies/pi0_fast/__init__.py similarity index 63% rename from src/lerobot/robots/so100_follower/__init__.py rename to src/lerobot/policies/pi0_fast/__init__.py index 5dc43ac3b..a0277da0f 100644 --- a/src/lerobot/robots/so100_follower/__init__.py +++ b/src/lerobot/policies/pi0_fast/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,5 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
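`_get_default_peft_targets` returns a single regex rather than a list of module names. In `peft`, a string `target_modules` is matched with `re.fullmatch` against every submodule name, so the pattern above selects the expert's q/v projections plus the action/state projections. A small sketch of that matching with illustrative (not lerobot's exact) module names:

```python
import re

# Same shape as the default returned above; the module names below are made up.
target_modules = r"(.*\.gemma_expert\..*\.self_attn\.(q|v)_proj|model\.(state_proj|action_in_proj))"

names = [
    "model.paligemma_with_expert.gemma_expert.model.layers.0.self_attn.q_proj",
    "model.paligemma_with_expert.gemma_expert.model.layers.0.self_attn.k_proj",
    "model.state_proj",
]
for name in names:
    print(f"{name}: {'LoRA' if re.fullmatch(target_modules, name) else 'frozen'}")
# q_proj and state_proj match and get LoRA adapters; k_proj stays frozen.
```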
-from .config_so100_follower import SO100FollowerConfig -from .so100_follower import SO100Follower +from .configuration_pi0_fast import PI0FastConfig +from .modeling_pi0_fast import PI0FastPolicy +from .processor_pi0_fast import make_pi0_fast_pre_post_processors + +__all__ = ["PI0FastConfig", "PI0FastPolicy", "make_pi0_fast_pre_post_processors"] diff --git a/src/lerobot/policies/pi0_fast/configuration_pi0_fast.py b/src/lerobot/policies/pi0_fast/configuration_pi0_fast.py new file mode 100644 index 000000000..96137e91f --- /dev/null +++ b/src/lerobot/policies/pi0_fast/configuration_pi0_fast.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python + +# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from lerobot.configs.policies import PreTrainedConfig +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature +from lerobot.optim.optimizers import AdamWConfig +from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig +from lerobot.policies.rtc.configuration_rtc import RTCConfig +from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE + +DEFAULT_IMAGE_SIZE = 224 + + +@PreTrainedConfig.register_subclass("pi0_fast") +@dataclass +class PI0FastConfig(PreTrainedConfig): + paligemma_variant: str = "gemma_2b" + action_expert_variant: str = "gemma_300m" + dtype: str = "float32" # Options: "bfloat16", "float32" + + chunk_size: int = 50 # Number of action steps to predict, in openpi called "action_horizon" + n_action_steps: int = 50 # Number of action steps to execute + + # Shorter state and action vectors will be padded to these dimensions + max_state_dim: int = 32 + max_action_dim: int = 32 + max_action_tokens: int = 256 + + # Real-Time Chunking (RTC) configuration + rtc_config: RTCConfig | None = None + + image_resolution: tuple[int, int] = ( + DEFAULT_IMAGE_SIZE, + DEFAULT_IMAGE_SIZE, + ) # see openpi `preprocessing_pytorch.py` + + # Add empty images. Used to add empty cameras when no image features are present. 
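+    # (validate_features below turns each one into a placeholder VISUAL feature at
+    # the configured image resolution, so the model still sees image-shaped inputs.)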
+ empty_cameras: int = 0 + + tokenizer_max_length: int = 200 # see openpi `__post_init__` + text_tokenizer_name: str = "google/paligemma-3b-pt-224" + action_tokenizer_name: str = "physical-intelligence/fast" + temperature: float = 0.0 + max_decoding_steps: int = 256 + fast_skip_tokens: int = 128 + + # Whether to validate that decoded action tokens start with "Action: " prefix + validate_action_token_prefix: bool = True + + # Whether to use KV cache for faster autoregressive decoding + use_kv_cache: bool = True + + normalization_mapping: dict[str, NormalizationMode] = field( + default_factory=lambda: { + "VISUAL": NormalizationMode.IDENTITY, + "STATE": NormalizationMode.MEAN_STD, # Pi0Fast uses quantiles for state + "ACTION": NormalizationMode.MEAN_STD, # Pi0Fast uses quantiles for action + } + ) + + # Training settings + gradient_checkpointing: bool = False # Enable gradient checkpointing for memory optimization + compile_model: bool = False # Whether to use torch.compile for model optimization + compile_mode: str = "max-autotune" # Torch compile mode + device: str | None = None # Device to use for the model (None = auto-detect) + + # Optimizer settings: see openpi `AdamW` + optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr` + optimizer_betas: tuple[float, float] = (0.9, 0.95) + optimizer_eps: float = 1e-8 + optimizer_weight_decay: float = 0.01 + optimizer_grad_clip_norm: float = 1.0 + + # Scheduler settings: see openpi `CosineDecaySchedule` + # Note: These will auto-scale if --steps < scheduler_decay_steps + # For example, --steps=3000 will scale warmup to 100 and decay to 3000 + scheduler_warmup_steps: int = 1_000 + scheduler_decay_steps: int = 30_000 + scheduler_decay_lr: float = 2.5e-6 + + def __post_init__(self): + super().__post_init__() + + # Validate configuration + if self.n_action_steps > self.chunk_size: + raise ValueError( + f"n_action_steps ({self.n_action_steps}) cannot be greater than chunk_size ({self.chunk_size})" + ) + + if self.paligemma_variant not in ["gemma_300m", "gemma_2b"]: + raise ValueError(f"Invalid paligemma_variant: {self.paligemma_variant}") + + if self.dtype not in ["bfloat16", "float32"]: + raise ValueError(f"Invalid dtype: {self.dtype}") + + def validate_features(self) -> None: + """Validate and set up input/output features.""" + for i in range(self.empty_cameras): + key = OBS_IMAGES + f".empty_camera_{i}" + empty_camera = PolicyFeature( + type=FeatureType.VISUAL, + shape=(3, *self.image_resolution), # Use configured image resolution + ) + self.input_features[key] = empty_camera + + if OBS_STATE not in self.input_features: + state_feature = PolicyFeature( + type=FeatureType.STATE, + shape=(self.max_state_dim,), # Padded to max_state_dim + ) + self.input_features[OBS_STATE] = state_feature + + if ACTION not in self.output_features: + action_feature = PolicyFeature( + type=FeatureType.ACTION, + shape=(self.max_action_dim,), # Padded to max_action_dim + ) + self.output_features[ACTION] = action_feature + + def get_optimizer_preset(self) -> AdamWConfig: + return AdamWConfig( + lr=self.optimizer_lr, + betas=self.optimizer_betas, + eps=self.optimizer_eps, + weight_decay=self.optimizer_weight_decay, + grad_clip_norm=self.optimizer_grad_clip_norm, + ) + + def get_scheduler_preset(self): + return CosineDecayWithWarmupSchedulerConfig( + peak_lr=self.optimizer_lr, + decay_lr=self.scheduler_decay_lr, + num_warmup_steps=self.scheduler_warmup_steps, + num_decay_steps=self.scheduler_decay_steps, + ) + + @property + def 
observation_delta_indices(self) -> None: + return None + + @property + def action_delta_indices(self) -> list: + return list(range(self.chunk_size)) + + @property + def reward_delta_indices(self) -> None: + return None diff --git a/src/lerobot/policies/pi0_fast/modeling_pi0_fast.py b/src/lerobot/policies/pi0_fast/modeling_pi0_fast.py new file mode 100644 index 000000000..b4bc7ba22 --- /dev/null +++ b/src/lerobot/policies/pi0_fast/modeling_pi0_fast.py @@ -0,0 +1,1353 @@ +#!/usr/bin/env python + +# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import builtins +import logging +import math +from collections import deque +from pathlib import Path +from typing import TYPE_CHECKING, Literal, TypedDict + +import numpy as np +import torch +import torch.nn.functional as F # noqa: N812 +from torch import Tensor, nn +from typing_extensions import Unpack + +from lerobot.utils.import_utils import _scipy_available, _transformers_available + +# Conditional import for type checking and lazy loading +if TYPE_CHECKING or _scipy_available: + from scipy.fftpack import idct +else: + idct = None + +if TYPE_CHECKING or _transformers_available: + from transformers import AutoTokenizer + from transformers.models.auto import CONFIG_MAPPING + from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration +else: + CONFIG_MAPPING = None + PaliGemmaForConditionalGeneration = None + AutoTokenizer = None + +from lerobot.configs.policies import PreTrainedConfig +from lerobot.policies.pi0_fast.configuration_pi0_fast import PI0FastConfig +from lerobot.policies.pretrained import PreTrainedPolicy, T +from lerobot.policies.rtc.modeling_rtc import RTCProcessor +from lerobot.utils.constants import ( + ACTION, + ACTION_TOKEN_MASK, + ACTION_TOKENS, + OBS_LANGUAGE_ATTENTION_MASK, + OBS_LANGUAGE_TOKENS, + OPENPI_ATTENTION_MASK_VALUE, +) + + +class ActionSelectKwargs(TypedDict, total=False): + temperature: float | None + + +def pad_vector(vector, new_dim): + """Pad the last dimension of a vector to new_dim with zeros. + + Can be (batch_size x sequence_length x features_dimension) + or (batch_size x features_dimension) + """ + if vector.shape[-1] >= new_dim: + return vector + return F.pad(vector, (0, new_dim - vector.shape[-1])) + + +def resize_with_pad_torch( # see openpi `resize_with_pad_torch` (exact copy) + images: torch.Tensor, + height: int, + width: int, + mode: str = "bilinear", +) -> torch.Tensor: + """PyTorch version of resize_with_pad. Resizes an image to a target height and width without distortion + by padding with black. If the image is float32, it must be in the range [-1, 1]. + + Args: + images: Tensor of shape [*b, h, w, c] or [*b, c, h, w] + height: Target height + width: Target width + mode: Interpolation mode ('bilinear', 'nearest', etc.) 
+ + Returns: + Resized and padded tensor with same shape format as input + """ + # Check if input is in channels-last format [*b, h, w, c] or channels-first [*b, c, h, w] + if images.shape[-1] <= 4: # Assume channels-last format + channels_last = True + if images.dim() == 3: + images = images.unsqueeze(0) # Add batch dimension + images = images.permute(0, 3, 1, 2) # [b, h, w, c] -> [b, c, h, w] + else: + channels_last = False + if images.dim() == 3: + images = images.unsqueeze(0) # Add batch dimension + + batch_size, channels, cur_height, cur_width = images.shape + + # Calculate resize ratio + ratio = max(cur_width / width, cur_height / height) + resized_height = int(cur_height / ratio) + resized_width = int(cur_width / ratio) + + # Resize + resized_images = F.interpolate( + images, + size=(resized_height, resized_width), + mode=mode, + align_corners=False if mode == "bilinear" else None, + ) + + # Handle dtype-specific clipping + if images.dtype == torch.uint8: + resized_images = torch.round(resized_images).clamp(0, 255).to(torch.uint8) + elif images.dtype == torch.float32: + resized_images = resized_images.clamp(-1.0, 1.0) + else: + raise ValueError(f"Unsupported image dtype: {images.dtype}") + + # Calculate padding + pad_h0, remainder_h = divmod(height - resized_height, 2) + pad_h1 = pad_h0 + remainder_h + pad_w0, remainder_w = divmod(width - resized_width, 2) + pad_w1 = pad_w0 + remainder_w + + # Pad + constant_value = 0 if images.dtype == torch.uint8 else -1.0 + padded_images = F.pad( + resized_images, + (pad_w0, pad_w1, pad_h0, pad_h1), # left, right, top, bottom + mode="constant", + value=constant_value, + ) + + # Convert back to original format if needed + if channels_last: + padded_images = padded_images.permute(0, 2, 3, 1) # [b, c, h, w] -> [b, h, w, c] + + return padded_images + + +class GemmaConfig: # see openpi `gemma.py: Config` + """Configuration for Gemma model variants.""" + + def __init__(self, width, depth, mlp_dim, num_heads, num_kv_heads, head_dim): + self.width = width + self.depth = depth + self.mlp_dim = mlp_dim + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + + +def get_gemma_config(variant: str) -> GemmaConfig: # see openpi `gemma.py: get_config` + """Returns config for specified gemma variant.""" + if variant == "gemma_300m": + return GemmaConfig( + width=1024, + depth=18, + mlp_dim=4096, + num_heads=8, + num_kv_heads=1, + head_dim=256, + ) + elif variant == "gemma_2b": + return GemmaConfig( + width=2048, + depth=18, + mlp_dim=16_384, + num_heads=8, + num_kv_heads=1, + head_dim=256, + ) + else: + raise ValueError(f"Unknown variant: {variant}") + + +class PI0FastPaliGemma(nn.Module): + """PaliGemma model for PI0Fast""" + + def __init__( + self, + vlm_config, + use_adarms=None, + precision: Literal["bfloat16", "float32"] = "bfloat16", + ): + if use_adarms is None: + use_adarms = [False, False] + super().__init__() + + vlm_config_hf = CONFIG_MAPPING["paligemma"]() + vlm_config_hf._vocab_size = 257152 # noqa: SLF001 + vlm_config_hf.image_token_index = 257152 + vlm_config_hf.text_config.hidden_size = vlm_config.width + vlm_config_hf.text_config.intermediate_size = vlm_config.mlp_dim + vlm_config_hf.text_config.num_attention_heads = vlm_config.num_heads + vlm_config_hf.text_config.head_dim = vlm_config.head_dim + vlm_config_hf.text_config.num_hidden_layers = vlm_config.depth + vlm_config_hf.text_config.num_key_value_heads = vlm_config.num_kv_heads + vlm_config_hf.text_config.hidden_activation = "gelu_pytorch_tanh" + 
vlm_config_hf.text_config.torch_dtype = "float32" + vlm_config_hf.text_config.vocab_size = 257152 + vlm_config_hf.text_config.use_adarms = use_adarms[0] + vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None + vlm_config_hf.vision_config.intermediate_size = 4304 + vlm_config_hf.vision_config.projection_dim = 2048 + vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast" + vlm_config_hf.vision_config.torch_dtype = "float32" + + self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf) + + self.to_bfloat16_for_selected_params(precision) + + def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"): + if precision == "bfloat16": + self.to(dtype=torch.bfloat16) + elif precision == "float32": + self.to(dtype=torch.float32) + return + else: + raise ValueError(f"Invalid precision: {precision}") + + params_to_keep_float32 = [ + "vision_tower.vision_model.embeddings.patch_embedding.weight", + "vision_tower.vision_model.embeddings.patch_embedding.bias", + "vision_tower.vision_model.embeddings.position_embedding.weight", + "input_layernorm", + "post_attention_layernorm", + "model.norm", + ] + + for name, param in self.named_parameters(): + if any(selector in name for selector in params_to_keep_float32): + param.data = param.data.to(dtype=torch.float32) + + def embed_image(self, image: torch.Tensor): + return self.paligemma.model.get_image_features(image) + + def embed_language_tokens(self, tokens: torch.Tensor): + return self.paligemma.language_model.embed_tokens(tokens) + + def forward( + self, + attention_mask: torch.Tensor | None = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, + inputs_embeds: list[torch.FloatTensor] | None = None, + use_cache: bool | None = None, + adarms_cond: list[torch.Tensor] | None = None, + ): + if adarms_cond is None: + adarms_cond = [None, None] + if inputs_embeds[1] is None: + prefix_output = self.paligemma.language_model.forward( + inputs_embeds=inputs_embeds[0], + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + adarms_cond=adarms_cond[0] if adarms_cond is not None else None, + ) + prefix_past_key_values = prefix_output.past_key_values + # prefix_output to be used for the language head + # shape: [batch_size, seq_len, hidden_size] with hidden_size = 2048 + prefix_output = prefix_output.last_hidden_state + suffix_output = None + return [prefix_output, suffix_output], prefix_past_key_values + + +class PI0FastPytorch(nn.Module): # see openpi `PI0Pytorch` + """Core PI0Fast PyTorch model.""" + + def __init__( + self, + config: PI0FastConfig, + rtc_processor: RTCProcessor | None = None, + paligemma_tokenizer: "AutoTokenizer | None" = None, + ): + super().__init__() + self.config = config + self.rtc_processor = rtc_processor + self._paligemma_tokenizer = paligemma_tokenizer + + paligemma_config = get_gemma_config(config.paligemma_variant) + + self.paligemma_with_expert = PI0FastPaliGemma( + paligemma_config, + use_adarms=[False, True], + precision=config.dtype, + ) + + # Initialize gradient checkpointing flag + self.gradient_checkpointing_enabled = False + + # Compile model if requested + if config.compile_model: + torch.set_float32_matmul_precision("high") + self.sample_actions_fast = torch.compile(self.sample_actions_fast, mode=config.compile_mode) + self.forward = torch.compile(self.forward, mode=config.compile_mode) + + msg = """An incorrect 
transformer version is used, please create an issue on https://github.com/huggingface/lerobot/issues""" + + try: + from transformers.models.siglip import check + + if not check.check_whether_transformers_replace_is_installed_correctly(): + raise ValueError(msg) + except ImportError: + raise ValueError(msg) from None + + def gradient_checkpointing_enable(self): + """Enable gradient checkpointing for memory optimization.""" + self.gradient_checkpointing_enabled = True + # Call the proper gradient_checkpointing_enable() method with use_reentrant=False for better memory efficiency + self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing_enable( + gradient_checkpointing_kwargs={"use_reentrant": False} + ) + self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing_enable( + gradient_checkpointing_kwargs={"use_reentrant": False} + ) + logging.info("Enabled gradient checkpointing for PI0FastPytorch model") + + def gradient_checkpointing_disable(self): + """Disable gradient checkpointing.""" + self.gradient_checkpointing_enabled = False + # Call the proper gradient_checkpointing_disable() method + self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing_disable() + self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing_disable() + logging.info("Disabled gradient checkpointing for PI0FastPytorch model") + + def _apply_checkpoint(self, func, *args, **kwargs): + """Helper method to apply gradient checkpointing if enabled.""" + if self.gradient_checkpointing_enabled and self.training: + return torch.utils.checkpoint.checkpoint( + func, *args, use_reentrant=False, preserve_rng_state=False, **kwargs + ) + return func(*args, **kwargs) + + def _prepare_attention_masks_4d(self, att_2d_masks, dtype=None): + """Helper method to prepare 4D attention masks for transformer.""" + att_2d_masks_4d = att_2d_masks[:, None, :, :] + result = torch.where(att_2d_masks_4d, 0.0, OPENPI_ATTENTION_MASK_VALUE) + if dtype is not None: + result = result.to(dtype=dtype) + return result + + def embed_prefix_fast( + self, + images, + img_masks, + tokens, + masks, + fast_action_tokens=None, + fast_action_masks=None, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: + """Embed images, language tokens, and FAST action tokens. 
+ + Attention pattern: + - Images + Language: bidirectional among themselves + - FAST: attend to images + language, causal among themselves + + Args: + images: List of image tensors + img_masks: List of image masks + tokens: Language instruction tokens + masks: Attention masks for tokens + fast_action_tokens: FAST action tokens (discrete token IDs) + fast_action_masks: Padding masks for FAST action tokens + + Returns: + embs: Concatenated embeddings [images, tokens, fast_action_tokens] + pad_masks: Padding masks + att_masks: 2D attention mask + total_T_images: Total number of image tokens + num_fast_embs: Number of FAST action token embeddings + """ + embs = [] + pad_masks = [] + att_mask_segments = [] + total_t_images = 0 + num_fast_embs = 0 + + # Process images + for img, img_mask in zip(images, img_masks, strict=True): + + def image_embed_func(img): + return self.paligemma_with_expert.embed_image(img) + + img_emb = self._apply_checkpoint(image_embed_func, img) + bsize, num_img_embs = img_emb.shape[:2] + + embs.append(img_emb) + pad_masks.append(img_mask[:, None].expand(bsize, num_img_embs)) + att_mask_segments.append(("image", num_img_embs)) + total_t_images += num_img_embs + + # Process language instruction tokens + def lang_embed_func(tokens): + lang_emb = self.paligemma_with_expert.embed_language_tokens(tokens) + lang_emb_dim = lang_emb.shape[-1] + return lang_emb * math.sqrt(lang_emb_dim) + + lang_emb = self._apply_checkpoint(lang_embed_func, tokens) + embs.append(lang_emb) + pad_masks.append(masks) + + num_lang_embs = lang_emb.shape[1] + att_mask_segments.append(("language", num_lang_embs)) + + # Process FAST action tokens (discrete token IDs) + if fast_action_tokens is not None: + + def fast_action_embed_func(fast_action_tokens): + fast_emb = self.paligemma_with_expert.embed_language_tokens(fast_action_tokens) + fast_emb_dim = fast_emb.shape[-1] + return fast_emb * math.sqrt(fast_emb_dim) + + fast_action_emb = self._apply_checkpoint(fast_action_embed_func, fast_action_tokens) + embs.append(fast_action_emb) + + num_fast_embs = fast_action_tokens.shape[1] + pad_masks.append(fast_action_masks) + att_mask_segments.append(("fast", num_fast_embs)) + + embs = torch.cat(embs, dim=1) + pad_masks = torch.cat(pad_masks, dim=1) + + # Create custom 2D attention mask: + # - Images + Language: bidirectional among themselves + # - FAST: attend to images + language, causal among themselves + att_masks = self._create_custom_attention_mask_fast(att_mask_segments, pad_masks, bsize) + + return embs, pad_masks, att_masks, total_t_images, num_fast_embs + + def _create_custom_attention_mask_fast(self, att_mask_segments, pad_masks, bsize): + """Create custom 2D attention mask. 
+ + Attention rules: + - Images + Language: bidirectional among themselves + - FAST: attend to images + language, causal among themselves + """ + total_len = sum(length for _, length in att_mask_segments) + device = pad_masks.device + + att_2d_masks = torch.zeros(bsize, total_len, total_len, dtype=torch.bool, device=device) + + positions = [] + current_pos = 0 + for seg_type, seg_len in att_mask_segments: + positions.append((seg_type, current_pos, current_pos + seg_len)) + current_pos += seg_len + + for _i, (query_type, query_start, query_end) in enumerate(positions): + for _j, (key_type, key_start, key_end) in enumerate(positions): + # Images and Language can attend to each other bidirectionally + if ( + query_type in ["image", "language"] + and key_type in ["image", "language"] + or query_type == "fast" + and key_type in ["image", "language"] + ): + att_2d_masks[:, query_start:query_end, key_start:key_end] = True + + # FAST tokens attend causally to themselves + elif query_type == "fast" and key_type == "fast": + fast_len = query_end - query_start + causal_mask = torch.tril(torch.ones(fast_len, fast_len, dtype=torch.bool, device=device)) + att_2d_masks[:, query_start:query_end, key_start:key_end] = causal_mask[None, :, :] + + # Apply padding masks + pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] + att_2d_masks = att_2d_masks & pad_2d_masks + + return att_2d_masks + + def forward( + self, + images, + img_masks, + tokens, + masks, + fast_action_tokens, + fast_action_masks, + ) -> dict: + """Forward pass for PI0Fast. + + This implements the Pi0FAST training objective: predict next action token + using cross-entropy loss. + + Args: + images: List of image tensors + img_masks: List of image masks + tokens: Language instruction tokens + masks: Attention masks for tokens + fast_action_tokens: Discrete action token IDs [B, max_action_tokens] + fast_action_masks: Padding masks for fast action tokens [B, max_action_tokens] + + Returns: + Dictionary with 'fast_loss' and 'loss' keys + """ + if fast_action_tokens is None or fast_action_masks is None: + raise ValueError("fast_action_tokens and fast_action_masks are required for FAST-only mode") + + # Embed prefix with FAST tokens + prefix_embs, prefix_pad_masks, prefix_att_masks, total_t_images, num_fast_embs = ( + self.embed_prefix_fast( + images, + img_masks, + tokens, + masks, + fast_action_tokens=fast_action_tokens, + fast_action_masks=fast_action_masks, + ) + ) + + # Convert embeddings to bfloat16 if needed + if ( + self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype + == torch.bfloat16 + ): + prefix_embs = prefix_embs.to(dtype=torch.bfloat16) + + # for next-token prediction, input tokens [0:T-1] to predict tokens [1:T] + input_embs = prefix_embs + input_pad_masks = prefix_pad_masks + input_att_masks = prefix_att_masks + + position_ids = torch.cumsum(input_pad_masks, dim=1) - 1 + att_2d_4d = self._prepare_attention_masks_4d(input_att_masks, dtype=input_embs.dtype) + + # forward pass through paligemma (language model) + (prefix_out, _), _ = self.paligemma_with_expert.forward( + attention_mask=att_2d_4d, + position_ids=position_ids, + past_key_values=None, + inputs_embeds=[input_embs, None], # No suffix/action expert + use_cache=False, + adarms_cond=[None, None], + ) + + # Get logits for FAST action tokens using the FAST LM head + # only compute logits for the positions that predict FAST tokens + lm_head = self.paligemma_with_expert.paligemma.lm_head + + # Targets are the FAST action tokens + 
fast_targets = fast_action_tokens # (B, num_fast_embs) + + # extract logits for FAST token prediction + fast_hidden = prefix_out[:, -fast_targets.shape[1] :, :] + fast_logits_for_pred = lm_head(fast_hidden) # (B, num_fast_embs, gemma_vocab_size) + + # Shift left for next-step prediction and shift target + # logits[:, i] predicts targets[:, i+1] + fast_logits_for_pred = fast_logits_for_pred[:, :-1, :] # shift logits left + fast_targets = fast_targets[:, 1:] # shift targets right + fast_action_masks = fast_action_masks[:, 1:] # shift masks to match targets + + # compute cross-entropy loss + loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + fast_logits_flat = fast_logits_for_pred.reshape(-1, fast_logits_for_pred.size(-1)) + fast_targets_flat = fast_targets.reshape(-1) + + fast_loss_per_token = loss_fct(fast_logits_flat, fast_targets_flat) + fast_loss_per_token = fast_loss_per_token.reshape(fast_targets.shape) + + # apply mask and compute mean loss + masked_fast_loss = fast_loss_per_token * fast_action_masks.float() + fast_loss = masked_fast_loss.sum() / fast_action_masks.sum().clamp(min=1) + + return { + "ce_loss": fast_loss, + "loss": fast_loss, + } + + @torch.no_grad() + def sample_actions_fast( + self, + images, + img_masks, + tokens, + masks, + max_decoding_steps=None, + temperature=0.0, + ) -> torch.Tensor: + """ + Inefficient but safe autoregressive decoding for FAST tokens. + Matches the pattern of _generate_subtask_tokens. + TODO: jadechoghari, should we move this logic to PI0FastPolicy class? + """ + if max_decoding_steps is None: + max_decoding_steps = self.config.max_action_tokens + + bsize = tokens.shape[0] + device = tokens.device + lm_head = self.paligemma_with_expert.paligemma.lm_head + + # add bos token after tokens + bos_token = torch.full( + (bsize, 1), self._paligemma_tokenizer.bos_token_id, dtype=torch.long, device=device + ) + tokens = torch.cat([tokens, bos_token], dim=1) + masks = torch.cat([masks, torch.ones((bsize, 1), dtype=torch.bool, device=device)], dim=1) + + # 1. Initial Embedding (matches training prefix) + # prefix_embs will include [Images, Language Prompt, BOS] + prefix_embs, prefix_pad_masks, prefix_att_masks, total_t_images, _ = self.embed_prefix_fast( + images, img_masks, tokens, masks, fast_action_tokens=None, fast_action_masks=None + ) + + if ( + self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype + == torch.bfloat16 + ): + prefix_embs = prefix_embs.to(dtype=torch.bfloat16) + + generated_action_tokens = torch.zeros((bsize, max_decoding_steps), dtype=torch.long, device=device) + + # 2. 
Decoding Loop (each step re-computes full sequence) + for t in range(max_decoding_steps): + # always re-calculate position IDs from the current pad mask + position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 + att_4d = self._prepare_attention_masks_4d(prefix_att_masks, dtype=prefix_embs.dtype) + + # full forward pass (no kv cache) + (prefix_out, _), _ = self.paligemma_with_expert.forward( + attention_mask=att_4d, + position_ids=position_ids, + past_key_values=None, + inputs_embeds=[prefix_embs, None], + use_cache=False, + adarms_cond=[None, None], + ) + + # predict next token from the very last sequence position + last_logits = lm_head(prefix_out[:, -1:, :]) # (B, 1, vocab_size) + + if temperature > 0: + probs = torch.softmax(last_logits[:, -1] / temperature, dim=-1) + next_token = torch.multinomial(probs, num_samples=1) + else: + next_token = torch.argmax(last_logits[:, -1], dim=-1, keepdim=True) + + generated_action_tokens[:, t] = next_token.squeeze(-1) + + # 3. Update sequence for next iteration (unless it's the last step) + if t < max_decoding_steps - 1: + # embed the newly generated token + next_token_emb = self.paligemma_with_expert.embed_language_tokens(next_token) + next_token_emb = next_token_emb * math.sqrt(next_token_emb.shape[-1]) + if prefix_embs.dtype == torch.bfloat16: + next_token_emb = next_token_emb.to(dtype=torch.bfloat16) + + # append to embeddings + prefix_embs = torch.cat([prefix_embs, next_token_emb], dim=1) + + # update padding mask (new token is always valid/1) + prefix_pad_masks = torch.cat( + [prefix_pad_masks, torch.ones((bsize, 1), dtype=torch.bool, device=device)], dim=1 + ) + + # update 2d attention mask: grow the matrix + old_len = prefix_att_masks.shape[1] + new_len = old_len + 1 + new_att_masks = torch.zeros((bsize, new_len, new_len), dtype=torch.bool, device=device) + new_att_masks[:, :old_len, :old_len] = prefix_att_masks + # new token attends to all non-padding tokens in the updated sequence + new_att_masks[:, -1, :] = prefix_pad_masks + prefix_att_masks = new_att_masks + return generated_action_tokens + + @torch.no_grad() + def sample_actions_fast_kv_cache( + self, + images, + img_masks, + tokens, + masks, + max_decoding_steps=None, + temperature=0.0, + ) -> torch.Tensor: + """ + Optimized autoregressive decoding for FAST tokens using KV Caching. + """ + if max_decoding_steps is None: + max_decoding_steps = self.config.max_action_tokens + + bsize = tokens.shape[0] + device = tokens.device + lm_head = self.paligemma_with_expert.paligemma.lm_head + + # --- 1. PREFILL PHASE --- + # Process Images + Text Prompt + BOS token once to populate the KV cache. 
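+        # Cost sketch (assuming a prefix of length L and T decoding steps): the
+        # prefill attends over the full prefix once, after which each decode step
+        # feeds a single token that attends to cached keys/values, roughly O(L + t)
+        # work at step t instead of the O((L + t)^2) full re-encode performed by
+        # sample_actions_fast above.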
+ + # Add BOS token to the prompt + bos_token = torch.full( + (bsize, 1), self._paligemma_tokenizer.bos_token_id, dtype=torch.long, device=device + ) + tokens_in = torch.cat([tokens, bos_token], dim=1) + masks_in = torch.cat([masks, torch.ones((bsize, 1), dtype=torch.bool, device=device)], dim=1) + + # Embed prefix [Images, Language, BOS] + # fast_action_tokens=None means we are just embedding the condition (images+text) + prefix_embs, prefix_pad_masks, prefix_att_masks, total_t_images, _ = self.embed_prefix_fast( + images, img_masks, tokens_in, masks_in, fast_action_tokens=None, fast_action_masks=None + ) + + # Ensure correct precision (bfloat16/float32) + if ( + self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype + == torch.bfloat16 + ): + prefix_embs = prefix_embs.to(dtype=torch.bfloat16) + + # Create position IDs (cumsum of mask - 1) + position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 + + # Create 4D mask for the prefix + att_4d = self._prepare_attention_masks_4d(prefix_att_masks, dtype=prefix_embs.dtype) + + # Forward pass (Prefill) with use_cache=True + # We only pass [prefix_embs, None] because we aren't using the suffix (expert) model yet + (prefix_out, _), past_key_values = self.paligemma_with_expert.forward( + attention_mask=att_4d, + position_ids=position_ids, + past_key_values=None, + inputs_embeds=[prefix_embs, None], + use_cache=True, # Enable caching + adarms_cond=[None, None], + ) + + # Sample the first action token from the last logit of the prefix + last_logits = lm_head(prefix_out[:, -1:, :]) # (B, 1, V) + if temperature > 0: + probs = torch.softmax(last_logits[:, -1] / temperature, dim=-1) + next_token = torch.multinomial(probs, num_samples=1) + else: + next_token = torch.argmax(last_logits[:, -1], dim=-1, keepdim=True) + + # Initialize storage for generated tokens + generated_action_tokens = torch.zeros((bsize, max_decoding_steps), dtype=torch.long, device=device) + generated_action_tokens[:, 0] = next_token.squeeze(-1) + + # Track valid tokens mask (0 for pad, 1 for valid) + # We need this to tell the new token what it can attend to (images + text + past actions) + current_pad_mask = prefix_pad_masks + + # --- 2. DECODING PHASE --- + # Generate remaining tokens one by one using the cache. + + for t in range(1, max_decoding_steps): + # Embed the single previous token + # We use embed_language_tokens directly to avoid overhead of full prefix embedding + next_token_emb = self.paligemma_with_expert.embed_language_tokens(next_token) + next_token_emb = next_token_emb * math.sqrt(next_token_emb.shape[-1]) + if prefix_embs.dtype == torch.bfloat16: + next_token_emb = next_token_emb.to(dtype=torch.bfloat16) + + # Update Pad Mask: append 1s for the new valid token + new_column = torch.ones((bsize, 1), dtype=torch.bool, device=device) + current_pad_mask = torch.cat([current_pad_mask, new_column], dim=1) + + # Update Position IDs for the single new token + current_position_ids = (torch.sum(current_pad_mask, dim=1, keepdim=True) - 1).long() + + # Create Attention Mask for the single new step + # The new token attends to all valid tokens in history (captured by current_pad_mask). + # Shape becomes (B, 1, 1, Total_Len) which works with HF's cache logic. 
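+            # For example, with a prefix of length P, at decode step t the pad mask
+            # covers P + t positions, so the mask built below has shape
+            # (B, 1, 1, P + t): a single query row for the new token.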
+ step_att_mask = self._prepare_attention_masks_4d( + current_pad_mask.unsqueeze(1), dtype=next_token_emb.dtype + ) + + # Forward pass (Decoding step) + # input_embeds is just the new token (B, 1, D) + (step_out, _), past_key_values = self.paligemma_with_expert.forward( + attention_mask=step_att_mask, + position_ids=current_position_ids, + past_key_values=past_key_values, # Pass updated cache + inputs_embeds=[next_token_emb, None], + use_cache=True, + adarms_cond=[None, None], + ) + + # Sample next token + last_logits = lm_head(step_out[:, -1:, :]) + if temperature > 0: + probs = torch.softmax(last_logits[:, -1] / temperature, dim=-1) + next_token = torch.multinomial(probs, num_samples=1) + else: + next_token = torch.argmax(last_logits[:, -1], dim=-1, keepdim=True) + + generated_action_tokens[:, t] = next_token.squeeze(-1) + + return generated_action_tokens + + +class PI0FastPolicy(PreTrainedPolicy): + """PI0Fast Policy for LeRobot.""" + + config_class = PI0FastConfig + name = "pi0_fast" + + def __init__( + self, + config: PI0FastConfig, + **kwargs, + ): + """ + Args: + config: Policy configuration class instance. + """ + super().__init__(config) + config.validate_features() + self.config = config + + # Load tokenizers first + try: + from transformers import AutoProcessor, AutoTokenizer + + # Load FAST tokenizer + self.action_tokenizer = AutoProcessor.from_pretrained( + config.action_tokenizer_name, trust_remote_code=True + ) + + # Load PaliGemma tokenizer for token conversion + self._paligemma_tokenizer = AutoTokenizer.from_pretrained( + config.text_tokenizer_name, trust_remote_code=True, add_eos_token=True, add_bos_token=False + ) + + logging.info("Loaded FAST tokenizer for action detokenization") + except Exception as e: + logging.error(f"Failed to load FAST tokenizer for action detokenization: {e}") + logging.error("Tokenizer loading is required for proper policy initialization; aborting.") + raise RuntimeError("Failed to load required tokenizers for PI0FastPolicy initialization") from e + + # Initialize the core PI0Fast model + self.init_rtc_processor() + self.model = PI0FastPytorch( + config, rtc_processor=self.rtc_processor, paligemma_tokenizer=self._paligemma_tokenizer + ) + + # Enable gradient checkpointing if requested + if config.gradient_checkpointing: + self.model.gradient_checkpointing_enable() + + self.model.to(config.device) + + self.reset() + + @classmethod + def from_pretrained( + cls: builtins.type[T], + pretrained_name_or_path: str | Path, + *, + config: PreTrainedConfig | None = None, + force_download: bool = False, + resume_download: bool | None = None, + proxies: dict | None = None, + token: str | bool | None = None, + cache_dir: str | Path | None = None, + local_files_only: bool = False, + revision: str | None = None, + strict: bool = True, + **kwargs, + ) -> T: + """Override the from_pretrained method to handle key remapping and display important disclaimer.""" + print( + "The PI0Fast model is a direct port of the OpenPI implementation. \n" + "This implementation follows the original OpenPI structure for compatibility. 
\n" + "Original implementation: https://github.com/Physical-Intelligence/openpi" + ) + if pretrained_name_or_path is None: + raise ValueError("pretrained_name_or_path is required") + + # Use provided config if available, otherwise create default config + if config is None: + config = PreTrainedConfig.from_pretrained( + pretrained_name_or_path=pretrained_name_or_path, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + revision=revision, + **kwargs, + ) + + # Initialize model without loading weights + # Check if dataset_stats were provided in kwargs + model = cls(config, **kwargs) + + # Now manually load and remap the state dict + try: + # Try to load the pytorch_model.bin or model.safetensors file + print(f"Loading model from: {pretrained_name_or_path}") + try: + from transformers.utils import cached_file + + # Try safetensors first + resolved_file = cached_file( + pretrained_name_or_path, + "model.safetensors", + cache_dir=kwargs.get("cache_dir"), + force_download=kwargs.get("force_download", False), + resume_download=kwargs.get("resume_download"), + proxies=kwargs.get("proxies"), + use_auth_token=kwargs.get("use_auth_token"), + revision=kwargs.get("revision"), + local_files_only=kwargs.get("local_files_only", False), + ) + from safetensors.torch import load_file + + original_state_dict = load_file(resolved_file) + print("✓ Loaded state dict from model.safetensors") + except Exception as e: + print(f"Could not load state dict from remote files: {e}") + print("Returning model without loading pretrained weights") + return model + + # First, fix any key differences # see openpi `model.py, _fix_pytorch_state_dict_keys` + fixed_state_dict = model._fix_pytorch_state_dict_keys(original_state_dict, model.config) + # Then add "model." prefix for all keys that don't already have it + remapped_state_dict = {} + remap_count = 0 + + for key, value in fixed_state_dict.items(): + if not key.startswith("model."): + new_key = f"model.{key}" + remapped_state_dict[new_key] = value + remap_count += 1 + if remap_count <= 10: # Only print first 10 to avoid spam + print(f"Remapped: {key} -> {new_key}") + else: + remapped_state_dict[key] = value + + if remap_count > 0: + print(f"Remapped {remap_count} state dict keys") + + # Load the remapped state dict into the model + missing_keys, unexpected_keys = model.load_state_dict(remapped_state_dict, strict=strict) + + if missing_keys: + print(f"Missing keys when loading state dict: {len(missing_keys)} keys") + if len(missing_keys) <= 5: + for key in missing_keys: + print(f" - {key}") + else: + for key in missing_keys[:5]: + print(f" - {key}") + print(f" ... and {len(missing_keys) - 5} more") + + if unexpected_keys: + print(f"Unexpected keys when loading state dict: {len(unexpected_keys)} keys") + if len(unexpected_keys) <= 5: + for key in unexpected_keys: + print(f" - {key}") + else: + for key in unexpected_keys[:5]: + print(f" - {key}") + print(f" ... 
and {len(unexpected_keys) - 5} more") + + if not missing_keys and not unexpected_keys: + print("All keys loaded successfully!") + + except Exception as e: + print(f"Warning: Could not remap state dict keys: {e}") + + return model + + def _fix_pytorch_state_dict_keys( + self, state_dict, model_config + ): # see openpi `BaseModelConfig, _fix_pytorch_state_dict_keys` + """Fix state dict keys to match current model architecture.""" + + fixed_state_dict = {} + + for key, value in state_dict.items(): + new_key = key + + # Handle vision tower embedding layer potential differences + if "patch_embedding" in key: + # Some checkpoints might have this, but current model expects different structure + logging.warning(f"Vision embedding key might need handling: {key}") + + if ( + key == "model.paligemma_with_expert.paligemma.lm_head.weight" + or key == "paligemma_with_expert.paligemma.lm_head.weight" + ): + fixed_state_dict[ + "model.paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight" + ] = value.clone() + + fixed_state_dict[new_key] = value + + return fixed_state_dict + + def get_optim_params(self) -> dict: + return self.parameters() + + def reset(self): + """Reset internal state - called when environment resets.""" + self._action_queue = deque(maxlen=self.config.n_action_steps) + self._queues = { + ACTION: deque(maxlen=self.config.n_action_steps), + } + + def init_rtc_processor(self): + """Initialize RTC processor if RTC is enabled in config.""" + self.rtc_processor = None + + # Create processor if config provided + # If RTC is not enabled - we can still track the denoising data + if self.config.rtc_config is not None: + self.rtc_processor = RTCProcessor(self.config.rtc_config) + + model_value = getattr(self, "model", None) + if model_value is not None: + model_value.rtc_processor = self.rtc_processor + + def _rtc_enabled(self) -> bool: + return self.config.rtc_config is not None and self.config.rtc_config.enabled + + def _preprocess_images(self, batch: dict[str, Tensor]) -> tuple[list[Tensor], list[Tensor]]: + """Preprocess images for the model. + + Images from LeRobot are typically in [B, C, H, W] format and normalized to [0, 1]. + PaliGemma expects images in [B, C, H, W] format and normalized to [-1, 1]. + """ + images = [] + img_masks = [] + + # Get device from model parameters + device = next(self.parameters()).device + + present_img_keys = [key for key in self.config.image_features if key in batch] + missing_img_keys = [key for key in self.config.image_features if key not in batch] + + if len(present_img_keys) == 0: + raise ValueError( + f"All image features are missing from the batch. At least one expected. 
" + f"(batch: {batch.keys()}) (image_features: {self.config.image_features})" + ) + + # Preprocess image features present in the batch + for key in present_img_keys: + img = batch[key] + + # Ensure tensor is on the same device as the model + if img.device != device: + img = img.to(device) + + # Ensure float32 dtype for consistency + if img.dtype != torch.float32: + img = img.to(torch.float32) + + # from openpi preprocess_observation_pytorch: Handle both [B, C, H, W] and [B, H, W, C] formats + is_channels_first = img.shape[1] == 3 # Check if channels are in dimension 1 + + if is_channels_first: + # Convert [B, C, H, W] to [B, H, W, C] for processing + img = img.permute(0, 2, 3, 1) + + # from openpi preprocess_observation_pytorch: Resize with padding if needed + if img.shape[1:3] != self.config.image_resolution: + img = resize_with_pad_torch(img, *self.config.image_resolution) + + # Normalize from [0,1] to [-1,1] as expected by siglip + img = img * 2.0 - 1.0 + + # from openpi preprocess_observation_pytorch: Convert back to [B, C, H, W] format if it was originally channels-first + if is_channels_first: + img = img.permute(0, 3, 1, 2) # [B, H, W, C] -> [B, C, H, W] + + images.append(img) + # Create mask (all ones for real images) + bsize = img.shape[0] + mask = torch.ones(bsize, dtype=torch.bool, device=device) + img_masks.append(mask) + + # Create image features not present in the batch as fully 0 padded images + for _num_empty_cameras in range(len(missing_img_keys)): + img = torch.ones_like(img) * -1 # Padded with -1 for SigLIP + mask = torch.zeros_like(mask) # Mask is zero for empty cameras + images.append(img) + img_masks.append(mask) + + return images, img_masks + + def prepare_action(self, batch): + """Pad action""" + actions = pad_vector(batch[ACTION], self.config.max_action_dim) + return actions + + def _paligemma_tokens_to_act_tokens(self, tokens: torch.Tensor) -> torch.Tensor: + """ + Converts PaliGemma tokens back to action tokens (inverse of _act_tokens_to_paligemma_tokens). + + Args: + tokens: PaliGemma token IDs + + Returns: + Action token IDs + """ + return self._paligemma_tokenizer.vocab_size - 1 - self.config.fast_skip_tokens - tokens + + def decode_actions_with_fast( + self, token_ids: list[int], time_horizon: int, action_dim: int, relaxed_decoding: bool = True + ) -> np.ndarray: + """ + Decodes action token IDs back to continuous action values using the FAST tokenizer. + + Args: + token_ids: List of token IDs to decode. + time_horizon: The number of timesteps for actions. + action_dim: The dimensionality of each action. + relaxed_decoding: Whether to use relaxed decoding (allows partial sequences). + + Returns: + A numpy array representing the decoded actions. 
+ """ + decoded_actions = [] + + for token in token_ids: + try: + decoded_tokens = self.action_tokenizer.bpe_tokenizer.decode(token) + decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.action_tokenizer.min_token + + if relaxed_decoding: + # expected sequence length + expected_seq_len = time_horizon * action_dim + diff = expected_seq_len - decoded_dct_coeff.shape[0] + + # apply truncation if too long + if diff < 0: + decoded_dct_coeff = decoded_dct_coeff[:expected_seq_len] # truncate on the right + + # apply padding if too short + elif diff > 0: + decoded_dct_coeff = np.pad( + decoded_dct_coeff, (0, diff), mode="constant", constant_values=0 + ) + + decoded_dct_coeff = decoded_dct_coeff.reshape(-1, action_dim) + assert decoded_dct_coeff.shape == ( + time_horizon, + action_dim, + ), ( + f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({time_horizon}, {action_dim})" + ) + + except Exception as e: + logging.warning(f"Error decoding tokens: {e}") + logging.warning(f"Tokens: {token}") + decoded_dct_coeff = np.zeros((time_horizon, action_dim)) + + decoded_actions.append( + idct(decoded_dct_coeff / self.action_tokenizer.scale, axis=0, norm="ortho") + ) + + return np.stack(decoded_actions) + + def detokenize_actions(self, tokens: torch.Tensor, action_horizon: int, action_dim: int) -> torch.Tensor: + """ + Detokenizes action tokens back to continuous actions. + + This method converts predicted action tokens from the model back to continuous action values + using the FAST tokenizer. It handles the conversion from PaliGemma token space to action token + space, then decodes the action tokens to continuous values using DCT decoding. + + Args: + tokens: The input tensor of tokenized outputs. Shape: (B, seq_len) or (seq_len,) + action_horizon: The number of timesteps for actions. + action_dim: The dimensionality of each action. + + Returns: + The continuous action tensor. Shape: (B, action_horizon, action_dim) or (action_horizon, action_dim) + """ + if self.action_tokenizer is None or self._paligemma_tokenizer is None: + raise ValueError( + "Action tokenizer not initialized. Make sure fast_only=True in config and tokenizers loaded successfully." 
+ ) + + # Handle single sample (add batch dimension) + single_sample = tokens.dim() == 1 + if single_sample: + tokens = tokens.unsqueeze(0) + + # Convert token IDs to token strings + decoded_tokens = [self._paligemma_tokenizer.convert_ids_to_tokens(seq.tolist()) for seq in tokens] + # Get the token sequence for "Action: " to remove it + action_prefix_ids = self._paligemma_tokenizer.encode("Action: ", add_special_tokens=False) + action_prefix_tokens = self._paligemma_tokenizer.convert_ids_to_tokens(action_prefix_ids) + action_prefix_len = len(action_prefix_tokens) + + # Clean tokens by removing everything after the first "|" (end-of-action marker) + # and removing all occurrences of "Action: " token sequence + # assert that beginning contain "Action: " + if self.config.validate_action_token_prefix: + for token_seq in decoded_tokens: + assert len(token_seq) >= 2 and token_seq[0] == "Action" and token_seq[1] == ":", ( + f"Token sequence does not start with ['Action', ':']: {token_seq}" + ) + + cleaned_tokens = [] + for token_seq in decoded_tokens: + # Remove everything after "|" + if "|" in token_seq: + token_seq = token_seq[: token_seq.index("|")] + + # Remove all occurrences of "Action: " token sequence + i = 0 + while i <= len(token_seq) - action_prefix_len: + if token_seq[i : i + action_prefix_len] == action_prefix_tokens: + # Found a match, remove it + token_seq = token_seq[:i] + token_seq[i + action_prefix_len :] + else: + i += 1 + + cleaned_tokens.append(token_seq) + + # Convert token strings back to IDs + raw_action_tokens = [ + torch.tensor( + self._paligemma_tokenizer.convert_tokens_to_ids(token_seq), + dtype=torch.long, + device=tokens.device, + ) + for token_seq in cleaned_tokens + ] + + # Convert PaliGemma tokens to action tokens + action_tokens = [ + self._paligemma_tokens_to_act_tokens(raw_action_token) for raw_action_token in raw_action_tokens + ] + + # Decode action tokens to continuous actions + actions = self.decode_actions_with_fast( + action_tokens, time_horizon=action_horizon, action_dim=action_dim + ) + + # Convert to tensor and return + actions_tensor = torch.tensor(actions, dtype=torch.float32, device=tokens.device) + + # Remove batch dimension if input was single sample + if single_sample: + actions_tensor = actions_tensor.squeeze(0) + + return actions_tensor + + @torch.no_grad() + def select_action(self, batch: dict[str, Tensor]) -> Tensor: + """Select a single action given environment observations.""" + assert not self._rtc_enabled(), ( + "RTC is not supported for select_action, use it with predict_action_chunk" + ) + + self.eval() + + # Action queue logic for n_action_steps > 1 + if len(self._action_queue) == 0: + actions = self.predict_action_chunk(batch)[:, : self.config.n_action_steps] + # Transpose to get shape (n_action_steps, batch_size, action_dim) + self._action_queue.extend(actions.transpose(0, 1)) + + return self._action_queue.popleft() + + @torch.no_grad() + def predict_action_chunk(self, batch: dict[str, Tensor], **kwargs: Unpack[ActionSelectKwargs]) -> Tensor: + """Predict a chunk of actions given environment observations.""" + self.eval() + # Prepare inputs + images, img_masks = self._preprocess_images(batch) + + # FAST-only mode: use autoregressive decoding + tokens = batch[f"{OBS_LANGUAGE_TOKENS}"] + masks = batch[f"{OBS_LANGUAGE_ATTENTION_MASK}"] + + # Get decoding parameters + temperature = self.config.temperature + max_decoding_steps = self.config.max_decoding_steps + + # Sample action tokens autoregressively + if self.config.use_kv_cache: + 
action_tokens = self.model.sample_actions_fast_kv_cache( + images, + img_masks, + tokens, + masks, + max_decoding_steps=max_decoding_steps, + temperature=temperature, + ) + else: + action_tokens = self.model.sample_actions_fast( + images, + img_masks, + tokens, + masks, + max_decoding_steps=max_decoding_steps, + temperature=temperature, + ) + + # Detokenize action tokens to continuous actions + action_horizon = self.config.n_action_steps + action_dim = self.config.output_features[ACTION].shape[0] + + continuous_actions = self.detokenize_actions( + action_tokens, action_horizon=action_horizon, action_dim=action_dim + ) + + return continuous_actions + + def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]: + """Run the batch through the model and compute the loss for training.""" + + # Prepare inputs + images, img_masks = self._preprocess_images(batch) + + # Get FAST action tokens from batch + fast_action_tokens = batch.get(ACTION_TOKENS) # (B, max_action_tokens) + fast_action_masks = batch.get(ACTION_TOKEN_MASK) # (B, max_action_tokens) + + # Use full language tokens (no separation into high_level_task and subtask) + tokens = batch.get(OBS_LANGUAGE_TOKENS) + masks = batch.get(OBS_LANGUAGE_ATTENTION_MASK) + + if fast_action_tokens is None or fast_action_masks is None: + raise ValueError( + f"PI0Fast requires {ACTION_TOKENS} and {ACTION_TOKEN_MASK} to be present in the batch" + ) + + loss_dict = self.model.forward( + images, + img_masks, + tokens, + masks, + fast_action_tokens, + fast_action_masks, + ) + + loss = loss_dict["loss"] + detailed_loss_dict = { + "loss": loss.item(), + "ce_loss": loss_dict["ce_loss"].item(), + } + return loss, detailed_loss_dict diff --git a/src/lerobot/policies/pi0_fast/processor_pi0_fast.py b/src/lerobot/policies/pi0_fast/processor_pi0_fast.py new file mode 100644 index 000000000..0d9dac673 --- /dev/null +++ b/src/lerobot/policies/pi0_fast/processor_pi0_fast.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python + +# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
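+
+# The preprocessor below folds the (normalized, discretized) robot state into the
+# text prompt, so a hypothetical frame might reach the PaliGemma tokenizer as:
+#   "Task: pick up the cube, State: 127 54 201 ...;\n"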
+
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+import torch
+
+from lerobot.configs.types import PipelineFeatureType, PolicyFeature
+from lerobot.policies.pi0_fast.configuration_pi0_fast import PI0FastConfig
+from lerobot.policies.pi0_fast.modeling_pi0_fast import pad_vector
+from lerobot.processor import (
+    ActionTokenizerProcessorStep,
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    NormalizerProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    RenameObservationsProcessorStep,
+    TokenizerProcessorStep,
+    UnnormalizerProcessorStep,
+)
+from lerobot.processor.converters import policy_action_to_transition, transition_to_policy_action
+from lerobot.processor.core import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    OBS_STATE,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+
+
+@ProcessorStepRegistry.register(name="pi0_fast_prepare_state_tokenizer_processor_step")
+@dataclass
+class Pi0FastPrepareStateAndLanguageTokenizerProcessorStep(ProcessorStep):
+    """
+    Processor step that pads and discretizes the state, then folds it into the language prompt.
+    """
+
+    max_state_dim: int = 32
+    task_key: str = "task"
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        transition = transition.copy()
+
+        state = transition.get(TransitionKey.OBSERVATION, {}).get(OBS_STATE)
+        if state is None:
+            raise ValueError("State is required for PI0Fast")
+        tasks = transition.get(TransitionKey.COMPLEMENTARY_DATA, {}).get(self.task_key)
+        if tasks is None:
+            raise ValueError("No task found in complementary data")
+
+        # TODO: check if this is necessary
+        state = deepcopy(state)
+
+        # Prepare state (pad to max_state_dim)
+        state = pad_vector(state, self.max_state_dim)
+
+        # State should already be normalized to [-1, 1] by the NormalizerProcessorStep that runs before this step
+        # Discretize into 256 bins (see openpi `PaligemmaTokenizer.tokenize()`)
+        state_np = state.cpu().numpy()
+        discretized_states = np.digitize(state_np, bins=np.linspace(-1, 1, 256 + 1)[:-1]) - 1
+
+        full_prompts = []
+        for i, task in enumerate(tasks):
+            cleaned_text = task.strip().replace("_", " ").replace("\n", " ")
+            state_str = " ".join(map(str, discretized_states[i]))
+            full_prompt = f"Task: {cleaned_text}, State: {state_str};\n"
+            full_prompts.append(full_prompt)
+
+        transition[TransitionKey.COMPLEMENTARY_DATA][self.task_key] = full_prompts
+        return transition
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """
+        This step does not alter the feature definitions.
+        """
+        return features
+
+
+def make_pi0_fast_pre_post_processors(
+    config: PI0FastConfig,
+    dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """
+    Constructs pre-processor and post-processor pipelines for the PI0Fast policy.
+
+    The pre-processing pipeline prepares input data for the model by:
+    1. Renaming features to match pretrained configurations.
+    2. Adding a batch dimension.
+    3. Normalizing input and output features based on dataset statistics.
+    4. Padding and discretizing the state, then folding it into the text prompt.
+    5. Tokenizing the text prompt with the PaliGemma tokenizer and encoding target actions with the FAST tokenizer.
+    6. Moving all data to the specified device.
+
+    The post-processing pipeline handles the model's output by:
+    1. Unnormalizing the output features to their original scale.
+    2. Moving data to the CPU.
+
+    Args:
+        config: The configuration object for the PI0Fast policy.
+        dataset_stats: A dictionary of statistics for normalization.
+
+    Returns:
+        A tuple containing the configured pre-processor and post-processor pipelines.
+    """
+    # Assemble the pre-processing steps in execution order
+    input_steps: list[ProcessorStep] = [
+        RenameObservationsProcessorStep(rename_map={}),  # To mimic the same processor as pretrained one
+        AddBatchDimensionProcessorStep(),
+        # NOTE: NormalizerProcessorStep MUST come before Pi0FastPrepareStateAndLanguageTokenizerProcessorStep
+        # because the tokenizer step expects normalized state in [-1, 1] range for discretization
+        NormalizerProcessorStep(
+            features={**config.input_features, **config.output_features},
+            norm_map=config.normalization_mapping,
+            stats=dataset_stats,
+        ),
+        Pi0FastPrepareStateAndLanguageTokenizerProcessorStep(max_state_dim=config.max_state_dim),
+        TokenizerProcessorStep(
+            tokenizer_name=config.text_tokenizer_name,
+            max_length=config.tokenizer_max_length,
+            padding_side="right",
+            padding="max_length",
+        ),
+        ActionTokenizerProcessorStep(
+            action_tokenizer_name=config.action_tokenizer_name,
+            max_action_tokens=config.max_action_tokens,
+            fast_skip_tokens=config.fast_skip_tokens,
+            paligemma_tokenizer_name=config.text_tokenizer_name,
+        ),
+        DeviceProcessorStep(device=config.device),
+    ]
+
+    output_steps: list[ProcessorStep] = [
+        UnnormalizerProcessorStep(
+            features=config.output_features, norm_map=config.normalization_mapping, stats=dataset_stats
+        ),
+        DeviceProcessorStep(device="cpu"),
+    ]
+
+    return (
+        PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+            steps=input_steps,
+            name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+        ),
+        PolicyProcessorPipeline[PolicyAction, PolicyAction](
+            steps=output_steps,
+            name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+            to_transition=policy_action_to_transition,
+            to_output=transition_to_policy_action,
+        ),
+    )
diff --git a/src/lerobot/policies/pretrained.py b/src/lerobot/policies/pretrained.py
index 3f5d89ec5..e730b78a7 100644
--- a/src/lerobot/policies/pretrained.py
+++ b/src/lerobot/policies/pretrained.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import abc
 import builtins
+import dataclasses
 import logging
 import os
 from importlib.resources import files
@@ -206,6 +207,7 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
     def push_model_to_hub(
         self,
         cfg: TrainPipelineConfig,
+        peft_model=None,
     ):
         api = HfApi()
         repo_id = api.create_repo(
@@ -216,7 +218,14 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC):
         with TemporaryDirectory(ignore_cleanup_errors=True) as tmp:
             saved_path = Path(tmp) / repo_id
-            self.save_pretrained(saved_path)  # Calls _save_pretrained and stores model tensors
+            if peft_model is not None:
+                # Since PEFT just forwards calls to `push_model_to_hub`, `self` is not the PeftModel wrapper
+                # but the actual policy, which is why we need the PEFT model passed to us to save the adapter.
+                # That also means that we need to store the policy config ourselves since PEFT can't.
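+                # With this split, the uploaded folder ends up holding the PEFT adapter
+                # files (e.g. adapter_config.json plus the adapter weights) next to the
+                # policy config written below.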
+ peft_model.save_pretrained(saved_path) + self.config.save_pretrained(saved_path) + else: + self.save_pretrained(saved_path) # Calls _save_pretrained and stores model tensors card = self.generate_model_card( cfg.dataset.repo_id, self.config.type, self.config.license, self.config.tags @@ -257,3 +266,166 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC): card = ModelCard.from_template(card_data, template_str=template_card) card.validate() return card + + def wrap_with_peft( + self, + peft_config=None, + peft_cli_overrides: dict | None = None, + ) -> "PreTrainedPolicy": + """ + Wrap this policy with PEFT adapters for parameter-efficient fine-tuning. + + This method is the single entry point for PEFT integration. Subclasses should + override `_get_default_peft_targets()` to provide default target modules, and + `_validate_peft_config()` for policy-specific validation. + + Args: + peft_config: Optional PEFT adapter configuration (e.g., LoraConfig). + If provided, used directly (with CLI overrides applied). + peft_cli_overrides: Optional dict of CLI overrides (method_type, target_modules, r, etc.) + These are merged with policy defaults to build the final config. + """ + from peft import get_peft_model + + # If user provided a complete config, use it directly (with overrides) + if peft_config is not None: + final_config = peft_config + if peft_cli_overrides: + final_config = self._apply_peft_cli_overrides(final_config, peft_cli_overrides) + else: + # Build config from defaults + CLI overrides + final_config = self._build_peft_config(peft_cli_overrides or {}) + + # Validate the configuration + self._validate_peft_config(final_config) + + # Freeze base parameters, only adapter params will be trained + for p in self.parameters(): + p.requires_grad_(False) + + # Store pretrained path for PEFT's base_model_name_or_path + if self.config.pretrained_path: + self.name_or_path = str(self.config.pretrained_path) + + # Wrap with PEFT + peft_model = get_peft_model(self, final_config) + + # Mark config as using PEFT for proper loading later + peft_model.config.use_peft = True + + logging.info(f"Wrapped {self.name} with PEFT ({type(final_config).__name__})") + return peft_model + + def _get_default_peft_targets(self) -> dict[str, any] | None: + """ + Return default PEFT target modules for this policy. + + Override this in subclasses to provide policy-specific defaults. These defaults + are PEFT-method agnostic - they only specify which modules to target. + + """ + return None + + def _validate_peft_config(self, peft_config) -> None: + """ + Validate the PEFT configuration for this policy. + + Override this in subclasses to add policy-specific validation or warnings. + The default implementation checks that a pretrained_path exists. + + Args: + peft_config: The PEFT configuration to validate. + + Raises: + ValueError: If the configuration is invalid. + """ + if not self.config.pretrained_path: + raise ValueError( + "Training from scratch using PEFT is unlikely to yield good results. " + "Supply a `policy.pretrained_path` to fine-tune an existing model." + ) + + def _preprocess_peft_cli_overrides(self, cli_overrides: dict, peft_method_type) -> dict: + """ + Preprocess CLI overrides: rename keys and handle method-specific init_type. + + Args: + cli_overrides: Dict of CLI options (will be copied, not mutated). + peft_method_type: The PeftType enum value for the PEFT method. + + Returns: + Preprocessed dict with renamed keys and init_type mapped to method-specific key. 
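+
+        Example (hypothetical CLI values): for a LoRA run,
+            {"full_training_modules": ["lm_head"], "init_type": "gaussian"}
+        is preprocessed into
+            {"modules_to_save": ["lm_head"], "init_lora_weights": "gaussian"}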
+ """ + from peft import PeftType + + cli_overrides = cli_overrides.copy() + + # Handle the full_training_modules -> modules_to_save rename + if "full_training_modules" in cli_overrides: + cli_overrides["modules_to_save"] = cli_overrides.pop("full_training_modules") + + # Remove method_type as it's handled separately + cli_overrides.pop("method_type", None) + + # Handle init_type specially based on PEFT method + init_type = cli_overrides.pop("init_type", None) + if init_type is not None: + if peft_method_type == PeftType.LORA: + cli_overrides["init_lora_weights"] = init_type + elif peft_method_type == PeftType.MISS: + cli_overrides["init_weights"] = init_type + else: + raise ValueError(f"Init type '{init_type}' unknown for PEFT method {peft_method_type}.") + + return cli_overrides + + def _build_peft_config(self, cli_overrides: dict): + """Build a PEFT config from policy defaults and CLI overrides.""" + from peft import PEFT_TYPE_TO_CONFIG_MAPPING, PeftType + + # Determine PEFT method type (default to LORA) + method_type_str = cli_overrides.get("method_type") or "lora" + peft_method_type = PeftType[method_type_str.upper()] + peft_config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[peft_method_type] + + # Preprocess CLI overrides + cli_overrides = self._preprocess_peft_cli_overrides(cli_overrides, peft_method_type) + + # Start with policy defaults, apply CLI overrides + config_dict = dict(self._get_default_peft_targets() or {}) + for key, value in cli_overrides.items(): + if value is not None: + config_dict[key] = value + + # Ensure we have target_modules + if not config_dict.get("target_modules"): + raise ValueError( + f"Policy '{self.name}' does not define default target_modules. " + "Please pass --peft.target_modules explicitly." + ) + + return peft_config_cls(**config_dict) + + def _apply_peft_cli_overrides(self, peft_config, cli_overrides: dict): + """Apply CLI overrides to an existing PEFT config.""" + from peft import PEFT_TYPE_TO_CONFIG_MAPPING, PeftType + + # Get method type from existing config or CLI override + method_type_str = cli_overrides.get("method_type") + if method_type_str: + peft_method_type = PeftType[method_type_str.upper()] + peft_config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[peft_method_type] + else: + peft_method_type = PeftType(peft_config.peft_type) + peft_config_cls = type(peft_config) + + # Preprocess CLI overrides + cli_overrides = self._preprocess_peft_cli_overrides(cli_overrides, peft_method_type) + + # Start with existing config, apply CLI overrides + config_dict = {k: v for k, v in dataclasses.asdict(peft_config).items() if not k.startswith("_")} + for key, value in cli_overrides.items(): + if value is not None: + config_dict[key] = value + + return peft_config_cls(**config_dict) diff --git a/src/lerobot/policies/rtc/configuration_rtc.py b/src/lerobot/policies/rtc/configuration_rtc.py index 70a8dfb09..1087aa082 100644 --- a/src/lerobot/policies/rtc/configuration_rtc.py +++ b/src/lerobot/policies/rtc/configuration_rtc.py @@ -23,7 +23,7 @@ Based on: from dataclasses import dataclass -from lerobot.configs.types import RTCAttentionSchedule +from lerobot.configs.types import RTCAttentionSchedule, RTCTrainingDelayDistribution @dataclass @@ -53,3 +53,22 @@ class RTCConfig: raise ValueError(f"max_guidance_weight must be positive, got {self.max_guidance_weight}") if self.debug_maxlen <= 0: raise ValueError(f"debug_maxlen must be positive, got {self.debug_maxlen}") + + +@dataclass +class RTCTrainingConfig: + """Configuration for training-time RTC action prefix conditioning.""" + + 
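+    # A hypothetical setup that conditions each training sample on a uniformly
+    # sampled prefix of 0-8 ground-truth actions:
+    #   RTCTrainingConfig(enabled=True, min_delay=0, max_delay=8)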
enabled: bool = False + min_delay: int = 0 + max_delay: int = 0 + delay_distribution: RTCTrainingDelayDistribution = RTCTrainingDelayDistribution.UNIFORM + exp_decay: float = 1.0 + + def __post_init__(self): + if self.min_delay < 0: + raise ValueError(f"min_delay must be >= 0, got {self.min_delay}") + if self.max_delay < self.min_delay: + raise ValueError(f"max_delay ({self.max_delay}) must be >= min_delay ({self.min_delay})") + if self.exp_decay <= 0: + raise ValueError(f"exp_decay must be positive, got {self.exp_decay}") diff --git a/src/lerobot/policies/rtc/training_time.py b/src/lerobot/policies/rtc/training_time.py new file mode 100644 index 000000000..a47bd6cec --- /dev/null +++ b/src/lerobot/policies/rtc/training_time.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import torch + +from lerobot.configs.types import RTCTrainingDelayDistribution +from lerobot.policies.rtc.configuration_rtc import RTCTrainingConfig + + +def sample_rtc_delay(cfg: RTCTrainingConfig, batch_size: int, device: torch.device) -> torch.Tensor: + if cfg.max_delay == cfg.min_delay: + return torch.full((batch_size,), cfg.min_delay, device=device, dtype=torch.long) + + if cfg.delay_distribution == RTCTrainingDelayDistribution.UNIFORM: + return torch.randint(cfg.min_delay, cfg.max_delay + 1, (batch_size,), device=device, dtype=torch.long) + + delay_values = torch.arange(cfg.min_delay, cfg.max_delay + 1, device=device, dtype=torch.long) + weights = torch.exp(-cfg.exp_decay * delay_values.to(dtype=torch.float32)) + probs = weights / weights.sum() + samples = torch.multinomial(probs, batch_size, replacement=True) + return delay_values[samples] + + +def apply_rtc_training_time( + time: torch.Tensor, delay: torch.Tensor, seq_len: int +) -> tuple[torch.Tensor, torch.Tensor]: + device = time.device + delay = torch.clamp(delay, max=seq_len) + prefix_mask = torch.arange(seq_len, device=device)[None, :] < delay[:, None] + time_tokens = time[:, None].expand(-1, seq_len) + time_tokens = time_tokens.masked_fill(prefix_mask, 0.0) + postfix_mask = ~prefix_mask + return time_tokens, postfix_mask + + +def masked_mean( + losses: torch.Tensor, mask: torch.Tensor | None, reduce_dims: tuple[int, ...], eps: float = 1e-8 +) -> torch.Tensor: + if mask is None: + return losses.mean(dim=reduce_dims) + + mask = mask.to(dtype=losses.dtype) + while mask.dim() < losses.dim(): + mask = mask.unsqueeze(-1) + masked = losses * mask + denom = mask.sum(dim=reduce_dims).clamp_min(eps) + return masked.sum(dim=reduce_dims) / denom + + +def apply_training_time_rtc_inference( + x_t: torch.Tensor, + time: float, + inference_delay: int | None, + prev_chunk_left_over: torch.Tensor | None, + chunk_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + """Apply training-time RTC conditioning during inference. + + Based on Algorithm 1 from "Training-Time Action Conditioning for Efficient Real-Time Chunking". 
+ + At each denoising step: + 1. Replace prefix positions in x_t with ground truth from previous chunk + 2. Create per-token timesteps with 1.0 for prefix positions + + Args: + x_t: Current noisy actions (B, T, D) + time: Current flow matching timestep (scalar) + inference_delay: Number of prefix actions to condition on + prev_chunk_left_over: Previous chunk's leftover actions (B, T, D) + chunk_size: Total chunk size T + + Returns: + x_t_conditioned: x_t with prefix replaced by previous actions + time_per_token: Per-token timesteps (B, T) with 1.0 for prefix + """ + batch_size = x_t.shape[0] + device = x_t.device + + if inference_delay is None or inference_delay <= 0 or prev_chunk_left_over is None: + time_scalar = torch.full((batch_size,), time, device=device, dtype=torch.float32) + return x_t, time_scalar + + delay = min(inference_delay, chunk_size) + prefix_mask = torch.arange(chunk_size, device=device)[None, :] < delay + + x_t_conditioned = torch.where( + prefix_mask[:, :, None].expand_as(x_t), + prev_chunk_left_over[:, :chunk_size, :], + x_t, + ) + + time_per_token = torch.full((batch_size, chunk_size), time, device=device, dtype=torch.float32) + time_per_token = time_per_token.masked_fill(prefix_mask, 1.0) + + return x_t_conditioned, time_per_token diff --git a/src/lerobot/policies/sarm/configuration_sarm.py b/src/lerobot/policies/sarm/configuration_sarm.py index 59cb352d5..673422fe2 100644 --- a/src/lerobot/policies/sarm/configuration_sarm.py +++ b/src/lerobot/policies/sarm/configuration_sarm.py @@ -26,6 +26,7 @@ from lerobot.configs.policies import PreTrainedConfig from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig +from lerobot.utils.constants import OBS_IMAGES, OBS_STATE @PreTrainedConfig.register_subclass("sarm") @@ -86,8 +87,8 @@ class SARMConfig(PreTrainedConfig): pretrained_model_path: str | None = None device: str | None = None - image_key: str = "observation.images.top" # Key for image used from the dataset - state_key: str = "observation.state" + image_key: str = OBS_IMAGES + ".top" # Key for image used from the dataset + state_key: str = OBS_STATE # Populated by the processor (video_features, state_features, text_features) input_features: dict = field(default_factory=lambda: {}) diff --git a/src/lerobot/policies/sarm/modeling_sarm.py b/src/lerobot/policies/sarm/modeling_sarm.py index a88b2ad64..6051d90f8 100644 --- a/src/lerobot/policies/sarm/modeling_sarm.py +++ b/src/lerobot/policies/sarm/modeling_sarm.py @@ -40,6 +40,7 @@ from lerobot.policies.sarm.sarm_utils import ( normalize_stage_tau, pad_state_to_max_dim, ) +from lerobot.utils.constants import OBS_STR class StageTransformer(nn.Module): @@ -721,7 +722,7 @@ class SARMRewardModel(PreTrainedPolicy): Returns: Tuple of (total_loss, output_dict with loss components) """ - observation = batch.get("observation", batch) + observation = batch.get(OBS_STR, batch) # Extract features video_features = observation["video_features"].to(self.device) diff --git a/src/lerobot/policies/smolvla/configuration_smolvla.py b/src/lerobot/policies/smolvla/configuration_smolvla.py index c32c8a60e..fa773213d 100644 --- a/src/lerobot/policies/smolvla/configuration_smolvla.py +++ b/src/lerobot/policies/smolvla/configuration_smolvla.py @@ -20,7 +20,7 @@ from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import ( CosineDecayWithWarmupSchedulerConfig, ) -from 
lerobot.policies.rtc.configuration_rtc import RTCConfig +from lerobot.policies.rtc.configuration_rtc import RTCConfig, RTCTrainingConfig from lerobot.utils.constants import OBS_IMAGES @@ -103,8 +103,9 @@ class SmolVLAConfig(PreTrainedConfig): min_period: float = 4e-3 # sensitivity range for the timestep used in sine-cosine positional encoding max_period: float = 4.0 - # Real-Time Chunking (RTC) configuration + # Real-Time Chunking (RTC) configurations rtc_config: RTCConfig | None = None + rtc_training_config: RTCTrainingConfig | None = None def __post_init__(self): super().__post_init__() diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index f998661f9..cd53ef78d 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -63,6 +63,12 @@ from typing_extensions import Unpack from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.policies.rtc.modeling_rtc import RTCProcessor +from lerobot.policies.rtc.training_time import ( + apply_rtc_training_time, + apply_training_time_rtc_inference, + masked_mean, + sample_rtc_delay, +) from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel from lerobot.policies.utils import ( @@ -85,8 +91,8 @@ def create_sinusoidal_pos_embedding( if dimension % 2 != 0: raise ValueError(f"dimension ({dimension}) must be divisible by 2") - if time.ndim != 1: - raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + if time.ndim not in (1, 2): + raise ValueError("The time tensor is expected to be of shape `(batch_size,)` or `(batch_size, T)`.") dtype = get_safe_dtype(torch.float64, device.type) fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) @@ -94,9 +100,14 @@ def create_sinusoidal_pos_embedding( # Compute the outer product scaling_factor = 1.0 / period * 2 * math.pi - sin_input = scaling_factor[None, :] * time[:, None] + if time.ndim == 1: + sin_input = scaling_factor[None, :] * time[:, None] + return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + + time_flat = time.reshape(-1) + sin_input = scaling_factor[None, :] * time_flat[:, None] pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) - return pos_emb + return pos_emb.reshape(*time.shape, dimension) def make_att_2d_masks(pad_masks, att_masks): @@ -375,6 +386,16 @@ class SmolVLAPolicy(PreTrainedPolicy): lang_tokens = batch[f"{OBS_LANGUAGE_TOKENS}"] lang_masks = batch[f"{OBS_LANGUAGE_ATTENTION_MASK}"] actions = self.prepare_action(batch) + postfix_mask = None + rtc_cfg = self.config.rtc_training_config + if rtc_cfg is not None and rtc_cfg.enabled and self.training: + batch_size = actions.shape[0] + if time is None: + time = self.model.sample_time(batch_size, actions.device) + if noise is None: + noise = self.model.sample_noise(actions.shape, actions.device) + delay = sample_rtc_delay(rtc_cfg, batch_size, actions.device) + time, postfix_mask = apply_rtc_training_time(time, delay, actions.shape[1]) actions_is_pad = batch.get("actions_id_pad") loss_dict = {} losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) @@ -384,6 +405,7 @@ class SmolVLAPolicy(PreTrainedPolicy): in_episode_bound = ~actions_is_pad losses = losses * in_episode_bound.unsqueeze(-1) loss_dict["losses_after_in_ep_bound"] = losses.clone() + postfix_mask = in_episode_bound if 
postfix_mask is None else (postfix_mask & in_episode_bound) # Remove padding losses = losses[:, :, : self.config.max_action_dim] @@ -391,12 +413,12 @@ class SmolVLAPolicy(PreTrainedPolicy): if reduction == "none": # Return per-sample losses (B,) by averaging over time and action dims - per_sample_loss = losses.mean(dim=(1, 2)) + per_sample_loss = masked_mean(losses, postfix_mask, reduce_dims=(1, 2)) loss_dict["loss"] = per_sample_loss.mean().item() return per_sample_loss, loss_dict else: # Default: return scalar mean loss - loss = losses.mean() + loss = masked_mean(losses, postfix_mask, reduce_dims=(0, 1, 2)) loss_dict["loss"] = loss.item() return loss, loss_dict @@ -480,6 +502,28 @@ class SmolVLAPolicy(PreTrainedPolicy): actions = pad_vector(batch[ACTION], self.config.max_action_dim) return actions + def _get_default_peft_targets(self) -> dict[str, any]: + """Return default PEFT target modules for SmolVLA fine-tuning.""" + common_projections = ( + "state_proj|action_in_proj|action_out_proj|action_time_mlp_in|action_time_mlp_out" + ) + target_modules = rf"(model\.vlm_with_expert\.lm_expert\..*\.(q|v)_proj|model\.({common_projections}))" + return { + "target_modules": target_modules, + "modules_to_save": [], + } + + def _validate_peft_config(self, peft_config) -> None: + """Validate PEFT configuration for SmolVLA.""" + super()._validate_peft_config(peft_config) + if not self.config.load_vlm_weights: + import logging + + logging.warning( + "Training SmolVLA from scratch using PEFT. This is unlikely to yield good results. " + "Set `load_vlm_weights=True` to fine-tune the existing policy." + ) + def pad_tensor(tensor, max_len, pad_value=0): """ @@ -574,6 +618,9 @@ class VLAFlowMatching(nn.Module): def _rtc_enabled(self): return self.config.rtc_config is not None and self.config.rtc_config.enabled + def _training_time_rtc_inference_enabled(self): + return self.config.rtc_training_config is not None and self.config.rtc_training_config.enabled + def set_requires_grad(self): for params in self.state_proj.parameters(): params.requires_grad = self.config.train_state_proj @@ -709,7 +756,10 @@ class VLAFlowMatching(nn.Module): ) time_emb = time_emb.type(dtype=dtype) - time_emb = time_emb[:, None, :].expand_as(action_emb) + if time_emb.dim() == 2: + time_emb = time_emb[:, None, :].expand_as(action_emb) + elif time_emb.shape[:2] != action_emb.shape[:2]: + raise ValueError(f"Expected time_emb shape {action_emb.shape[:2]}, got {time_emb.shape[:2]}") action_time_emb = torch.cat([action_emb, time_emb], dim=2) action_time_emb = self.action_time_mlp_in(action_time_emb) @@ -741,7 +791,12 @@ class VLAFlowMatching(nn.Module): if time is None: time = self.sample_time(actions.shape[0], actions.device) - time_expanded = time[:, None, None] + if time.ndim == 1: + time_expanded = time[:, None, None] + elif time.ndim == 2: + time_expanded = time[:, :, None] + else: + raise ValueError(f"Expected time shape (B,) or (B, T), got {time.shape}") x_t = time_expanded * noise + (1 - time_expanded) * actions u_t = noise - actions prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( @@ -804,23 +859,35 @@ class VLAFlowMatching(nn.Module): num_steps = self.config.num_steps dt = -1.0 / num_steps + inference_delay = kwargs.get("inference_delay") + prev_chunk_left_over = kwargs.get("prev_chunk_left_over") + execution_horizon = kwargs.get("execution_horizon") + use_training_time_rtc = self._training_time_rtc_inference_enabled() + x_t = noise for step in range(num_steps): time = 1.0 + step * dt - time_tensor = 
torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) - def denoise_step_partial_call(input_x_t, current_timestep=time_tensor): - return self.denoise_step( - x_t=input_x_t, + if use_training_time_rtc: + x_t_cond, time_tensor = apply_training_time_rtc_inference( + x_t, time, inference_delay, prev_chunk_left_over, self.config.chunk_size + ) + v_t = self.denoise_step( + x_t=x_t_cond, prefix_pad_masks=prefix_pad_masks, past_key_values=past_key_values, - timestep=current_timestep, + timestep=time_tensor, ) + elif self._rtc_enabled(): + time_tensor = torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) - if self._rtc_enabled(): - inference_delay = kwargs.get("inference_delay") - prev_chunk_left_over = kwargs.get("prev_chunk_left_over") - execution_horizon = kwargs.get("execution_horizon") + def denoise_step_partial_call(input_x_t, current_timestep=time_tensor): + return self.denoise_step( + x_t=input_x_t, + prefix_pad_masks=prefix_pad_masks, + past_key_values=past_key_values, + timestep=current_timestep, + ) v_t = self.rtc_processor.denoise_step( x_t=x_t, @@ -831,7 +898,13 @@ class VLAFlowMatching(nn.Module): execution_horizon=execution_horizon, ) else: - v_t = denoise_step_partial_call(x_t) + time_tensor = torch.tensor(time, dtype=torch.float32, device=device).expand(bsize) + v_t = self.denoise_step( + x_t=x_t, + prefix_pad_masks=prefix_pad_masks, + past_key_values=past_key_values, + timestep=time_tensor, + ) x_t = x_t + dt * v_t diff --git a/src/lerobot/policies/utils.py b/src/lerobot/policies/utils.py index bfbe2bf1d..1a14b2925 100644 --- a/src/lerobot/policies/utils.py +++ b/src/lerobot/policies/utils.py @@ -16,7 +16,6 @@ import logging from collections import deque -from typing import Any import numpy as np import torch @@ -140,7 +139,7 @@ def prepare_observation_for_inference( def build_inference_frame( - observation: dict[str, Any], + observation: RobotObservation, device: torch.device, ds_features: dict[str, dict], task: str | None = None, diff --git a/src/lerobot/policies/wall_x/configuration_wall_x.py b/src/lerobot/policies/wall_x/configuration_wall_x.py index 0d10a8f98..3962b56f6 100644 --- a/src/lerobot/policies/wall_x/configuration_wall_x.py +++ b/src/lerobot/policies/wall_x/configuration_wall_x.py @@ -18,6 +18,7 @@ from lerobot.configs.policies import PreTrainedConfig from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig +from lerobot.utils.constants import ACTION, OBS_STATE @PreTrainedConfig.register_subclass("wall_x") @@ -105,14 +106,14 @@ class WallXConfig(PreTrainedConfig): "No features of type FeatureType.VISUAL found in input_features." ) - if "observation.state" not in self.input_features: + if OBS_STATE not in self.input_features: state_feature = PolicyFeature( type=FeatureType.STATE, shape=(self.max_state_dim,), # Padded to max_state_dim ) - self.input_features["observation.state"] = state_feature + self.input_features[OBS_STATE] = state_feature else: - state_shape = self.input_features["observation.state"].shape + state_shape = self.input_features[OBS_STATE].shape state_dim = state_shape[0] if state_shape else 0 if state_dim > self.max_state_dim: raise ValueError( @@ -120,14 +121,14 @@ class WallXConfig(PreTrainedConfig): f"Either reduce state dimension or increase max_state_dim in config." 
) - if "action" not in self.output_features: + if ACTION not in self.output_features: action_feature = PolicyFeature( type=FeatureType.ACTION, shape=(self.max_action_dim,), # Padded to max_action_dim ) - self.output_features["action"] = action_feature + self.output_features[ACTION] = action_feature else: - action_shape = self.output_features["action"].shape + action_shape = self.output_features[ACTION].shape action_dim = action_shape[0] if action_shape else 0 if action_dim > self.max_action_dim: raise ValueError( diff --git a/src/lerobot/policies/wall_x/modeling_wall_x.py b/src/lerobot/policies/wall_x/modeling_wall_x.py index c401c8d60..ef99bad89 100644 --- a/src/lerobot/policies/wall_x/modeling_wall_x.py +++ b/src/lerobot/policies/wall_x/modeling_wall_x.py @@ -1697,7 +1697,7 @@ class WallXPolicy(PreTrainedPolicy): config_class = WallXConfig name = "wall_x" - def __init__(self, config: WallXConfig): + def __init__(self, config: WallXConfig, **kwargs): super().__init__(config) config.validate_features() self.config = config @@ -1861,7 +1861,7 @@ class WallXPolicy(PreTrainedPolicy): dim=-1, ) else: - action_dim = self.config.output_features["action"].shape[0] + action_dim = self.config.output_features[ACTION].shape[0] dof_mask = torch.cat( [ torch.ones( @@ -1977,7 +1977,7 @@ class WallXPolicy(PreTrainedPolicy): elif self.config.prediction_mode == "fast": output = self.model( **batch, - action_dim=self.config.output_features["action"].shape[0], + action_dim=self.config.output_features[ACTION].shape[0], pred_horizon=self.config.chunk_size, mode="predict", predict_mode="fast", @@ -1989,7 +1989,7 @@ class WallXPolicy(PreTrainedPolicy): actions = output["predict_action"] # Unpad actions to actual action dimension - action_dim = self.config.output_features["action"].shape[0] + action_dim = self.config.output_features[ACTION].shape[0] actions = actions[:, :, :action_dim] return actions diff --git a/src/lerobot/policies/xvla/processor_xvla.py b/src/lerobot/policies/xvla/processor_xvla.py index 7f7297b9a..c4e3f2d6f 100644 --- a/src/lerobot/policies/xvla/processor_xvla.py +++ b/src/lerobot/policies/xvla/processor_xvla.py @@ -41,6 +41,7 @@ from lerobot.processor.converters import policy_action_to_transition, transition from lerobot.processor.core import EnvTransition, TransitionKey from lerobot.utils.constants import ( OBS_IMAGES, + OBS_PREFIX, OBS_STATE, POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME, @@ -137,8 +138,9 @@ class LiberoProcessorStep(ObservationProcessorStep): processed_obs[key] = img # Process robot_state into a flat state vector - if "observation.robot_state" in processed_obs: - robot_state = processed_obs.pop("observation.robot_state") + robot_state_str = OBS_PREFIX + "robot_state" + if robot_state_str in processed_obs: + robot_state = processed_obs.pop(robot_state_str) # Extract components eef_pos = robot_state["eef"]["pos"] # (B, 3,) @@ -174,8 +176,8 @@ class LiberoProcessorStep(ObservationProcessorStep): state_feats = {} # add our new flattened state - state_feats["observation.state"] = PolicyFeature( - key="observation.state", + state_feats[OBS_STATE] = PolicyFeature( + key=OBS_STATE, shape=(20,), dtype="float32", ) @@ -247,7 +249,7 @@ class XVLAImageScaleProcessorStep(ProcessorStep): keys_to_scale = self.image_keys if keys_to_scale is None: # Auto-detect image keys - keys_to_scale = [k for k in obs if k.startswith("observation.images.")] + keys_to_scale = [k for k in obs if k.startswith(OBS_IMAGES)] # Scale each image for key in keys_to_scale: @@ -303,7 +305,7 
@@ class XVLAImageToFloatProcessorStep(ProcessorStep): keys_to_convert = self.image_keys if keys_to_convert is None: # Auto-detect image keys - keys_to_convert = [k for k in obs if k.startswith("observation.images.")] + keys_to_convert = [k for k in obs if k.startswith(OBS_IMAGES)] # Convert each image for key in keys_to_convert: @@ -376,7 +378,7 @@ class XVLAImageNetNormalizeProcessorStep(ProcessorStep): keys_to_normalize = self.image_keys if keys_to_normalize is None: # Auto-detect image keys - keys_to_normalize = [k for k in obs if k.startswith("observation.images.")] + keys_to_normalize = [k for k in obs if k.startswith(OBS_IMAGES)] # Normalize each image for key in keys_to_normalize: diff --git a/src/lerobot/processor/__init__.py b/src/lerobot/processor/__init__.py index be11ac1af..164f7da03 100644 --- a/src/lerobot/processor/__init__.py +++ b/src/lerobot/processor/__init__.py @@ -49,7 +49,6 @@ from .hil_processor import ( RewardClassifierProcessorStep, TimeLimitProcessorStep, ) -from .joint_observations_processor import JointVelocityProcessorStep, MotorCurrentProcessorStep from .normalize_processor import NormalizerProcessorStep, UnnormalizerProcessorStep, hotswap_stats from .observation_processor import VanillaObservationProcessorStep from .pipeline import ( @@ -75,7 +74,7 @@ from .policy_robot_bridge import ( RobotActionToPolicyActionProcessorStep, ) from .rename_processor import RenameObservationsProcessorStep -from .tokenizer_processor import TokenizerProcessorStep +from .tokenizer_processor import ActionTokenizerProcessorStep, TokenizerProcessorStep __all__ = [ "ActionProcessorStep", @@ -94,14 +93,12 @@ __all__ = [ "ImageCropResizeProcessorStep", "InfoProcessorStep", "InterventionActionProcessorStep", - "JointVelocityProcessorStep", "make_default_processors", "make_default_teleop_action_processor", "make_default_robot_action_processor", "make_default_robot_observation_processor", "MapDeltaActionToRobotActionStep", "MapTensorToDeltaActionDictStep", - "MotorCurrentProcessorStep", "NormalizerProcessorStep", "Numpy2TorchActionProcessorStep", "ObservationProcessorStep", @@ -122,6 +119,7 @@ __all__ = [ "AddBatchDimensionProcessorStep", "RobotProcessorPipeline", "TokenizerProcessorStep", + "ActionTokenizerProcessorStep", "Torch2NumpyActionProcessorStep", "RobotActionToPolicyActionProcessorStep", "PolicyActionToRobotActionProcessorStep", diff --git a/src/lerobot/processor/converters.py b/src/lerobot/processor/converters.py index 126be0e36..4f9485fee 100644 --- a/src/lerobot/processor/converters.py +++ b/src/lerobot/processor/converters.py @@ -23,7 +23,7 @@ from typing import Any import numpy as np import torch -from lerobot.utils.constants import ACTION, DONE, OBS_PREFIX, REWARD, TRUNCATED +from lerobot.utils.constants import ACTION, DONE, INFO, OBS_PREFIX, REWARD, TRUNCATED from .core import EnvTransition, PolicyAction, RobotAction, RobotObservation, TransitionKey @@ -176,7 +176,7 @@ def _extract_complementary_data(batch: dict[str, Any]) -> dict[str, Any]: def create_transition( - observation: dict[str, Any] | None = None, + observation: RobotObservation | None = None, action: PolicyAction | RobotAction | None = None, reward: float = 0.0, done: bool = False, @@ -384,7 +384,7 @@ def transition_to_batch(transition: EnvTransition) -> dict[str, Any]: REWARD: transition.get(TransitionKey.REWARD, 0.0), DONE: transition.get(TransitionKey.DONE, False), TRUNCATED: transition.get(TransitionKey.TRUNCATED, False), - "info": transition.get(TransitionKey.INFO, {}), + INFO: 
transition.get(TransitionKey.INFO, {}), } # Add complementary data. diff --git a/src/lerobot/processor/core.py b/src/lerobot/processor/core.py index 679ba8c54..0b293c9b0 100644 --- a/src/lerobot/processor/core.py +++ b/src/lerobot/processor/core.py @@ -45,7 +45,7 @@ RobotObservation: TypeAlias = dict[str, Any] EnvTransition = TypedDict( "EnvTransition", { - TransitionKey.OBSERVATION.value: dict[str, Any] | None, + TransitionKey.OBSERVATION.value: RobotObservation | None, TransitionKey.ACTION.value: PolicyAction | RobotAction | EnvAction | None, TransitionKey.REWARD.value: float | torch.Tensor | None, TransitionKey.DONE.value: bool | torch.Tensor | None, diff --git a/src/lerobot/processor/env_processor.py b/src/lerobot/processor/env_processor.py index b1872b032..8d42bfdb7 100644 --- a/src/lerobot/processor/env_processor.py +++ b/src/lerobot/processor/env_processor.py @@ -18,7 +18,7 @@ from dataclasses import dataclass import torch from lerobot.configs.types import PipelineFeatureType, PolicyFeature -from lerobot.utils.constants import OBS_IMAGES, OBS_STATE +from lerobot.utils.constants import OBS_IMAGES, OBS_PREFIX, OBS_STATE, OBS_STR from .pipeline import ObservationProcessorStep, ProcessorStepRegistry @@ -60,8 +60,9 @@ class LiberoProcessorStep(ObservationProcessorStep): processed_obs[key] = img # Process robot_state into a flat state vector - if "observation.robot_state" in processed_obs: - robot_state = processed_obs.pop("observation.robot_state") + observation_robot_state_str = OBS_PREFIX + "robot_state" + if observation_robot_state_str in processed_obs: + robot_state = processed_obs.pop(observation_robot_state_str) # Extract components eef_pos = robot_state["eef"]["pos"] # (B, 3,) @@ -98,8 +99,8 @@ class LiberoProcessorStep(ObservationProcessorStep): state_feats = {} # add our new flattened state - state_feats["observation.state"] = PolicyFeature( - key="observation.state", + state_feats[OBS_STATE] = PolicyFeature( + key=OBS_STATE, shape=(8,), # [eef_pos(3), axis_angle(3), gripper(2)] dtype="float32", description=("Concatenated end-effector position (3), axis-angle (3), and gripper qpos (2)."), @@ -152,3 +153,78 @@ class LiberoProcessorStep(ObservationProcessorStep): result[mask] = axis * angle.unsqueeze(1) return result + + +@dataclass +@ProcessorStepRegistry.register(name="isaaclab_arena_processor") +class IsaaclabArenaProcessorStep(ObservationProcessorStep): + """ + Processes IsaacLab Arena observations into LeRobot format. + + **State Processing:** + - Extracts state components from obs["policy"] based on `state_keys`. + - Concatenates into a flat vector mapped to "observation.state". + + **Image Processing:** + - Extracts images from obs["camera_obs"] based on `camera_keys`. + - Converts from (B, H, W, C) uint8 to (B, C, H, W) float32 [0, 1]. + - Maps to "observation.images.". + """ + + # Configurable from IsaacLabEnv config / cli args: --env.state_keys="robot_joint_pos,left_eef_pos" + state_keys: tuple[str, ...] + + # Configurable from IsaacLabEnv config / cli args: --env.camera_keys="robot_pov_cam_rgb" + camera_keys: tuple[str, ...] + + def _process_observation(self, observation): + """ + Processes both image and policy state observations from IsaacLab Arena. 
+ """ + processed_obs = {} + + if f"{OBS_STR}.camera_obs" in observation: + camera_obs = observation[f"{OBS_STR}.camera_obs"] + + for cam_name, img in camera_obs.items(): + if cam_name not in self.camera_keys: + continue + + img = img.permute(0, 3, 1, 2).contiguous() + if img.dtype == torch.uint8: + img = img.float() / 255.0 + elif img.dtype != torch.float32: + img = img.float() + + processed_obs[f"{OBS_IMAGES}.{cam_name}"] = img + + # Process policy state -> observation.state + if f"{OBS_STR}.policy" in observation: + policy_obs = observation[f"{OBS_STR}.policy"] + + # Collect state components in order + state_components = [] + for key in self.state_keys: + if key in policy_obs: + component = policy_obs[key] + # Flatten extra dims: (B, N, M) -> (B, N*M) + if component.dim() > 2: + batch_size = component.shape[0] + component = component.view(batch_size, -1) + state_components.append(component) + + if state_components: + state = torch.cat(state_components, dim=-1) + state = state.float() + processed_obs[OBS_STATE] = state + + return processed_obs + + def transform_features( + self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]] + ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]: + """Not used for policy evaluation.""" + return features + + def observation(self, observation): + return self._process_observation(observation) diff --git a/src/lerobot/processor/normalize_processor.py b/src/lerobot/processor/normalize_processor.py index 368c9b270..4769b91ac 100644 --- a/src/lerobot/processor/normalize_processor.py +++ b/src/lerobot/processor/normalize_processor.py @@ -30,7 +30,7 @@ from lerobot.utils.constants import ACTION from .converters import from_tensor_to_numpy, to_tensor from .core import EnvTransition, PolicyAction, TransitionKey -from .pipeline import PolicyProcessorPipeline, ProcessorStep, ProcessorStepRegistry +from .pipeline import PolicyProcessorPipeline, ProcessorStep, ProcessorStepRegistry, RobotObservation @dataclass @@ -239,7 +239,7 @@ class _NormalizationMixin: config["normalize_observation_keys"] = sorted(self.normalize_observation_keys) return config - def _normalize_observation(self, observation: dict[str, Any], inverse: bool) -> dict[str, Tensor]: + def _normalize_observation(self, observation: RobotObservation, inverse: bool) -> dict[str, Tensor]: """ Applies (un)normalization to all relevant features in an observation dictionary. diff --git a/src/lerobot/processor/pipeline.py b/src/lerobot/processor/pipeline.py index e14d8b0b9..97ec716ff 100644 --- a/src/lerobot/processor/pipeline.py +++ b/src/lerobot/processor/pipeline.py @@ -49,7 +49,7 @@ from lerobot.configs.types import PipelineFeatureType, PolicyFeature from lerobot.utils.hub import HubMixin from .converters import batch_to_transition, create_transition, transition_to_batch -from .core import EnvAction, EnvTransition, PolicyAction, RobotAction, TransitionKey +from .core import EnvAction, EnvTransition, PolicyAction, RobotAction, RobotObservation, TransitionKey # Generic type variables for pipeline input and output. TInput = TypeVar("TInput") @@ -1337,7 +1337,7 @@ class DataProcessorPipeline(HubMixin, Generic[TInput, TOutput]): return features # Convenience methods for processing individual parts of a transition. - def process_observation(self, observation: dict[str, Any]) -> dict[str, Any]: + def process_observation(self, observation: RobotObservation) -> RobotObservation: """Processes only the observation part of a transition through the pipeline. 
Args:
@@ -1440,7 +1440,7 @@ class ObservationProcessorStep(ProcessorStep, ABC):
    """An abstract `ProcessorStep` that specifically targets the observation in a transition."""

    @abstractmethod
-    def observation(self, observation: dict[str, Any]) -> dict[str, Any]:
+    def observation(self, observation: RobotObservation) -> RobotObservation:
        """Processes an observation dictionary. Subclasses must implement this method.

        Args:
diff --git a/src/lerobot/processor/tokenizer_processor.py b/src/lerobot/processor/tokenizer_processor.py
index 2ef89c107..5cd1bebb0 100644
--- a/src/lerobot/processor/tokenizer_processor.py
+++ b/src/lerobot/processor/tokenizer_processor.py
@@ -23,22 +23,29 @@ token IDs and attention masks, which are then added to the observation dictionar

 from __future__ import annotations

+import logging
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any

 import torch

 from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature
-from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
+from lerobot.utils.constants import (
+    ACTION_TOKEN_MASK,
+    ACTION_TOKENS,
+    OBS_LANGUAGE_ATTENTION_MASK,
+    OBS_LANGUAGE_TOKENS,
+)
 from lerobot.utils.import_utils import _transformers_available

-from .core import EnvTransition, TransitionKey
-from .pipeline import ObservationProcessorStep, ProcessorStepRegistry
+from .core import EnvTransition, RobotObservation, TransitionKey
+from .pipeline import ActionProcessorStep, ObservationProcessorStep, ProcessorStepRegistry

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _transformers_available:
-    from transformers import AutoTokenizer
+    from transformers import AutoProcessor, AutoTokenizer
 else:
+    AutoProcessor = None
     AutoTokenizer = None

@@ -132,7 +139,7 @@ class TokenizerProcessorStep(ObservationProcessorStep):

         return None

-    def observation(self, observation: dict[str, Any]) -> dict[str, Any]:
+    def observation(self, observation: RobotObservation) -> RobotObservation:
        """
        Tokenizes the task description and adds it to the observation dictionary.
@@ -268,3 +275,256 @@ class TokenizerProcessorStep(ObservationProcessorStep):
         )

         return features
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="action_tokenizer_processor")
+class ActionTokenizerProcessorStep(ActionProcessorStep):
+    """
+    Processor step to tokenize action data using a fast action tokenizer.
+
+    This step takes action tensors from an `EnvTransition`, tokenizes them using
+    a Hugging Face `transformers` AutoProcessor (such as the Physical Intelligence "fast" tokenizer),
+    and returns the tokenized action.
+
+    Requires the `transformers` library to be installed.
+
+    Attributes:
+        action_tokenizer_name: The name of a pretrained processor from the Hugging Face Hub (e.g., "physical-intelligence/fast").
+        action_tokenizer_input_object: A pre-initialized processor/tokenizer object. If provided, `action_tokenizer_name` is ignored.
+        trust_remote_code: Whether to trust remote code when loading the tokenizer (required for some tokenizers).
+        action_tokenizer: The internal tokenizer/processor instance, loaded during initialization.
+        paligemma_tokenizer_name: The name of a pretrained PaliGemma tokenizer from the Hugging Face Hub (e.g., "google/paligemma-3b-pt-224").
+    """
+
+    action_tokenizer_name: str | None = None
+    action_tokenizer_input_object: Any | None = None
+    trust_remote_code: bool = True
+    max_action_tokens: int = 256
+    fast_skip_tokens: int = 128
+    paligemma_tokenizer_name: str = "google/paligemma-3b-pt-224"
+    # Internal tokenizer instance (not part of the config)
+    action_tokenizer: Any = field(default=None, init=False, repr=False)
+    _paligemma_tokenizer: Any = field(default=None, init=False, repr=False)
+
+    def __post_init__(self):
+        """
+        Initializes the action tokenizer after the dataclass is created.
+
+        It checks for the availability of the `transformers` library and loads the tokenizer
+        either from a provided object or by name from the Hugging Face Hub.
+
+        Raises:
+            ImportError: If the `transformers` library is not installed.
+            ValueError: If neither `action_tokenizer_input_object` nor `action_tokenizer_name` is provided.
+        """
+        if not _transformers_available:
+            raise ImportError(
+                "The 'transformers' library is not installed. "
+                "Please install it with `pip install 'lerobot[transformers-dep]'` to use ActionTokenizerProcessorStep."
+            )
+
+        if self.action_tokenizer_input_object is not None:
+            self.action_tokenizer = self.action_tokenizer_input_object
+
+        elif self.action_tokenizer_name is not None:
+            if AutoProcessor is None:
+                raise ImportError("AutoProcessor is not available")
+            self.action_tokenizer = AutoProcessor.from_pretrained(
+                self.action_tokenizer_name, trust_remote_code=self.trust_remote_code
+            )
+        else:
+            raise ValueError(
+                "Either 'action_tokenizer_input_object' or 'action_tokenizer_name' must be provided. "
+                "Pass a tokenizer object directly or a tokenizer name to auto-load."
+            )
+
+        self._paligemma_tokenizer = AutoTokenizer.from_pretrained(
+            self.paligemma_tokenizer_name,
+            trust_remote_code=self.trust_remote_code,
+            add_eos_token=True,
+            add_bos_token=False,
+        )
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """
+        Applies action tokenization to the transition.
+
+        This overrides the base class to handle both tokens and mask.
+
+        Args:
+            transition: The input transition with action data.
+
+        Returns:
+            The processed transition with tokenized actions and mask in complementary data.
+        """
+        self._current_transition = transition.copy()
+        new_transition = self._current_transition
+
+        action = new_transition.get(TransitionKey.ACTION)
+        if action is None:
+            # During inference, no action is available, skip tokenization
+            return new_transition
+
+        # Tokenize and get both tokens and mask
+        tokens, mask = self._tokenize_action(action)
+
+        # Store mask in complementary data
+        complementary_data = new_transition.get(TransitionKey.COMPLEMENTARY_DATA, {})
+        if complementary_data is None:
+            complementary_data = {}
+        complementary_data[ACTION_TOKEN_MASK] = mask
+        complementary_data[ACTION_TOKENS] = tokens
+        new_transition[TransitionKey.COMPLEMENTARY_DATA] = complementary_data
+        return new_transition
+
+    def _act_tokens_to_paligemma_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
+        """
+        Converts action tokens to PaliGemma tokens.
+        """
+        return self._paligemma_tokenizer.vocab_size - 1 - self.fast_skip_tokens - tokens
+
+    def _tokenize_action(self, action: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Tokenizes the action tensor and creates a mask.
+
+        Args:
+            action: The input action tensor to tokenize. 
Shape: (B, H, action_dim) or (H, action_dim,) + + Returns: + A tuple of (tokens, mask) where: + - tokens: Tensor of token IDs with shape (B, max_action_tokens) + - mask: Boolean mask with shape (B, max_action_tokens), True for real tokens, False for padding + """ + if action is None: + raise ValueError("Action cannot be None") + + # Get the device and dtype of the input action + device = action.device if isinstance(action, torch.Tensor) else None + + # Handle single sample (add batch dimension) + single_sample = action.dim() == 1 + if single_sample: + action = action.unsqueeze(0) + + batch_size = action.shape[0] + + # Tokenize the action batch + # The fast tokenizer expects action data and returns token IDs + tokens_list = [] + masks_list = [] + + for i in range(batch_size): + # Tokenize single action (move to CPU first as tokenizer uses scipy which requires numpy) + action_cpu = action[i : i + 1].cpu() + tokens = self.action_tokenizer(action_cpu) + + # Convert to numpy array if it's a list + if isinstance(tokens, list) or not isinstance(tokens, torch.Tensor): + tokens = torch.tensor(tokens, dtype=torch.long, device=action.device) + else: + # Move tokens back to the same device as input action + tokens = tokens.to(device=action.device) + + # Flatten to 1D if needed + if tokens.dim() > 1: + tokens = tokens.flatten() + + bos_id = self._paligemma_tokenizer.bos_token_id + # add bos + tokens = torch.cat( + [ + torch.tensor([bos_id], device=action.device), + torch.tensor( + self._paligemma_tokenizer.encode("Action: ", add_special_tokens=False), + device=action.device, + ), + self._act_tokens_to_paligemma_tokens(tokens), + torch.tensor(self._paligemma_tokenizer.encode("|"), device=action.device), + ] + ) + + # Truncate or pad to max_action_tokens + if len(tokens) > self.max_action_tokens: + logging.warning( + f"Token length ({len(tokens)}) exceeds max length ({self.max_action_tokens}), truncating. " + "Consider increasing the `max_action_tokens` in your model config if this happens frequently." + ) + tokens = tokens[: self.max_action_tokens] + mask = torch.ones(self.max_action_tokens, dtype=torch.bool, device=action.device) + else: + mask = torch.cat( + [ + torch.ones(len(tokens), dtype=torch.bool, device=action.device), + torch.zeros( + self.max_action_tokens - len(tokens), dtype=torch.bool, device=action.device + ), + ] + ) + # Pad tokens with zeros + tokens = torch.nn.functional.pad(tokens, (0, self.max_action_tokens - len(tokens)), value=0) + + tokens_list.append(tokens) + masks_list.append(mask) + + # Stack into batched tensors + tokens_batch = torch.stack(tokens_list, dim=0) # (B, max_action_tokens) + masks_batch = torch.stack(masks_list, dim=0) # (B, max_action_tokens) + + # Remove batch dimension if input was single sample + if single_sample: + tokens_batch = tokens_batch.squeeze(0) + masks_batch = masks_batch.squeeze(0) + + # Move to the same device as the input + if device is not None: + tokens_batch = tokens_batch.to(device) + masks_batch = masks_batch.to(device) + + return tokens_batch, masks_batch + + def action(self, action: torch.Tensor) -> torch.Tensor: + """ + This method is not used since we override __call__. + Required by ActionProcessorStep ABC. + """ + tokens, _ = self._tokenize_action(action) + return tokens + + def get_config(self) -> dict[str, Any]: + """ + Returns the serializable configuration of the processor. + + Note: The tokenizer object itself is not serialized. If the processor was initialized + with a tokenizer name, that name will be included in the config. 
+ + Returns: + A dictionary with the processor's configuration parameters. + """ + config = { + "trust_remote_code": self.trust_remote_code, + "max_action_tokens": self.max_action_tokens, + } + + # Only save tokenizer_name if it was used to create the tokenizer + if self.action_tokenizer_name is not None and self.action_tokenizer_input_object is None: + config["action_tokenizer_name"] = self.action_tokenizer_name + + return config + + def transform_features( + self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]] + ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]: + """ + Updates feature definitions to reflect tokenized actions. + + This updates the policy features dictionary to indicate that the action + has been tokenized into a sequence of token IDs with shape (max_action_tokens,). + + Args: + features: The dictionary of existing policy features. + + Returns: + The updated dictionary of policy features. + """ + return features diff --git a/src/lerobot/rl/actor.py b/src/lerobot/rl/actor.py index 13fd66507..7427633d2 100644 --- a/src/lerobot/rl/actor.py +++ b/src/lerobot/rl/actor.py @@ -65,8 +65,8 @@ from lerobot.policies.sac.modeling_sac import SACPolicy from lerobot.processor import TransitionKey from lerobot.rl.process import ProcessSignalHandler from lerobot.rl.queue import get_last_item_from_queue -from lerobot.robots import so100_follower # noqa: F401 -from lerobot.teleoperators import gamepad, so101_leader # noqa: F401 +from lerobot.robots import so_follower # noqa: F401 +from lerobot.teleoperators import gamepad, so_leader # noqa: F401 from lerobot.teleoperators.utils import TeleopEvents from lerobot.transport import services_pb2, services_pb2_grpc from lerobot.transport.utils import ( @@ -398,7 +398,7 @@ def act_with_policy( if cfg.env.fps is not None: dt_time = time.perf_counter() - start_time - precise_sleep(1 / cfg.env.fps - dt_time) + precise_sleep(max(1 / cfg.env.fps - dt_time, 0.0)) # Communication Functions - Group all gRPC/messaging functions diff --git a/src/lerobot/rl/eval_policy.py b/src/lerobot/rl/eval_policy.py index 16bb64a73..fb2504f2a 100644 --- a/src/lerobot/rl/eval_policy.py +++ b/src/lerobot/rl/eval_policy.py @@ -23,11 +23,11 @@ from lerobot.policies.factory import make_policy from lerobot.robots import ( # noqa: F401 RobotConfig, make_robot_from_config, - so100_follower, + so_follower, ) from lerobot.teleoperators import ( gamepad, # noqa: F401 - so101_leader, # noqa: F401 + so_leader, # noqa: F401 ) from .gym_manipulator import make_robot_env diff --git a/src/lerobot/rl/gym_manipulator.py b/src/lerobot/rl/gym_manipulator.py index ad1fdf55f..3d58ae18f 100644 --- a/src/lerobot/rl/gym_manipulator.py +++ b/src/lerobot/rl/gym_manipulator.py @@ -38,13 +38,12 @@ from lerobot.processor import ( GripperPenaltyProcessorStep, ImageCropResizeProcessorStep, InterventionActionProcessorStep, - JointVelocityProcessorStep, MapDeltaActionToRobotActionStep, MapTensorToDeltaActionDictStep, - MotorCurrentProcessorStep, Numpy2TorchActionProcessorStep, RewardClassifierProcessorStep, RobotActionToPolicyActionProcessorStep, + RobotObservation, TimeLimitProcessorStep, Torch2NumpyActionProcessorStep, TransitionKey, @@ -55,10 +54,10 @@ from lerobot.processor.converters import identity_transition from lerobot.robots import ( # noqa: F401 RobotConfig, make_robot_from_config, - so100_follower, + so_follower, ) from lerobot.robots.robot import Robot -from lerobot.robots.so100_follower.robot_kinematic_processor import ( +from 
lerobot.robots.so_follower.robot_kinematic_processor import ( EEBoundsAndSafety, EEReferenceAndDelta, ForwardKinematicsJointsToEEObservation, @@ -69,7 +68,7 @@ from lerobot.teleoperators import ( gamepad, # noqa: F401 keyboard, # noqa: F401 make_teleoperator_from_config, - so101_leader, # noqa: F401 + so_leader, # noqa: F401 ) from lerobot.teleoperators.teleoperator import Teleoperator from lerobot.teleoperators.utils import TeleopEvents @@ -77,6 +76,8 @@ from lerobot.utils.constants import ACTION, DONE, OBS_IMAGES, OBS_STATE, REWARD from lerobot.utils.robot_utils import precise_sleep from lerobot.utils.utils import log_say +from .joint_observations_processor import JointVelocityProcessorStep, MotorCurrentProcessorStep + logging.basicConfig(level=logging.INFO) @@ -163,7 +164,7 @@ class RobotEnv(gym.Env): self._setup_spaces() - def _get_observation(self) -> dict[str, Any]: + def _get_observation(self) -> RobotObservation: """Get current robot observation including joint positions and camera images.""" obs_dict = self.robot.get_observation() raw_joint_joint_position = {f"{name}.pos": obs_dict[f"{name}.pos"] for name in self._joint_names} @@ -220,7 +221,7 @@ class RobotEnv(gym.Env): def reset( self, *, seed: int | None = None, options: dict[str, Any] | None = None - ) -> tuple[dict[str, Any], dict[str, Any]]: + ) -> tuple[RobotObservation, dict[str, Any]]: """Reset environment to initial state. Args: @@ -238,7 +239,7 @@ class RobotEnv(gym.Env): reset_follower_position(self.robot, np.array(self.reset_pose)) log_say("Reset the environment done.", play_sounds=True) - precise_sleep(self.reset_time_s - (time.perf_counter() - start_time)) + precise_sleep(max(self.reset_time_s - (time.perf_counter() - start_time), 0.0)) super().reset(seed=seed, options=options) @@ -249,7 +250,7 @@ class RobotEnv(gym.Env): self._raw_joint_positions = {f"{key}.pos": obs[f"{key}.pos"] for key in self._joint_names} return obs, {TeleopEvents.IS_INTERVENTION: False} - def step(self, action) -> tuple[dict[str, np.ndarray], float, bool, bool, dict[str, Any]]: + def step(self, action) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: """Execute one environment step with given action.""" joint_targets_dict = {f"{key}.pos": action[i] for i, key in enumerate(self.robot.bus.motors.keys())} @@ -713,7 +714,7 @@ def control_loop( transition = env_processor(transition) # Maintain fps timing - precise_sleep(dt - (time.perf_counter() - step_start_time)) + precise_sleep(max(dt - (time.perf_counter() - step_start_time), 0.0)) if dataset is not None and cfg.dataset.push_to_hub: logging.info("Pushing dataset to hub") @@ -745,7 +746,7 @@ def replay_trajectory( ) transition = action_processor(transition) env.step(transition[TransitionKey.ACTION]) - precise_sleep(1 / cfg.env.fps - (time.perf_counter() - start_time)) + precise_sleep(max(1 / cfg.env.fps - (time.perf_counter() - start_time), 0.0)) @parser.wrap() diff --git a/src/lerobot/processor/joint_observations_processor.py b/src/lerobot/rl/joint_observations_processor.py similarity index 100% rename from src/lerobot/processor/joint_observations_processor.py rename to src/lerobot/rl/joint_observations_processor.py diff --git a/src/lerobot/rl/learner.py b/src/lerobot/rl/learner.py index d9758d3a3..abc5c9504 100644 --- a/src/lerobot/rl/learner.py +++ b/src/lerobot/rl/learner.py @@ -69,8 +69,8 @@ from lerobot.policies.sac.modeling_sac import SACPolicy from lerobot.rl.buffer import ReplayBuffer, concatenate_batch_transitions from lerobot.rl.process import ProcessSignalHandler from 
lerobot.rl.wandb_utils import WandBLogger -from lerobot.robots import so100_follower # noqa: F401 -from lerobot.teleoperators import gamepad, so101_leader # noqa: F401 +from lerobot.robots import so_follower # noqa: F401 +from lerobot.teleoperators import gamepad, so_leader # noqa: F401 from lerobot.teleoperators.utils import TeleopEvents from lerobot.transport import services_pb2_grpc from lerobot.transport.utils import ( diff --git a/src/lerobot/rl/wandb_utils.py b/src/lerobot/rl/wandb_utils.py index 1537b3783..7b7f8a57b 100644 --- a/src/lerobot/rl/wandb_utils.py +++ b/src/lerobot/rl/wandb_utils.py @@ -112,7 +112,32 @@ class WandBLogger: artifact_name = f"{self._group}-{step_id}" artifact_name = get_safe_wandb_artifact_name(artifact_name) artifact = self._wandb.Artifact(artifact_name, type="model") - artifact.add_file(checkpoint_dir / PRETRAINED_MODEL_DIR / SAFETENSORS_SINGLE_FILE) + pretrained_model_dir = checkpoint_dir / PRETRAINED_MODEL_DIR + + # Check if this is a PEFT model (has adapter files instead of model.safetensors) + adapter_model_file = pretrained_model_dir / "adapter_model.safetensors" + standard_model_file = pretrained_model_dir / SAFETENSORS_SINGLE_FILE + + if adapter_model_file.exists(): + # PEFT model: add adapter files and configs + artifact.add_file(adapter_model_file) + adapter_config_file = pretrained_model_dir / "adapter_config.json" + if adapter_config_file.exists(): + artifact.add_file(adapter_config_file) + # Also add the policy config which is needed for loading + config_file = pretrained_model_dir / "config.json" + if config_file.exists(): + artifact.add_file(config_file) + elif standard_model_file.exists(): + # Standard model: add the single safetensors file + artifact.add_file(standard_model_file) + else: + logging.warning( + f"No {SAFETENSORS_SINGLE_FILE} or adapter_model.safetensors found in {pretrained_model_dir}. " + "Skipping model artifact upload to WandB." + ) + return + self._wandb.log_artifact(artifact) def log_dict( diff --git a/src/lerobot/robots/bi_so100_follower/config_bi_so100_follower.py b/src/lerobot/robots/bi_so100_follower/config_bi_so100_follower.py deleted file mode 100644 index 5806d7415..000000000 --- a/src/lerobot/robots/bi_so100_follower/config_bi_so100_follower.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass, field - -from lerobot.cameras import CameraConfig - -from ..config import RobotConfig - - -@RobotConfig.register_subclass("bi_so100_follower") -@dataclass -class BiSO100FollowerConfig(RobotConfig): - left_arm_port: str - right_arm_port: str - - # Optional - left_arm_disable_torque_on_disconnect: bool = True - left_arm_max_relative_target: float | dict[str, float] | None = None - left_arm_use_degrees: bool = False - right_arm_disable_torque_on_disconnect: bool = True - right_arm_max_relative_target: float | dict[str, float] | None = None - right_arm_use_degrees: bool = False - - # cameras (shared between both arms) - cameras: dict[str, CameraConfig] = field(default_factory=dict) diff --git a/src/lerobot/teleoperators/so101_leader/__init__.py b/src/lerobot/robots/bi_so_follower/__init__.py similarity index 78% rename from src/lerobot/teleoperators/so101_leader/__init__.py rename to src/lerobot/robots/bi_so_follower/__init__.py index 11e277c91..f631a14db 100644 --- a/src/lerobot/teleoperators/so101_leader/__init__.py +++ b/src/lerobot/robots/bi_so_follower/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,5 +14,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .config_so101_leader import SO101LeaderConfig -from .so101_leader import SO101Leader +from .bi_so_follower import BiSOFollower +from .config_bi_so_follower import BiSOFollowerConfig diff --git a/src/lerobot/robots/bi_so100_follower/bi_so100_follower.py b/src/lerobot/robots/bi_so_follower/bi_so_follower.py similarity index 50% rename from src/lerobot/robots/bi_so100_follower/bi_so100_follower.py rename to src/lerobot/robots/bi_so_follower/bi_so_follower.py index 7992b79fd..09f849772 100644 --- a/src/lerobot/robots/bi_so100_follower/bi_so100_follower.py +++ b/src/lerobot/robots/bi_so_follower/bi_so_follower.py @@ -15,67 +15,73 @@ # limitations under the License. import logging -import time from functools import cached_property -from typing import Any -from lerobot.cameras.utils import make_cameras_from_configs -from lerobot.robots.so100_follower import SO100Follower -from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig +from lerobot.processor import RobotAction, RobotObservation +from lerobot.robots.so_follower import SOFollower, SOFollowerRobotConfig from ..robot import Robot -from .config_bi_so100_follower import BiSO100FollowerConfig +from .config_bi_so_follower import BiSOFollowerConfig logger = logging.getLogger(__name__) -class BiSO100Follower(Robot): +class BiSOFollower(Robot): """ - [Bimanual SO-100 Follower Arms](https://github.com/TheRobotStudio/SO-ARM100) designed by TheRobotStudio - This bimanual robot can also be easily adapted to use SO-101 follower arms, just replace the SO100Follower class with SO101Follower and SO100FollowerConfig with SO101FollowerConfig. 
+ [Bimanual SO Follower Arms](https://github.com/TheRobotStudio/SO-ARM100) designed by TheRobotStudio """ - config_class = BiSO100FollowerConfig - name = "bi_so100_follower" + config_class = BiSOFollowerConfig + name = "bi_so_follower" - def __init__(self, config: BiSO100FollowerConfig): + def __init__(self, config: BiSOFollowerConfig): super().__init__(config) self.config = config - left_arm_config = SO100FollowerConfig( + left_arm_config = SOFollowerRobotConfig( id=f"{config.id}_left" if config.id else None, calibration_dir=config.calibration_dir, - port=config.left_arm_port, - disable_torque_on_disconnect=config.left_arm_disable_torque_on_disconnect, - max_relative_target=config.left_arm_max_relative_target, - use_degrees=config.left_arm_use_degrees, - cameras={}, + port=config.left_arm_config.port, + disable_torque_on_disconnect=config.left_arm_config.disable_torque_on_disconnect, + max_relative_target=config.left_arm_config.max_relative_target, + use_degrees=config.left_arm_config.use_degrees, + cameras=config.left_arm_config.cameras, ) - right_arm_config = SO100FollowerConfig( + right_arm_config = SOFollowerRobotConfig( id=f"{config.id}_right" if config.id else None, calibration_dir=config.calibration_dir, - port=config.right_arm_port, - disable_torque_on_disconnect=config.right_arm_disable_torque_on_disconnect, - max_relative_target=config.right_arm_max_relative_target, - use_degrees=config.right_arm_use_degrees, - cameras={}, + port=config.right_arm_config.port, + disable_torque_on_disconnect=config.right_arm_config.disable_torque_on_disconnect, + max_relative_target=config.right_arm_config.max_relative_target, + use_degrees=config.right_arm_config.use_degrees, + cameras=config.right_arm_config.cameras, ) - self.left_arm = SO100Follower(left_arm_config) - self.right_arm = SO100Follower(right_arm_config) - self.cameras = make_cameras_from_configs(config.cameras) + self.left_arm = SOFollower(left_arm_config) + self.right_arm = SOFollower(right_arm_config) + + # Only for compatibility with other parts of the codebase that expect a `robot.cameras` attribute + self.cameras = {**self.left_arm.cameras, **self.right_arm.cameras} @property def _motors_ft(self) -> dict[str, type]: - return {f"left_{motor}.pos": float for motor in self.left_arm.bus.motors} | { - f"right_{motor}.pos": float for motor in self.right_arm.bus.motors + left_arm_motors_ft = self.left_arm._motors_ft + right_arm_motors_ft = self.right_arm._motors_ft + + return { + **{f"left_{k}": v for k, v in left_arm_motors_ft.items()}, + **{f"right_{k}": v for k, v in right_arm_motors_ft.items()}, } @property def _cameras_ft(self) -> dict[str, tuple]: + left_arm_cameras_ft = self.left_arm._cameras_ft + right_arm_cameras_ft = self.right_arm._cameras_ft + return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras + **{f"left_{k}": v for k, v in left_arm_cameras_ft.items()}, + **{f"right_{k}": v for k, v in right_arm_cameras_ft.items()}, } @cached_property @@ -88,19 +94,12 @@ class BiSO100Follower(Robot): @property def is_connected(self) -> bool: - return ( - self.left_arm.bus.is_connected - and self.right_arm.bus.is_connected - and all(cam.is_connected for cam in self.cameras.values()) - ) + return self.left_arm.is_connected and self.right_arm.is_connected def connect(self, calibrate: bool = True) -> None: self.left_arm.connect(calibrate) self.right_arm.connect(calibrate) - for cam in self.cameras.values(): - cam.connect() - @property def is_calibrated(self) -> bool: return 
self.left_arm.is_calibrated and self.right_arm.is_calibrated @@ -117,7 +116,7 @@ class BiSO100Follower(Robot): self.left_arm.setup_motors() self.right_arm.setup_motors() - def get_observation(self) -> dict[str, Any]: + def get_observation(self) -> RobotObservation: obs_dict = {} # Add "left_" prefix @@ -128,15 +127,9 @@ class BiSO100Follower(Robot): right_obs = self.right_arm.get_observation() obs_dict.update({f"right_{key}": value for key, value in right_obs.items()}) - for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.async_read() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") - return obs_dict - def send_action(self, action: dict[str, Any]) -> dict[str, Any]: + def send_action(self, action: RobotAction) -> RobotAction: # Remove "left_" prefix left_action = { key.removeprefix("left_"): value for key, value in action.items() if key.startswith("left_") @@ -146,18 +139,15 @@ class BiSO100Follower(Robot): key.removeprefix("right_"): value for key, value in action.items() if key.startswith("right_") } - send_action_left = self.left_arm.send_action(left_action) - send_action_right = self.right_arm.send_action(right_action) + sent_action_left = self.left_arm.send_action(left_action) + sent_action_right = self.right_arm.send_action(right_action) # Add prefixes back - prefixed_send_action_left = {f"left_{key}": value for key, value in send_action_left.items()} - prefixed_send_action_right = {f"right_{key}": value for key, value in send_action_right.items()} + prefixed_sent_action_left = {f"left_{key}": value for key, value in sent_action_left.items()} + prefixed_sent_action_right = {f"right_{key}": value for key, value in sent_action_right.items()} - return {**prefixed_send_action_left, **prefixed_send_action_right} + return {**prefixed_sent_action_left, **prefixed_sent_action_right} def disconnect(self): self.left_arm.disconnect() self.right_arm.disconnect() - - for cam in self.cameras.values(): - cam.disconnect() diff --git a/src/lerobot/teleoperators/bi_so100_leader/config_bi_so100_leader.py b/src/lerobot/robots/bi_so_follower/config_bi_so_follower.py similarity index 68% rename from src/lerobot/teleoperators/bi_so100_leader/config_bi_so100_leader.py rename to src/lerobot/robots/bi_so_follower/config_bi_so_follower.py index 117e09913..dca74fa2d 100644 --- a/src/lerobot/teleoperators/bi_so100_leader/config_bi_so100_leader.py +++ b/src/lerobot/robots/bi_so_follower/config_bi_so_follower.py @@ -16,11 +16,15 @@ from dataclasses import dataclass -from ..config import TeleoperatorConfig +from lerobot.robots.so_follower import SOFollowerConfig + +from ..config import RobotConfig -@TeleoperatorConfig.register_subclass("bi_so100_leader") +@RobotConfig.register_subclass("bi_so_follower") @dataclass -class BiSO100LeaderConfig(TeleoperatorConfig): - left_arm_port: str - right_arm_port: str +class BiSOFollowerConfig(RobotConfig): + """Configuration class for Bi SO Follower robots.""" + + left_arm_config: SOFollowerConfig + right_arm_config: SOFollowerConfig diff --git a/src/lerobot/robots/earthrover_mini_plus/robot_earthrover_mini_plus.py b/src/lerobot/robots/earthrover_mini_plus/robot_earthrover_mini_plus.py index 48cb09215..cdf6efde1 100644 --- a/src/lerobot/robots/earthrover_mini_plus/robot_earthrover_mini_plus.py +++ b/src/lerobot/robots/earthrover_mini_plus/robot_earthrover_mini_plus.py @@ -18,13 +18,14 @@ import base64 import logging from functools import cached_property -from typing import Any 
import cv2 import numpy as np import requests -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.processor import RobotAction, RobotObservation +from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected +from lerobot.utils.errors import DeviceNotConnectedError from ..robot import Robot from .config_earthrover_mini_plus import EarthRoverMiniPlusConfig @@ -99,6 +100,7 @@ class EarthRoverMiniPlus(Robot): """Check if robot is connected to SDK.""" return self._is_connected + @check_if_already_connected def connect(self, calibrate: bool = True) -> None: """Connect to robot via Frodobots SDK. @@ -109,8 +111,6 @@ class EarthRoverMiniPlus(Robot): DeviceAlreadyConnectedError: If robot is already connected DeviceNotConnectedError: If cannot connect to SDK server """ - if self._is_connected: - raise DeviceAlreadyConnectedError(f"{self.name} is already connected") # Verify SDK is running and accessible try: @@ -197,11 +197,12 @@ class EarthRoverMiniPlus(Robot): ACTION_ANGULAR_VEL: float, } - def get_observation(self) -> dict[str, Any]: + @check_if_not_connected + def get_observation(self) -> RobotObservation: """Get current robot observation from SDK. Returns: - dict: Observation containing: + RobotObservation: Observation containing: - front: Front camera image (480, 640, 3) in RGB format - rear: Rear camera image (480, 640, 3) in RGB format - linear.vel: Current speed (0-1, SDK reports only positive speeds) @@ -223,8 +224,6 @@ class EarthRoverMiniPlus(Robot): Robot telemetry is retrieved from /data endpoint. All SDK values are normalized to appropriate ranges for dataset recording. """ - if not self._is_connected: - raise DeviceNotConnectedError(f"{self.name} is not connected") observation = {} @@ -255,7 +254,8 @@ class EarthRoverMiniPlus(Robot): return observation - def send_action(self, action: dict[str, Any]) -> dict[str, Any]: + @check_if_not_connected + def send_action(self, action: RobotAction) -> RobotAction: """Send action to robot via SDK. Args: @@ -264,8 +264,7 @@ class EarthRoverMiniPlus(Robot): - angular.vel: Target angular velocity (-1 to 1) Returns: - dict: The action that was sent (matches action_features keys) - + RobotAction: The action that was sent (matches action_features keys) Raises: DeviceNotConnectedError: If robot is not connected @@ -273,8 +272,6 @@ class EarthRoverMiniPlus(Robot): Actions are sent to SDK via POST /control endpoint. SDK expects commands in range [-1, 1]. """ - if not self._is_connected: - raise DeviceNotConnectedError(f"{self.name} is not connected") # Extract action values and convert to float linear = float(action.get(ACTION_LINEAR_VEL, 0.0)) @@ -292,6 +289,7 @@ class EarthRoverMiniPlus(Robot): ACTION_ANGULAR_VEL: angular, } + @check_if_not_connected def disconnect(self) -> None: """Disconnect from robot. 
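
The manual `is_connected` guards deleted across these robot classes are replaced by the `check_if_already_connected` and `check_if_not_connected` decorators from `lerobot.utils.decorators`, whose implementation is not part of this diff. A minimal sketch of what such decorators could look like, assuming they key off the instance's `is_connected` property and reuse the existing error classes:

```python
import functools

from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError


def check_if_not_connected(func):
    """Raise DeviceNotConnectedError when a method runs on a disconnected device."""

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        if not self.is_connected:
            raise DeviceNotConnectedError(f"{self} is not connected.")
        return func(self, *args, **kwargs)

    return wrapper


def check_if_already_connected(func):
    """Raise DeviceAlreadyConnectedError when connect() runs on an already-connected device."""

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        if self.is_connected:
            raise DeviceAlreadyConnectedError(f"{self} is already connected.")
        return func(self, *args, **kwargs)

    return wrapper
```
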
@@ -300,8 +298,6 @@ class EarthRoverMiniPlus(Robot): Raises: DeviceNotConnectedError: If robot is not connected """ - if not self._is_connected: - raise DeviceNotConnectedError(f"{self.name} is not connected") # Stop the robot before disconnecting try: diff --git a/src/lerobot/robots/hope_jr/hope_jr_arm.py b/src/lerobot/robots/hope_jr/hope_jr_arm.py index 220a29f8c..5fd9c4d1d 100644 --- a/src/lerobot/robots/hope_jr/hope_jr_arm.py +++ b/src/lerobot/robots/hope_jr/hope_jr_arm.py @@ -17,7 +17,6 @@ import logging import time from functools import cached_property -from typing import Any from lerobot.cameras.utils import make_cameras_from_configs from lerobot.motors import Motor, MotorNormMode @@ -25,7 +24,8 @@ from lerobot.motors.calibration_gui import RangeFinderGUI from lerobot.motors.feetech import ( FeetechMotorsBus, ) -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.processor import RobotAction, RobotObservation +from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected from ..robot import Robot from ..utils import ensure_safe_goal_position @@ -82,13 +82,12 @@ class HopeJrArm(Robot): def is_connected(self) -> bool: return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values()) + @check_if_already_connected def connect(self, calibrate: bool = True) -> None: """ We assume that at connection time, arm is in a rest position, and torque can be safely disabled to run calibration. """ - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") self.bus.connect(handshake=False) if not self.is_calibrated and calibrate: @@ -128,10 +127,8 @@ class HopeJrArm(Robot): self.bus.setup_motor(motor) print(f"'{motor}' motor id set to {self.bus.motors[motor].id}") - def get_observation(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def get_observation(self) -> RobotObservation: # Read arm position start = time.perf_counter() obs_dict = self.bus.sync_read("Present_Position", self.other_motors) @@ -149,10 +146,8 @@ class HopeJrArm(Robot): return obs_dict - def send_action(self, action: dict[str, Any]) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def send_action(self, action: RobotAction) -> RobotAction: goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")} # Cap goal position when too far away from present position. 
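
The goal capping mentioned in the comment above is delegated to `ensure_safe_goal_position` (imported from `..utils`, not shown in this diff). A simplified, self-contained illustration of the idea with hypothetical names: each per-motor goal is clamped so a single command never moves a joint more than `max_relative_target` away from its present position:

```python
def cap_goal_positions(
    goal_pos: dict[str, float],
    present_pos: dict[str, float],
    max_relative_target: float,
) -> dict[str, float]:
    """Clamp each goal so it stays within max_relative_target of the present position."""
    capped = {}
    for motor, goal in goal_pos.items():
        delta = goal - present_pos[motor]
        delta = max(-max_relative_target, min(max_relative_target, delta))
        capped[motor] = present_pos[motor] + delta
    return capped
```
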
@@ -165,10 +160,8 @@ class HopeJrArm(Robot): self.bus.sync_write("Goal_Position", goal_pos) return {f"{motor}.pos": val for motor, val in goal_pos.items()} + @check_if_not_connected def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self.bus.disconnect(self.config.disable_torque_on_disconnect) for cam in self.cameras.values(): cam.disconnect() diff --git a/src/lerobot/robots/hope_jr/hope_jr_hand.py b/src/lerobot/robots/hope_jr/hope_jr_hand.py index 9e960642b..1e5c72b72 100644 --- a/src/lerobot/robots/hope_jr/hope_jr_hand.py +++ b/src/lerobot/robots/hope_jr/hope_jr_hand.py @@ -17,7 +17,6 @@ import logging import time from functools import cached_property -from typing import Any from lerobot.cameras.utils import make_cameras_from_configs from lerobot.motors import Motor, MotorNormMode @@ -25,7 +24,8 @@ from lerobot.motors.calibration_gui import RangeFinderGUI from lerobot.motors.feetech import ( FeetechMotorsBus, ) -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.processor import RobotAction, RobotObservation +from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected from ..robot import Robot from .config_hope_jr import HopeJrHandConfig @@ -118,10 +118,8 @@ class HopeJrHand(Robot): def is_connected(self) -> bool: return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values()) + @check_if_already_connected def connect(self, calibrate: bool = True) -> None: - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") - self.bus.connect() if not self.is_calibrated and calibrate: self.calibrate() @@ -159,10 +157,8 @@ class HopeJrHand(Robot): self.bus.setup_motor(motor) print(f"'{motor}' motor id set to {self.bus.motors[motor].id}") - def get_observation(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def get_observation(self) -> RobotObservation: obs_dict = {} # Read hand position @@ -181,18 +177,14 @@ class HopeJrHand(Robot): return obs_dict - def send_action(self, action: dict[str, Any]) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def send_action(self, action: RobotAction) -> RobotAction: goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")} self.bus.sync_write("Goal_Position", goal_pos) return action + @check_if_not_connected def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self.bus.disconnect(self.config.disable_torque_on_disconnect) for cam in self.cameras.values(): cam.disconnect() diff --git a/src/lerobot/robots/koch_follower/koch_follower.py b/src/lerobot/robots/koch_follower/koch_follower.py index 41a57828b..fee0adba9 100644 --- a/src/lerobot/robots/koch_follower/koch_follower.py +++ b/src/lerobot/robots/koch_follower/koch_follower.py @@ -17,7 +17,6 @@ import logging import time from functools import cached_property -from typing import Any from lerobot.cameras.utils import make_cameras_from_configs from lerobot.motors import Motor, MotorCalibration, MotorNormMode @@ -25,7 +24,8 @@ from lerobot.motors.dynamixel import ( DynamixelMotorsBus, OperatingMode, ) -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.processor import RobotAction, RobotObservation 
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected from ..robot import Robot from ..utils import ensure_safe_goal_position @@ -84,13 +84,12 @@ class KochFollower(Robot): def is_connected(self) -> bool: return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values()) + @check_if_already_connected def connect(self, calibrate: bool = True) -> None: """ We assume that at connection time, arm is in a rest position, and torque can be safely disabled to run calibration. """ - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") self.bus.connect() if not self.is_calibrated and calibrate: @@ -182,10 +181,8 @@ class KochFollower(Robot): self.bus.setup_motor(motor) print(f"'{motor}' motor id set to {self.bus.motors[motor].id}") - def get_observation(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def get_observation(self) -> RobotObservation: # Read arm position start = time.perf_counter() obs_dict = self.bus.sync_read("Present_Position") @@ -202,7 +199,8 @@ class KochFollower(Robot): return obs_dict - def send_action(self, action: dict[str, float]) -> dict[str, float]: + @check_if_not_connected + def send_action(self, action: RobotAction) -> RobotAction: """Command arm to move to a target joint configuration. The relative action magnitude may be clipped depending on the configuration parameter @@ -210,13 +208,11 @@ class KochFollower(Robot): Thus, this function always returns the action actually sent. Args: - action (dict[str, float]): The goal positions for the motors. + action (RobotAction): The goal positions for the motors. Returns: - dict[str, float]: The action sent to the motors, potentially clipped. + RobotAction: The action sent to the motors, potentially clipped. 
""" - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")} @@ -231,10 +227,8 @@ class KochFollower(Robot): self.bus.sync_write("Goal_Position", goal_pos) return {f"{motor}.pos": val for motor, val in goal_pos.items()} + @check_if_not_connected def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self.bus.disconnect(self.config.disable_torque_on_disconnect) for cam in self.cameras.values(): cam.disconnect() diff --git a/src/lerobot/robots/lekiwi/lekiwi.py b/src/lerobot/robots/lekiwi/lekiwi.py index 86fe017d6..54848f49d 100644 --- a/src/lerobot/robots/lekiwi/lekiwi.py +++ b/src/lerobot/robots/lekiwi/lekiwi.py @@ -28,7 +28,8 @@ from lerobot.motors.feetech import ( FeetechMotorsBus, OperatingMode, ) -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.processor import RobotAction, RobotObservation +from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected from ..robot import Robot from ..utils import ensure_safe_goal_position @@ -108,10 +109,8 @@ class LeKiwi(Robot): def is_connected(self) -> bool: return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values()) + @check_if_already_connected def connect(self, calibrate: bool = True) -> None: - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") - self.bus.connect() if not self.is_calibrated and calibrate: logger.info( @@ -338,10 +337,8 @@ class LeKiwi(Robot): "theta.vel": theta, } # m/s and deg/s - def get_observation(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def get_observation(self) -> RobotObservation: # Read actuators position for arm and vel for base start = time.perf_counter() arm_pos = self.bus.sync_read("Present_Position", self.arm_motors) @@ -369,7 +366,8 @@ class LeKiwi(Robot): return obs_dict - def send_action(self, action: dict[str, Any]) -> dict[str, Any]: + @check_if_not_connected + def send_action(self, action: RobotAction) -> RobotAction: """Command lekiwi to move to a target joint configuration. The relative action magnitude may be clipped depending on the configuration parameter @@ -380,10 +378,8 @@ class LeKiwi(Robot): RobotDeviceNotConnectedError: if robot is not connected. Returns: - np.ndarray: the action sent to the motors, potentially clipped. + RobotAction: the action sent to the motors, potentially clipped. 
""" - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") arm_goal_pos = {k: v for k, v in action.items() if k.endswith(".pos")} base_goal_vel = {k: v for k, v in action.items() if k.endswith(".vel")} @@ -411,10 +407,8 @@ class LeKiwi(Robot): self.bus.sync_write("Goal_Velocity", dict.fromkeys(self.base_motors, 0), num_retry=5) logger.info("Base motors stopped") + @check_if_not_connected def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self.stop_base() self.bus.disconnect(self.config.disable_torque_on_disconnect) for cam in self.cameras.values(): diff --git a/src/lerobot/robots/lekiwi/lekiwi_client.py b/src/lerobot/robots/lekiwi/lekiwi_client.py index 19744e244..1d5ea64a6 100644 --- a/src/lerobot/robots/lekiwi/lekiwi_client.py +++ b/src/lerobot/robots/lekiwi/lekiwi_client.py @@ -18,13 +18,14 @@ import base64 import json import logging from functools import cached_property -from typing import Any import cv2 import numpy as np +from lerobot.processor import RobotAction, RobotObservation from lerobot.utils.constants import ACTION, OBS_STATE -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected +from lerobot.utils.errors import DeviceNotConnectedError from ..robot import Robot from .config_lekiwi import LeKiwiClientConfig @@ -112,14 +113,10 @@ class LeKiwiClient(Robot): def is_calibrated(self) -> bool: pass + @check_if_already_connected def connect(self) -> None: """Establishes ZMQ sockets with the remote mobile robot""" - if self._is_connected: - raise DeviceAlreadyConnectedError( - "LeKiwi Daemon is already connected. Do not run `robot.connect()` twice." - ) - zmq = self._zmq self.zmq_context = zmq.Context() self.zmq_cmd_socket = self.zmq_context.socket(zmq.PUSH) @@ -172,7 +169,7 @@ class LeKiwiClient(Robot): return last_msg - def _parse_observation_json(self, obs_string: str) -> dict[str, Any] | None: + def _parse_observation_json(self, obs_string: str) -> RobotObservation | None: """Parses the JSON observation string.""" try: return json.loads(obs_string) @@ -196,15 +193,15 @@ class LeKiwiClient(Robot): return None def _remote_state_from_obs( - self, observation: dict[str, Any] - ) -> tuple[dict[str, np.ndarray], dict[str, Any]]: + self, observation: RobotObservation + ) -> tuple[dict[str, np.ndarray], RobotObservation]: """Extracts frames, and state from the parsed observation.""" flat_state = {key: observation.get(key, 0.0) for key in self._state_order} state_vec = np.array([flat_state[key] for key in self._state_order], dtype=np.float32) - obs_dict: dict[str, Any] = {**flat_state, OBS_STATE: state_vec} + obs_dict: RobotObservation = {**flat_state, OBS_STATE: state_vec} # Decode images current_frames: dict[str, np.ndarray] = {} @@ -217,7 +214,7 @@ class LeKiwiClient(Robot): return current_frames, obs_dict - def _get_data(self) -> tuple[dict[str, np.ndarray], dict[str, Any], dict[str, Any]]: + def _get_data(self) -> tuple[dict[str, np.ndarray], RobotObservation]: """ Polls the video socket for the latest observation data. 
@@ -252,14 +249,13 @@ class LeKiwiClient(Robot):

         return new_frames, new_state

-    def get_observation(self) -> dict[str, Any]:
+    @check_if_not_connected
+    def get_observation(self) -> RobotObservation:
         """
         Capture observations from the remote robot: current follower arm positions,
         present wheel speeds (converted to body-frame velocities: x, y, theta),
         and a camera frame. Receives over ZMQ, translate to body-frame vel
         """
-        if not self._is_connected:
-            raise DeviceNotConnectedError("LeKiwiClient is not connected. You need to run `robot.connect()`.")

         frames, obs_dict = self._get_data()

@@ -307,22 +303,18 @@ class LeKiwiClient(Robot):
     def configure(self):
         pass

-    def send_action(self, action: dict[str, Any]) -> dict[str, Any]:
+    @check_if_not_connected
+    def send_action(self, action: RobotAction) -> RobotAction:
         """Command lekiwi to move to a target joint configuration. Translates to motor space + sends over ZMQ

         Args:
-            action (np.ndarray): array containing the goal positions for the motors.
-
+            action (RobotAction): dictionary containing the goal positions for the motors.
         Raises:
             RobotDeviceNotConnectedError: if robot is not connected.

         Returns:
-            np.ndarray: the action sent to the motors, potentially clipped.
+            RobotAction: the action sent to the motors, potentially clipped.
         """
-        if not self._is_connected:
-            raise DeviceNotConnectedError(
-                "ManipulatorRobot is not connected. You need to run `robot.connect()`."
-            )

         self.zmq_cmd_socket.send_string(json.dumps(action))  # action is in motor space

@@ -333,13 +325,10 @@ class LeKiwiClient(Robot):
             action_sent[ACTION] = actions
         return action_sent

+    @check_if_not_connected
     def disconnect(self):
         """Cleans ZMQ comms"""
-        if not self._is_connected:
-            raise DeviceNotConnectedError(
-                "LeKiwi is not connected. You need to run `robot.connect()` before disconnecting."
-            )

         self.zmq_observation_socket.close()
         self.zmq_cmd_socket.close()
         self.zmq_context.term()
diff --git a/src/lerobot/robots/omx_follower/omx_follower.py b/src/lerobot/robots/omx_follower/omx_follower.py
index 2dd851377..a171affbd 100644
--- a/src/lerobot/robots/omx_follower/omx_follower.py
+++ b/src/lerobot/robots/omx_follower/omx_follower.py
@@ -17,7 +17,6 @@
 import logging
 import time
 from functools import cached_property
-from typing import Any

 from lerobot.cameras.utils import make_cameras_from_configs
 from lerobot.motors import Motor, MotorCalibration, MotorNormMode
@@ -26,7 +25,8 @@ from lerobot.motors.dynamixel import (
     DynamixelMotorsBus,
     OperatingMode,
 )
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.processor import RobotAction, RobotObservation
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected

 from ..robot import Robot
 from ..utils import ensure_safe_goal_position
@@ -84,6 +84,7 @@ class OmxFollower(Robot):
     def is_connected(self) -> bool:
         return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values())

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
         """
         For OMX robots that come pre-calibrated:
@@ -91,8 +92,6 @@
         - This allows using pre-calibrated robots without manual calibration
         - If no calibration file exists, use factory default values (homing_offset=0, range_min=0, range_max=4095)
         """
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")

         self.bus.connect()
         if not self.is_calibrated and calibrate:
@@ -165,10 +164,8 @@ class OmxFollower(Robot):
             self.bus.setup_motor(motor)
             print(f"'{motor}' motor id set to {self.bus.motors[motor].id}")

-    def 
get_observation(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def get_observation(self) -> RobotObservation: # Read arm position start = time.perf_counter() obs_dict = self.bus.sync_read("Present_Position") @@ -185,7 +182,8 @@ class OmxFollower(Robot): return obs_dict - def send_action(self, action: dict[str, float]) -> dict[str, float]: + @check_if_not_connected + def send_action(self, action: RobotAction) -> RobotAction: """Command arm to move to a target joint configuration. The relative action magnitude may be clipped depending on the configuration parameter @@ -193,13 +191,11 @@ class OmxFollower(Robot): Thus, this function always returns the action actually sent. Args: - action (dict[str, float]): The goal positions for the motors. + action (RobotAction): The goal positions for the motors. Returns: - dict[str, float]: The action sent to the motors, potentially clipped. + RobotAction: The action sent to the motors, potentially clipped. """ - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")} @@ -214,10 +210,8 @@ class OmxFollower(Robot): self.bus.sync_write("Goal_Position", goal_pos) return {f"{motor}.pos": val for motor, val in goal_pos.items()} + @check_if_not_connected def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self.bus.disconnect(self.config.disable_torque_on_disconnect) for cam in self.cameras.values(): cam.disconnect() diff --git a/src/lerobot/robots/reachy2/configuration_reachy2.py b/src/lerobot/robots/reachy2/configuration_reachy2.py index aa25351c6..63293e675 100644 --- a/src/lerobot/robots/reachy2/configuration_reachy2.py +++ b/src/lerobot/robots/reachy2/configuration_reachy2.py @@ -30,6 +30,8 @@ class Reachy2RobotConfig(RobotConfig): # IP address of the Reachy 2 robot ip_address: str | None = "localhost" + # Port of the Reachy 2 robot + port: int = 50065 # If True, turn_off_smoothly() will be sent to the robot before disconnecting. disable_torque_on_disconnect: bool = False @@ -51,11 +53,16 @@ class Reachy2RobotConfig(RobotConfig): # Robot cameras # Set to True if you want to use the corresponding cameras in the observations. - # By default, only the teleop cameras are used. - with_left_teleop_camera: bool = True - with_right_teleop_camera: bool = True + # By default, no camera is used. + with_left_teleop_camera: bool = False + with_right_teleop_camera: bool = False with_torso_camera: bool = False + # Camera parameters + camera_width: int = 640 + camera_height: int = 480 + + # For cameras other than the 3 default Reachy 2 cameras. 
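+    # e.g. (illustrative, assuming the standard OpenCV camera config) an extra wrist camera:
+    #   cameras={"wrist": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=30)}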
 cameras: dict[str, CameraConfig] = field(default_factory=dict)

     def __post_init__(self) -> None:
@@ -65,9 +72,10 @@ class Reachy2RobotConfig(RobotConfig):
                 name="teleop",
                 image_type="left",
                 ip_address=self.ip_address,
-                fps=15,
-                width=640,
-                height=480,
+                port=self.port,
+                width=self.camera_width,
+                height=self.camera_height,
+                fps=30,  # Not configurable for Reachy 2 cameras
                 color_mode=ColorMode.RGB,
             )
         if self.with_right_teleop_camera:
@@ -75,9 +83,10 @@
                 name="teleop",
                 image_type="right",
                 ip_address=self.ip_address,
-                fps=15,
-                width=640,
-                height=480,
+                port=self.port,
+                width=self.camera_width,
+                height=self.camera_height,
+                fps=30,  # Not configurable for Reachy 2 cameras
                 color_mode=ColorMode.RGB,
             )
         if self.with_torso_camera:
@@ -85,9 +94,10 @@
                 name="depth",
                 image_type="rgb",
                 ip_address=self.ip_address,
-                fps=15,
-                width=640,
-                height=480,
+                port=self.port,
+                width=self.camera_width,
+                height=self.camera_height,
+                fps=30,  # Not configurable for Reachy 2 cameras
                 color_mode=ColorMode.RGB,
             )
diff --git a/src/lerobot/robots/reachy2/robot_reachy2.py b/src/lerobot/robots/reachy2/robot_reachy2.py
index ecc488a79..6f4eef56c 100644
--- a/src/lerobot/robots/reachy2/robot_reachy2.py
+++ b/src/lerobot/robots/reachy2/robot_reachy2.py
@@ -13,19 +13,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations

 import time
-from typing import Any
-
-import numpy as np
-from reachy2_sdk import ReachySDK
+from typing import TYPE_CHECKING, Any

 from lerobot.cameras.utils import make_cameras_from_configs
+from lerobot.processor import RobotAction, RobotObservation
+from lerobot.utils.import_utils import _reachy2_sdk_available

 from ..robot import Robot
 from ..utils import ensure_safe_goal_position
 from .configuration_reachy2 import Reachy2RobotConfig

+if TYPE_CHECKING or _reachy2_sdk_available:
+    from reachy2_sdk import ReachySDK
+else:
+    ReachySDK = None
+
 # {lerobot_keys: reachy2_sdk_keys}
 REACHY2_NECK_JOINTS = {
     "neck_yaw.pos": "head.neck.yaw",
@@ -165,8 +170,8 @@
         else:
             return {}

-    def get_observation(self) -> dict[str, np.ndarray]:
-        obs_dict: dict[str, Any] = {}
+    def get_observation(self) -> RobotObservation:
+        obs_dict: RobotObservation = {}

         # Read Reachy 2 state
         before_read_t = time.perf_counter()
@@ -179,7 +184,7 @@

         return obs_dict

-    def send_action(self, action: dict[str, Any]) -> dict[str, Any]:
+    def send_action(self, action: RobotAction) -> RobotAction:
         if self.reachy is not None:
             if not self.is_connected:
                 raise ConnectionError()
diff --git a/src/lerobot/robots/robot.py b/src/lerobot/robots/robot.py
index 5e88b915b..d165886b9 100644
--- a/src/lerobot/robots/robot.py
+++ b/src/lerobot/robots/robot.py
@@ -15,11 +15,11 @@
 import abc
 import builtins
 from pathlib import Path
-from typing import Any

 import draccus

 from lerobot.motors import MotorCalibration
+from lerobot.processor import RobotAction, RobotObservation
 from lerobot.utils.constants import HF_LEROBOT_CALIBRATION, ROBOTS

 from .config import RobotConfig
@@ -58,6 +58,32 @@ class Robot(abc.ABC):
     def __str__(self) -> str:
         return f"{self.id} {self.__class__.__name__}"

+    def __enter__(self):
+        """
+        Context manager entry.
+        Automatically connects to the robot.
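+
+        Example (illustrative sketch for any concrete `Robot` subclass; `cfg` is
+        a placeholder for a real robot config):
+            >>> with make_robot_from_config(cfg) as robot:  # connect() on entry
+            ...     obs = robot.get_observation()
+            >>> # disconnect() has run on exit, even if an error was raised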
+ """ + self.connect() + return self + + def __exit__(self, exc_type, exc_value, traceback) -> None: + """ + Context manager exit. + Automatically disconnects, ensuring resources are released even on error. + """ + self.disconnect() + + def __del__(self) -> None: + """ + Destructor safety net. + Attempts to disconnect if the object is garbage collected without cleanup. + """ + try: + if self.is_connected: + self.disconnect() + except Exception: # nosec B110 + pass + # TODO(aliberts): create a proper Feature class for this that links with datasets @property @abc.abstractmethod @@ -153,28 +179,28 @@ class Robot(abc.ABC): pass @abc.abstractmethod - def get_observation(self) -> dict[str, Any]: + def get_observation(self) -> RobotObservation: """ Retrieve the current observation from the robot. Returns: - dict[str, Any]: A flat dictionary representing the robot's current sensory state. Its structure + RobotObservation: A flat dictionary representing the robot's current sensory state. Its structure should match :pymeth:`observation_features`. """ pass @abc.abstractmethod - def send_action(self, action: dict[str, Any]) -> dict[str, Any]: + def send_action(self, action: RobotAction) -> RobotAction: """ Send an action command to the robot. Args: - action (dict[str, Any]): Dictionary representing the desired action. Its structure should match + action (RobotAction): Dictionary representing the desired action. Its structure should match :pymeth:`action_features`. Returns: - dict[str, Any]: The action actually sent to the motors potentially clipped or modified, e.g. by + RobotAction: The action actually sent to the motors potentially clipped or modified, e.g. by safety limits on velocity. """ pass diff --git a/src/lerobot/robots/so101_follower/config_so101_follower.py b/src/lerobot/robots/so101_follower/config_so101_follower.py deleted file mode 100644 index 03c3530c2..000000000 --- a/src/lerobot/robots/so101_follower/config_so101_follower.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field - -from lerobot.cameras import CameraConfig - -from ..config import RobotConfig - - -@RobotConfig.register_subclass("so101_follower") -@dataclass -class SO101FollowerConfig(RobotConfig): - # Port to connect to the arm - port: str - - disable_torque_on_disconnect: bool = True - - # `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes. - # Set this to a positive scalar to have the same value for all motors, or a dictionary that maps motor - # names to the max_relative_target value for that motor. 
- max_relative_target: float | dict[str, float] | None = None - - # cameras - cameras: dict[str, CameraConfig] = field(default_factory=dict) - - # Set to `True` for backward compatibility with previous policies/dataset - use_degrees: bool = False diff --git a/src/lerobot/robots/so101_follower/so101_follower.py b/src/lerobot/robots/so101_follower/so101_follower.py deleted file mode 100644 index acfd4bd11..000000000 --- a/src/lerobot/robots/so101_follower/so101_follower.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import time -from functools import cached_property -from typing import Any - -from lerobot.cameras.utils import make_cameras_from_configs -from lerobot.motors import Motor, MotorCalibration, MotorNormMode -from lerobot.motors.feetech import ( - FeetechMotorsBus, - OperatingMode, -) -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError - -from ..robot import Robot -from ..utils import ensure_safe_goal_position -from .config_so101_follower import SO101FollowerConfig - -logger = logging.getLogger(__name__) - - -class SO101Follower(Robot): - """ - SO-101 Follower Arm designed by TheRobotStudio and Hugging Face. - """ - - config_class = SO101FollowerConfig - name = "so101_follower" - - def __init__(self, config: SO101FollowerConfig): - super().__init__(config) - self.config = config - norm_mode_body = MotorNormMode.DEGREES if config.use_degrees else MotorNormMode.RANGE_M100_100 - self.bus = FeetechMotorsBus( - port=self.config.port, - motors={ - "shoulder_pan": Motor(1, "sts3215", norm_mode_body), - "shoulder_lift": Motor(2, "sts3215", norm_mode_body), - "elbow_flex": Motor(3, "sts3215", norm_mode_body), - "wrist_flex": Motor(4, "sts3215", norm_mode_body), - "wrist_roll": Motor(5, "sts3215", norm_mode_body), - "gripper": Motor(6, "sts3215", MotorNormMode.RANGE_0_100), - }, - calibration=self.calibration, - ) - self.cameras = make_cameras_from_configs(config.cameras) - - @property - def _motors_ft(self) -> dict[str, type]: - return {f"{motor}.pos": float for motor in self.bus.motors} - - @property - def _cameras_ft(self) -> dict[str, tuple]: - return { - cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras - } - - @cached_property - def observation_features(self) -> dict[str, type | tuple]: - return {**self._motors_ft, **self._cameras_ft} - - @cached_property - def action_features(self) -> dict[str, type]: - return self._motors_ft - - @property - def is_connected(self) -> bool: - return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values()) - - def connect(self, calibrate: bool = True) -> None: - """ - We assume that at connection time, arm is in a rest position, - and torque can be safely disabled to run calibration. 
- """ - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") - - self.bus.connect() - if not self.is_calibrated and calibrate: - logger.info( - "Mismatch between calibration values in the motor and the calibration file or no calibration file found" - ) - self.calibrate() - - for cam in self.cameras.values(): - cam.connect() - - self.configure() - logger.info(f"{self} connected.") - - @property - def is_calibrated(self) -> bool: - return self.bus.is_calibrated - - def calibrate(self) -> None: - if self.calibration: - # self.calibration is not empty here - user_input = input( - f"Press ENTER to use provided calibration file associated with the id {self.id}, or type 'c' and press ENTER to run calibration: " - ) - if user_input.strip().lower() != "c": - logger.info(f"Writing calibration file associated with the id {self.id} to the motors") - self.bus.write_calibration(self.calibration) - return - - logger.info(f"\nRunning calibration of {self}") - self.bus.disable_torque() - for motor in self.bus.motors: - self.bus.write("Operating_Mode", motor, OperatingMode.POSITION.value) - - input(f"Move {self} to the middle of its range of motion and press ENTER....") - homing_offsets = self.bus.set_half_turn_homings() - - print( - "Move all joints sequentially through their entire ranges " - "of motion.\nRecording positions. Press ENTER to stop..." - ) - range_mins, range_maxes = self.bus.record_ranges_of_motion() - - self.calibration = {} - for motor, m in self.bus.motors.items(): - self.calibration[motor] = MotorCalibration( - id=m.id, - drive_mode=0, - homing_offset=homing_offsets[motor], - range_min=range_mins[motor], - range_max=range_maxes[motor], - ) - - self.bus.write_calibration(self.calibration) - self._save_calibration() - print("Calibration saved to", self.calibration_fpath) - - def configure(self) -> None: - with self.bus.torque_disabled(): - self.bus.configure_motors() - for motor in self.bus.motors: - self.bus.write("Operating_Mode", motor, OperatingMode.POSITION.value) - # Set P_Coefficient to lower value to avoid shakiness (Default is 32) - self.bus.write("P_Coefficient", motor, 16) - # Set I_Coefficient and D_Coefficient to default value 0 and 32 - self.bus.write("I_Coefficient", motor, 0) - self.bus.write("D_Coefficient", motor, 32) - - if motor == "gripper": - self.bus.write( - "Max_Torque_Limit", motor, 500 - ) # 50% of the max torque limit to avoid burnout - self.bus.write("Protection_Current", motor, 250) # 50% of max current to avoid burnout - self.bus.write("Overload_Torque", motor, 25) # 25% torque when overloaded - - def setup_motors(self) -> None: - for motor in reversed(self.bus.motors): - input(f"Connect the controller board to the '{motor}' motor only and press enter.") - self.bus.setup_motor(motor) - print(f"'{motor}' motor id set to {self.bus.motors[motor].id}") - - def get_observation(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - # Read arm position - start = time.perf_counter() - obs_dict = self.bus.sync_read("Present_Position") - obs_dict = {f"{motor}.pos": val for motor, val in obs_dict.items()} - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read state: {dt_ms:.1f}ms") - - # Capture images from cameras - for cam_key, cam in self.cameras.items(): - start = time.perf_counter() - obs_dict[cam_key] = cam.async_read() - dt_ms = (time.perf_counter() - start) * 1e3 - logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms") - - return obs_dict - - def 
send_action(self, action: dict[str, Any]) -> dict[str, Any]: - """Command arm to move to a target joint configuration. - - The relative action magnitude may be clipped depending on the configuration parameter - `max_relative_target`. In this case, the action sent differs from original action. - Thus, this function always returns the action actually sent. - - Raises: - RobotDeviceNotConnectedError: if robot is not connected. - - Returns: - the action sent to the motors, potentially clipped. - """ - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")} - - # Cap goal position when too far away from present position. - # /!\ Slower fps expected due to reading from the follower. - if self.config.max_relative_target is not None: - present_pos = self.bus.sync_read("Present_Position") - goal_present_pos = {key: (g_pos, present_pos[key]) for key, g_pos in goal_pos.items()} - goal_pos = ensure_safe_goal_position(goal_present_pos, self.config.max_relative_target) - - # Send goal position to the arm - self.bus.sync_write("Goal_Position", goal_pos) - return {f"{motor}.pos": val for motor, val in goal_pos.items()} - - def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - - self.bus.disconnect(self.config.disable_torque_on_disconnect) - for cam in self.cameras.values(): - cam.disconnect() - - logger.info(f"{self} disconnected.") diff --git a/src/lerobot/teleoperators/bi_so100_leader/__init__.py b/src/lerobot/robots/so_follower/__init__.py similarity index 68% rename from src/lerobot/teleoperators/bi_so100_leader/__init__.py rename to src/lerobot/robots/so_follower/__init__.py index 34313a61e..eea2fcbdf 100644 --- a/src/lerobot/teleoperators/bi_so100_leader/__init__.py +++ b/src/lerobot/robots/so_follower/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,5 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .bi_so100_leader import BiSO100Leader -from .config_bi_so100_leader import BiSO100LeaderConfig +from .config_so_follower import ( + SO100FollowerConfig, + SO101FollowerConfig, + SOFollowerConfig, + SOFollowerRobotConfig, +) +from .so_follower import SO100Follower, SO101Follower, SOFollower diff --git a/src/lerobot/robots/so100_follower/config_so100_follower.py b/src/lerobot/robots/so_follower/config_so_follower.py similarity index 80% rename from src/lerobot/robots/so100_follower/config_so100_follower.py rename to src/lerobot/robots/so_follower/config_so_follower.py index 272b8c43f..e9ce27123 100644 --- a/src/lerobot/robots/so100_follower/config_so100_follower.py +++ b/src/lerobot/robots/so_follower/config_so_follower.py @@ -15,15 +15,17 @@ # limitations under the License. 
from dataclasses import dataclass, field +from typing import TypeAlias from lerobot.cameras import CameraConfig from ..config import RobotConfig -@RobotConfig.register_subclass("so100_follower") @dataclass -class SO100FollowerConfig(RobotConfig): +class SOFollowerConfig: + """Base configuration class for SO Follower robots.""" + # Port to connect to the arm port: str @@ -39,3 +41,14 @@ class SO100FollowerConfig(RobotConfig): # Set to `True` for backward compatibility with previous policies/dataset use_degrees: bool = False + + +@RobotConfig.register_subclass("so101_follower") +@RobotConfig.register_subclass("so100_follower") +@dataclass +class SOFollowerRobotConfig(RobotConfig, SOFollowerConfig): + pass + + +SO100FollowerConfig: TypeAlias = SOFollowerRobotConfig +SO101FollowerConfig: TypeAlias = SOFollowerRobotConfig diff --git a/src/lerobot/robots/so100_follower/robot_kinematic_processor.py b/src/lerobot/robots/so_follower/robot_kinematic_processor.py similarity index 99% rename from src/lerobot/robots/so100_follower/robot_kinematic_processor.py rename to src/lerobot/robots/so_follower/robot_kinematic_processor.py index 87e832db6..2aa60e12a 100644 --- a/src/lerobot/robots/so100_follower/robot_kinematic_processor.py +++ b/src/lerobot/robots/so_follower/robot_kinematic_processor.py @@ -28,6 +28,7 @@ from lerobot.processor import ( ProcessorStepRegistry, RobotAction, RobotActionProcessorStep, + RobotObservation, TransitionKey, ) from lerobot.utils.rotation import Rotation @@ -438,7 +439,7 @@ class ForwardKinematicsJointsToEEObservation(ObservationProcessorStep): kinematics: RobotKinematics motor_names: list[str] - def observation(self, observation: dict[str, Any]) -> dict[str, Any]: + def observation(self, observation: RobotObservation) -> RobotObservation: return compute_forward_kinematics_joints_to_ee(observation, self.kinematics, self.motor_names) def transform_features( diff --git a/src/lerobot/robots/so100_follower/so100.mdx b/src/lerobot/robots/so_follower/so100.md similarity index 100% rename from src/lerobot/robots/so100_follower/so100.mdx rename to src/lerobot/robots/so_follower/so100.md diff --git a/src/lerobot/robots/so101_follower/so101.mdx b/src/lerobot/robots/so_follower/so101.md similarity index 100% rename from src/lerobot/robots/so101_follower/so101.mdx rename to src/lerobot/robots/so_follower/so101.md diff --git a/src/lerobot/robots/so100_follower/so100_follower.py b/src/lerobot/robots/so_follower/so_follower.py similarity index 87% rename from src/lerobot/robots/so100_follower/so100_follower.py rename to src/lerobot/robots/so_follower/so_follower.py index d660ebed4..b4d11fe3f 100644 --- a/src/lerobot/robots/so100_follower/so100_follower.py +++ b/src/lerobot/robots/so_follower/so_follower.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,7 +17,7 @@
 import logging
 import time
 from functools import cached_property
-from typing import Any
+from typing import TypeAlias

 from lerobot.cameras.utils import make_cameras_from_configs
 from lerobot.motors import Motor, MotorCalibration, MotorNormMode
@@ -25,26 +25,29 @@ from lerobot.motors.feetech import (
     FeetechMotorsBus,
     OperatingMode,
 )
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.processor import RobotAction, RobotObservation
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected

 from ..robot import Robot
 from ..utils import ensure_safe_goal_position
-from .config_so100_follower import SO100FollowerConfig
+from .config_so_follower import SOFollowerRobotConfig

 logger = logging.getLogger(__name__)


-class SO100Follower(Robot):
+class SOFollower(Robot):
     """
-    [SO-100 Follower Arm](https://github.com/TheRobotStudio/SO-ARM100) designed by TheRobotStudio
+    Generic SO follower base implementing common functionality for SO-100/101/10X.
+    Designed to be subclassed with a per-hardware-model `config_class` and `name`.
     """

-    config_class = SO100FollowerConfig
-    name = "so100_follower"
+    config_class = SOFollowerRobotConfig
+    name = "so_follower"

-    def __init__(self, config: SO100FollowerConfig):
+    def __init__(self, config: SOFollowerRobotConfig):
         super().__init__(config)
         self.config = config
+        # Choose the joint normalization mode based on the `use_degrees` config flag
         norm_mode_body = MotorNormMode.DEGREES if config.use_degrees else MotorNormMode.RANGE_M100_100
         self.bus = FeetechMotorsBus(
             port=self.config.port,
@@ -82,13 +85,12 @@ class SO100Follower(Robot):
     def is_connected(self) -> bool:
         return self.bus.is_connected and all(cam.is_connected for cam in self.cameras.values())

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
         """
         We assume that at connection time, arm is in a rest position,
         and torque can be safely disabled to run calibration.
         """
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")

         self.bus.connect()
         if not self.is_calibrated and calibrate:
@@ -126,6 +128,7 @@ class SO100Follower(Robot):
         input(f"Move {self} to the middle of its range of motion and press ENTER....")
         homing_offsets = self.bus.set_half_turn_homings()

+        # Record ranges of motion for every motor except the full-turn wrist_roll, whose range is already known
         full_turn_motor = "wrist_roll"
         unknown_range_motors = [motor for motor in self.bus.motors if motor != full_turn_motor]
         print(
@@ -172,10 +175,8 @@ class SO100Follower(Robot):
             self.bus.setup_motor(motor)
             print(f"'{motor}' motor id set to {self.bus.motors[motor].id}")

-    def get_observation(self) -> dict[str, Any]:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(f"{self} is not connected.")
-
+    @check_if_not_connected
+    def get_observation(self) -> RobotObservation:
         # Read arm position
         start = time.perf_counter()
         obs_dict = self.bus.sync_read("Present_Position")
@@ -192,7 +193,8 @@ class SO100Follower(Robot):

         return obs_dict

-    def send_action(self, action: dict[str, Any]) -> dict[str, Any]:
+    @check_if_not_connected
+    def send_action(self, action: RobotAction) -> RobotAction:
         """Command arm to move to a target joint configuration.

         The relative action magnitude may be clipped depending on the configuration parameter
@@ -203,10 +205,8 @@ class SO100Follower(Robot):
             RobotDeviceNotConnectedError: if robot is not connected.

         Returns:
-            the action sent to the motors, potentially clipped.
+ RobotAction: the action sent to the motors, potentially clipped. """ - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")} @@ -221,12 +221,14 @@ class SO100Follower(Robot): self.bus.sync_write("Goal_Position", goal_pos) return {f"{motor}.pos": val for motor, val in goal_pos.items()} + @check_if_not_connected def disconnect(self): - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self.bus.disconnect(self.config.disable_torque_on_disconnect) for cam in self.cameras.values(): cam.disconnect() logger.info(f"{self} disconnected.") + + +SO100Follower: TypeAlias = SOFollower +SO101Follower: TypeAlias = SOFollower diff --git a/src/lerobot/robots/unitree_g1/config_unitree_g1.py b/src/lerobot/robots/unitree_g1/config_unitree_g1.py index ac65f1a7b..0b163019d 100644 --- a/src/lerobot/robots/unitree_g1/config_unitree_g1.py +++ b/src/lerobot/robots/unitree_g1/config_unitree_g1.py @@ -16,6 +16,8 @@ from dataclasses import dataclass, field +from lerobot.cameras import CameraConfig + from ..config import RobotConfig _GAINS: dict[str, dict[str, list[float]]] = { @@ -49,10 +51,17 @@ class UnitreeG1Config(RobotConfig): kp: list[float] = field(default_factory=lambda: _DEFAULT_KP.copy()) kd: list[float] = field(default_factory=lambda: _DEFAULT_KD.copy()) + # Default joint positions + default_positions: list[float] = field(default_factory=lambda: [0.0] * 29) + + # Control loop timestep control_dt: float = 1.0 / 250.0 # 250Hz - # launch mujoco simulation + # Launch mujoco simulation is_simulation: bool = True - # socket config for ZMQ bridge - robot_ip: str = "192.168.123.164" + # Socket config for ZMQ bridge + robot_ip: str = "192.168.123.164" # default G1 IP + + # Cameras (ZMQ-based remote cameras) + cameras: dict[str, CameraConfig] = field(default_factory=dict) diff --git a/src/lerobot/robots/unitree_g1/g1_utils.py b/src/lerobot/robots/unitree_g1/g1_utils.py index c045d73d2..3c41ee985 100644 --- a/src/lerobot/robots/unitree_g1/g1_utils.py +++ b/src/lerobot/robots/unitree_g1/g1_utils.py @@ -79,11 +79,3 @@ class G1_29_JointIndex(IntEnum): kRightWristRoll = 26 kRightWristPitch = 27 kRightWristYaw = 28 - - # not used - kNotUsedJoint0 = 29 - kNotUsedJoint1 = 30 - kNotUsedJoint2 = 31 - kNotUsedJoint3 = 32 - kNotUsedJoint4 = 33 - kNotUsedJoint5 = 34 diff --git a/src/lerobot/robots/unitree_g1/unitree_g1.py b/src/lerobot/robots/unitree_g1/unitree_g1.py index cce9d1b1e..fa6e0da85 100644 --- a/src/lerobot/robots/unitree_g1/unitree_g1.py +++ b/src/lerobot/robots/unitree_g1/unitree_g1.py @@ -23,14 +23,10 @@ from functools import cached_property from typing import Any import numpy as np -from unitree_sdk2py.idl.default import unitree_hg_msg_dds__LowCmd_ -from unitree_sdk2py.idl.unitree_hg.msg.dds_ import ( - LowCmd_ as hg_LowCmd, - LowState_ as hg_LowState, -) -from unitree_sdk2py.utils.crc import CRC +from lerobot.cameras.utils import make_cameras_from_configs from lerobot.envs.factory import make_env +from lerobot.processor import RobotAction, RobotObservation from lerobot.robots.unitree_g1.g1_utils import G1_29_JointIndex from ..robot import Robot @@ -43,11 +39,6 @@ logger = logging.getLogger(__name__) kTopicLowCommand_Debug = "rt/lowcmd" kTopicLowState = "rt/lowstate" -G1_29_Num_Motors = 35 -G1_23_Num_Motors = 35 -H1_2_Num_Motors = 35 -H1_Num_Motors = 20 - @dataclass class MotorState: @@ -69,28 +60,12 @@ class IMUState: # g1 observation class 
@dataclass class G1_29_LowState: # noqa: N801 - motor_state: list[MotorState] = field( - default_factory=lambda: [MotorState() for _ in range(G1_29_Num_Motors)] - ) + motor_state: list[MotorState] = field(default_factory=lambda: [MotorState() for _ in G1_29_JointIndex]) imu_state: IMUState = field(default_factory=IMUState) wireless_remote: Any = None # Raw wireless remote data mode_machine: int = 0 # Robot mode -class DataBuffer: - def __init__(self): - self.data = None - self.lock = threading.Lock() - - def get_data(self): - with self.lock: - return self.data - - def set_data(self, data): - with self.lock: - self.data = data - - class UnitreeG1(Robot): config_class = UnitreeG1Config name = "unitree_g1" @@ -120,9 +95,12 @@ class UnitreeG1(Robot): logger.info("Initialize UnitreeG1...") self.config = config - self.control_dt = config.control_dt + # Initialize cameras config (ZMQ-based) - actual connection in connect() + self._cameras = make_cameras_from_configs(config.cameras) + + # Import channel classes based on mode if config.is_simulation: from unitree_sdk2py.core.channel import ( ChannelFactoryInitialize, @@ -136,62 +114,33 @@ class UnitreeG1(Robot): ChannelSubscriber, ) - # connect robot - self.ChannelFactoryInitialize = ChannelFactoryInitialize - self.connect() + # Store for use in connect() + self._ChannelFactoryInitialize = ChannelFactoryInitialize + self._ChannelPublisher = ChannelPublisher + self._ChannelSubscriber = ChannelSubscriber - # initialize direct motor control interface - self.lowcmd_publisher = ChannelPublisher(kTopicLowCommand_Debug, hg_LowCmd) - self.lowcmd_publisher.Init() - self.lowstate_subscriber = ChannelSubscriber(kTopicLowState, hg_LowState) - self.lowstate_subscriber.Init() - self.lowstate_buffer = DataBuffer() - - # initialize subscribe thread to read robot state + # Initialize state variables + self.sim_env = None + self._env_wrapper = None + self._lowstate = None self._shutdown_event = threading.Event() - self.subscribe_thread = threading.Thread(target=self._subscribe_motor_state) - self.subscribe_thread.start() - - while not self.is_connected: - time.sleep(0.1) - - # initialize hg's lowcmd msg - self.crc = CRC() - self.msg = unitree_hg_msg_dds__LowCmd_() - self.msg.mode_pr = 0 - - # Wait for first state message to arrive - lowstate = None - while lowstate is None: - lowstate = self.lowstate_buffer.get_data() - if lowstate is None: - time.sleep(0.01) - logger.warning("[UnitreeG1] Waiting for robot state...") - logger.warning("[UnitreeG1] Connected to robot.") - self.msg.mode_machine = lowstate.mode_machine - - # initialize all motors with unified kp/kd from config - self.kp = np.array(config.kp, dtype=np.float32) - self.kd = np.array(config.kd, dtype=np.float32) - - for id in G1_29_JointIndex: - self.msg.motor_cmd[id].mode = 1 - self.msg.motor_cmd[id].kp = self.kp[id.value] - self.msg.motor_cmd[id].kd = self.kd[id.value] - self.msg.motor_cmd[id].q = lowstate.motor_state[id.value].q - - # Initialize remote controller + self.subscribe_thread = None self.remote_controller = self.RemoteController() def _subscribe_motor_state(self): # polls robot state @ 250Hz while not self._shutdown_event.is_set(): start_time = time.time() + + # Step simulation if in simulation mode + if self.config.is_simulation and self.sim_env is not None: + self.sim_env.step() + msg = self.lowstate_subscriber.Read() if msg is not None: lowstate = G1_29_LowState() - # Capture motor states - for id in range(G1_29_Num_Motors): + # Capture motor states using jointindex + for id in 
G1_29_JointIndex: lowstate.motor_state[id].q = msg.motor_state[id].q lowstate.motor_state[id].dq = msg.motor_state[id].dq lowstate.motor_state[id].tau_est = msg.motor_state[id].tau_est @@ -210,7 +159,7 @@ class UnitreeG1(Robot): # Capture mode_machine lowstate.mode_machine = msg.mode_machine - self.lowstate_buffer.set_data(lowstate) + self._lowstate = lowstate current_time = time.time() all_t_elapsed = current_time - start_time @@ -219,7 +168,7 @@ class UnitreeG1(Robot): @cached_property def action_features(self) -> dict[str, type]: - return {f"{G1_29_JointIndex(motor).name}.pos": float for motor in G1_29_JointIndex} + return {f"{G1_29_JointIndex(motor).name}.q": float for motor in G1_29_JointIndex} def calibrate(self) -> None: # robot is already calibrated pass @@ -228,20 +177,153 @@ class UnitreeG1(Robot): pass def connect(self, calibrate: bool = True) -> None: # connect to DDS + from unitree_sdk2py.idl.default import unitree_hg_msg_dds__LowCmd_ + from unitree_sdk2py.idl.unitree_hg.msg.dds_ import ( + LowCmd_ as hg_LowCmd, + LowState_ as hg_LowState, + ) + from unitree_sdk2py.utils.crc import CRC + + # Initialize DDS channel and simulation environment if self.config.is_simulation: - self.ChannelFactoryInitialize(0, "lo") - self.mujoco_env = make_env("lerobot/unitree-g1-mujoco", trust_remote_code=True) + self._ChannelFactoryInitialize(0, "lo") + self._env_wrapper = make_env("lerobot/unitree-g1-mujoco", trust_remote_code=True) + # Extract the actual gym env from the dict structure + self.sim_env = self._env_wrapper["hub_env"][0].envs[0] else: - self.ChannelFactoryInitialize(0) + self._ChannelFactoryInitialize(0) + + # Initialize direct motor control interface + self.lowcmd_publisher = self._ChannelPublisher(kTopicLowCommand_Debug, hg_LowCmd) + self.lowcmd_publisher.Init() + self.lowstate_subscriber = self._ChannelSubscriber(kTopicLowState, hg_LowState) + self.lowstate_subscriber.Init() + + # Start subscribe thread to read robot state + self.subscribe_thread = threading.Thread(target=self._subscribe_motor_state) + self.subscribe_thread.start() + + # Connect cameras + for cam in self._cameras.values(): + if not cam.is_connected: + cam.connect() + + logger.info(f"Connected {len(self._cameras)} camera(s).") + + # Initialize lowcmd message + self.crc = CRC() + self.msg = unitree_hg_msg_dds__LowCmd_() + self.msg.mode_pr = 0 + + # Wait for first state message to arrive + lowstate = None + while lowstate is None: + lowstate = self._lowstate + if lowstate is None: + time.sleep(0.01) + logger.warning("[UnitreeG1] Waiting for robot state...") + logger.warning("[UnitreeG1] Connected to robot.") + self.msg.mode_machine = lowstate.mode_machine + + # Initialize all motors with unified kp/kd from config + self.kp = np.array(self.config.kp, dtype=np.float32) + self.kd = np.array(self.config.kd, dtype=np.float32) + + for id in G1_29_JointIndex: + self.msg.motor_cmd[id].mode = 1 + self.msg.motor_cmd[id].kp = self.kp[id.value] + self.msg.motor_cmd[id].kd = self.kd[id.value] + self.msg.motor_cmd[id].q = lowstate.motor_state[id.value].q def disconnect(self): + # Signal thread to stop and unblock any waits self._shutdown_event.set() - self.subscribe_thread.join(timeout=2.0) - if self.config.is_simulation: - self.mujoco_env["hub_env"][0].envs[0].kill_sim() - def get_observation(self) -> dict[str, Any]: - return self.lowstate_buffer.get_data() + # Wait for subscribe thread to finish + if self.subscribe_thread is not None: + self.subscribe_thread.join(timeout=2.0) + if self.subscribe_thread.is_alive(): + 
logger.warning("Subscribe thread did not stop cleanly") + + # Close simulation environment + if self.config.is_simulation and self.sim_env is not None: + try: + # Force-kill the image publish subprocess first to avoid long waits + if hasattr(self.sim_env, "simulator") and hasattr(self.sim_env.simulator, "sim_env"): + sim_env_inner = self.sim_env.simulator.sim_env + if hasattr(sim_env_inner, "image_publish_process"): + proc = sim_env_inner.image_publish_process + if proc.process and proc.process.is_alive(): + logger.info("Force-terminating image publish subprocess...") + proc.stop_event.set() + proc.process.terminate() + proc.process.join(timeout=1) + if proc.process.is_alive(): + proc.process.kill() + self.sim_env.close() + except Exception as e: + logger.warning(f"Error closing sim_env: {e}") + self.sim_env = None + self._env_wrapper = None + + # Disconnect cameras + for cam in self._cameras.values(): + cam.disconnect() + + def get_observation(self) -> RobotObservation: + lowstate = self._lowstate + if lowstate is None: + return {} + + obs = {} + + # Motors - q, dq, tau for all joints + for motor in G1_29_JointIndex: + name = motor.name + idx = motor.value + obs[f"{name}.q"] = lowstate.motor_state[idx].q + obs[f"{name}.dq"] = lowstate.motor_state[idx].dq + obs[f"{name}.tau"] = lowstate.motor_state[idx].tau_est + + # IMU - gyroscope + if lowstate.imu_state.gyroscope: + obs["imu.gyro.x"] = lowstate.imu_state.gyroscope[0] + obs["imu.gyro.y"] = lowstate.imu_state.gyroscope[1] + obs["imu.gyro.z"] = lowstate.imu_state.gyroscope[2] + + # IMU - accelerometer + if lowstate.imu_state.accelerometer: + obs["imu.accel.x"] = lowstate.imu_state.accelerometer[0] + obs["imu.accel.y"] = lowstate.imu_state.accelerometer[1] + obs["imu.accel.z"] = lowstate.imu_state.accelerometer[2] + + # IMU - quaternion + if lowstate.imu_state.quaternion: + obs["imu.quat.w"] = lowstate.imu_state.quaternion[0] + obs["imu.quat.x"] = lowstate.imu_state.quaternion[1] + obs["imu.quat.y"] = lowstate.imu_state.quaternion[2] + obs["imu.quat.z"] = lowstate.imu_state.quaternion[3] + + # IMU - rpy + if lowstate.imu_state.rpy: + obs["imu.rpy.roll"] = lowstate.imu_state.rpy[0] + obs["imu.rpy.pitch"] = lowstate.imu_state.rpy[1] + obs["imu.rpy.yaw"] = lowstate.imu_state.rpy[2] + + # Controller - parse wireless_remote and add to obs + if lowstate.wireless_remote and len(lowstate.wireless_remote) >= 24: + self.remote_controller.set(lowstate.wireless_remote) + obs["remote.buttons"] = self.remote_controller.button.copy() + obs["remote.lx"] = self.remote_controller.lx + obs["remote.ly"] = self.remote_controller.ly + obs["remote.rx"] = self.remote_controller.rx + obs["remote.ry"] = self.remote_controller.ry + + # Cameras - read images from ZMQ cameras + for cam_name, cam in self._cameras.items(): + obs[cam_name] = cam.async_read() + + return obs @property def is_calibrated(self) -> bool: @@ -249,11 +331,15 @@ class UnitreeG1(Robot): @property def is_connected(self) -> bool: - return self.lowstate_buffer.get_data() is not None + return self._lowstate is not None @property def _motors_ft(self) -> dict[str, type]: - return {f"{G1_29_JointIndex(motor).name}.pos": float for motor in G1_29_JointIndex} + return {f"{G1_29_JointIndex(motor).name}.q": float for motor in G1_29_JointIndex} + + @property + def cameras(self) -> dict: + return self._cameras @property def _cameras_ft(self) -> dict[str, tuple]: @@ -265,9 +351,18 @@ class UnitreeG1(Robot): def observation_features(self) -> dict[str, type | tuple]: return {**self._motors_ft, **self._cameras_ft} - 
def send_action(self, action: dict[str, Any]) -> dict[str, Any]: - self.msg.crc = self.crc.Crc(action) - self.lowcmd_publisher.Write(action) + def send_action(self, action: RobotAction) -> RobotAction: + for motor in G1_29_JointIndex: + key = f"{motor.name}.q" + if key in action: + self.msg.motor_cmd[motor.value].q = action[key] + self.msg.motor_cmd[motor.value].qd = 0 + self.msg.motor_cmd[motor.value].kp = self.kp[motor.value] + self.msg.motor_cmd[motor.value].kd = self.kd[motor.value] + self.msg.motor_cmd[motor.value].tau = 0 + + self.msg.crc = self.crc.Crc(self.msg) + self.lowcmd_publisher.Write(self.msg) return action def get_gravity_orientation(self, quaternion): # get gravity orientation from quaternion @@ -282,3 +377,56 @@ class UnitreeG1(Robot): gravity_orientation[1] = -2 * (qz * qy + qw * qx) gravity_orientation[2] = 1 - 2 * (qw * qw + qz * qz) return gravity_orientation + + def reset( + self, + control_dt: float | None = None, + default_positions: list[float] | None = None, + ) -> None: # move robot to default position + if control_dt is None: + control_dt = self.config.control_dt + if default_positions is None: + default_positions = np.array(self.config.default_positions, dtype=np.float32) + + if self.config.is_simulation and self.sim_env is not None: + self.sim_env.reset() + + for motor in G1_29_JointIndex: + self.msg.motor_cmd[motor.value].q = default_positions[motor.value] + self.msg.motor_cmd[motor.value].qd = 0 + self.msg.motor_cmd[motor.value].kp = self.kp[motor.value] + self.msg.motor_cmd[motor.value].kd = self.kd[motor.value] + self.msg.motor_cmd[motor.value].tau = 0 + self.msg.crc = self.crc.Crc(self.msg) + self.lowcmd_publisher.Write(self.msg) + else: + total_time = 3.0 + num_steps = int(total_time / control_dt) + + # get current state + obs = self.get_observation() + + # record current positions + init_dof_pos = np.zeros(29, dtype=np.float32) + for motor in G1_29_JointIndex: + init_dof_pos[motor.value] = obs[f"{motor.name}.q"] + + # Interpolate to default position + for step in range(num_steps): + start_time = time.time() + + alpha = step / num_steps + action_dict = {} + for motor in G1_29_JointIndex: + target_pos = default_positions[motor.value] + interp_pos = init_dof_pos[motor.value] * (1 - alpha) + target_pos * alpha + action_dict[f"{motor.name}.q"] = float(interp_pos) + + self.send_action(action_dict) + + # Maintain constant control rate + elapsed = time.time() - start_time + sleep_time = max(0, control_dt - elapsed) + time.sleep(sleep_time) + + logger.info("Reached default position") diff --git a/src/lerobot/robots/utils.py b/src/lerobot/robots/utils.py index 9c5043335..27abaaa86 100644 --- a/src/lerobot/robots/utils.py +++ b/src/lerobot/robots/utils.py @@ -33,11 +33,11 @@ def make_robot_from_config(config: RobotConfig) -> Robot: return OmxFollower(config) elif config.type == "so100_follower": - from .so100_follower import SO100Follower + from .so_follower import SO100Follower return SO100Follower(config) elif config.type == "so101_follower": - from .so101_follower import SO101Follower + from .so_follower import SO101Follower return SO101Follower(config) elif config.type == "lekiwi": @@ -52,10 +52,10 @@ def make_robot_from_config(config: RobotConfig) -> Robot: from .hope_jr import HopeJrArm return HopeJrArm(config) - elif config.type == "bi_so100_follower": - from .bi_so100_follower import BiSO100Follower + elif config.type == "bi_so_follower": + from .bi_so_follower import BiSOFollower - return BiSO100Follower(config) + return BiSOFollower(config) elif 
config.type == "reachy2": from .reachy2 import Reachy2Robot diff --git a/src/lerobot/scripts/lerobot_calibrate.py b/src/lerobot/scripts/lerobot_calibrate.py index 910a9a1b5..cbc7684d3 100644 --- a/src/lerobot/scripts/lerobot_calibrate.py +++ b/src/lerobot/scripts/lerobot_calibrate.py @@ -36,23 +36,23 @@ from lerobot.cameras.realsense.configuration_realsense import RealSenseCameraCon from lerobot.robots import ( # noqa: F401 Robot, RobotConfig, + bi_so_follower, hope_jr, koch_follower, lekiwi, make_robot_from_config, omx_follower, - so100_follower, - so101_follower, + so_follower, ) from lerobot.teleoperators import ( # noqa: F401 Teleoperator, TeleoperatorConfig, + bi_so_leader, homunculus, koch_leader, make_teleoperator_from_config, omx_leader, - so100_leader, - so101_leader, + so_leader, ) from lerobot.utils.import_utils import register_third_party_plugins from lerobot.utils.utils import init_logging diff --git a/src/lerobot/scripts/lerobot_dataset_viz.py b/src/lerobot/scripts/lerobot_dataset_viz.py index 974762b0b..2cd48eab8 100644 --- a/src/lerobot/scripts/lerobot_dataset_viz.py +++ b/src/lerobot/scripts/lerobot_dataset_viz.py @@ -96,6 +96,7 @@ def visualize_dataset( ws_port: int = 9087, save: bool = False, output_dir: Path | None = None, + display_compressed_images: bool = False, ) -> Path | None: if save: assert output_dir is not None, ( @@ -137,8 +138,9 @@ def visualize_dataset( # display each camera image for key in dataset.meta.camera_keys: - # TODO(rcadene): add `.compress()`? is it lossless? - rr.log(key, rr.Image(to_hwc_uint8_numpy(batch[key][i]))) + img = to_hwc_uint8_numpy(batch[key][i]) + img_entity = rr.Image(img).compress() if display_compressed_images else rr.Image(img) + rr.log(key, entity=img_entity) # display each dimension of action space (e.g. 
actuators command)
         if ACTION in batch:
@@ -261,6 +263,12 @@ def main():
         ),
     )

+    parser.add_argument(
+        "--display-compressed-images",
+        action="store_true",
+        help="If set, display compressed images in Rerun instead of uncompressed ones.",
+    )
+
     args = parser.parse_args()
     kwargs = vars(args)
     repo_id = kwargs.pop("repo_id")
diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py
index e835b1de6..4ba6ce44f 100644
--- a/src/lerobot/scripts/lerobot_edit_dataset.py
+++ b/src/lerobot/scripts/lerobot_edit_dataset.py
@@ -66,23 +66,23 @@ Remove camera feature:
     --operation.type remove_feature \
     --operation.feature_names "['observation.images.top']"

-Convert image dataset to video format (saves locally):
+Convert image dataset to video format and save locally:
 python -m lerobot.scripts.lerobot_edit_dataset \
     --repo_id lerobot/pusht_image \
-    --operation.type convert_to_video \
+    --operation.type convert_image_to_video \
     --operation.output_dir /path/to/output/pusht_video

-Convert image dataset and save with new repo_id:
+Convert image dataset to video format and save with new repo_id:
 python -m lerobot.scripts.lerobot_edit_dataset \
     --repo_id lerobot/pusht_image \
     --new_repo_id lerobot/pusht_video \
-    --operation.type convert_to_video
+    --operation.type convert_image_to_video

-Convert and push to hub:
+Convert image dataset to video format and push to hub:
 python -m lerobot.scripts.lerobot_edit_dataset \
     --repo_id lerobot/pusht_image \
     --new_repo_id lerobot/pusht_video \
-    --operation.type convert_to_video \
+    --operation.type convert_image_to_video \
     --push_to_hub true

 Using JSON config file:
@@ -92,24 +92,19 @@ Using JSON config file:

 import logging
 import shutil
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path

-import pandas as pd
-from tqdm import tqdm
-
 from lerobot.configs import parser
 from lerobot.datasets.dataset_tools import (
+    convert_image_to_video_dataset,
     delete_episodes,
     merge_datasets,
     remove_feature,
     split_dataset,
 )
-from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
-from lerobot.datasets.utils import write_stats, write_tasks
-from lerobot.datasets.video_utils import encode_video_frames, get_video_info
-from lerobot.utils.constants import HF_LEROBOT_HOME, OBS_IMAGE
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.utils.constants import HF_LEROBOT_HOME
 from lerobot.utils.utils import init_logging

@@ -138,8 +133,8 @@ class RemoveFeatureConfig:


 @dataclass
-class ConvertToVideoConfig:
-    type: str = "convert_to_video"
+class ConvertImageToVideoConfig:
+    type: str = "convert_image_to_video"
     output_dir: str | None = None
     vcodec: str = "libsvtav1"
     pix_fmt: str = "yuv420p"
@@ -148,12 +143,16 @@
     fast_decode: int = 0
     episode_indices: list[int] | None = None
     num_workers: int = 4
+    max_episodes_per_batch: int | None = None
+    max_frames_per_batch: int | None = None


 @dataclass
 class EditDatasetConfig:
     repo_id: str
-    operation: DeleteEpisodesConfig | SplitConfig | MergeConfig | RemoveFeatureConfig | ConvertToVideoConfig
+    operation: (
+        DeleteEpisodesConfig | SplitConfig | MergeConfig | RemoveFeatureConfig | ConvertImageToVideoConfig
+    )
     root: str | None = None
     new_repo_id: str | None = None
     push_to_hub: bool = False
@@ -297,362 +296,7 @@ def handle_remove_feature(cfg: EditDatasetConfig) -> None:
         LeRobotDataset(output_repo_id, root=output_dir).push_to_hub()


-def 
save_episode_images_for_video( - dataset: LeRobotDataset, - imgs_dir: Path, - img_key: str, - episode_index: int, - num_workers: int = 4, -) -> None: - """Save images from a specific episode and camera to disk for video encoding. - - Args: - dataset: The LeRobot dataset to extract images from - imgs_dir: Directory to save images to - img_key: The image key (camera) to extract - episode_index: Index of the episode to save - num_workers: Number of threads for parallel image saving - """ - # Create directory - imgs_dir.mkdir(parents=True, exist_ok=True) - - # Get dataset without torch format for PIL image access - hf_dataset = dataset.hf_dataset.with_format(None) - - # Select only this camera's images - imgs_dataset = hf_dataset.select_columns(img_key) - - # Get episode start and end indices - from_idx = dataset.meta.episodes["dataset_from_index"][episode_index] - to_idx = dataset.meta.episodes["dataset_to_index"][episode_index] - - # Get all items for this episode - episode_dataset = imgs_dataset.select(range(from_idx, to_idx)) - - # Define function to save a single image - def save_single_image(i_item_tuple): - i, item = i_item_tuple - img = item[img_key] - # Use frame-XXXXXX.png format to match encode_video_frames expectations - img.save(str(imgs_dir / f"frame-{i:06d}.png"), quality=100) - return i - - # Save images with proper naming convention for encode_video_frames (frame-XXXXXX.png) - items = list(enumerate(episode_dataset)) - - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [executor.submit(save_single_image, item) for item in items] - for future in as_completed(futures): - future.result() # This will raise any exceptions that occurred - - -def encode_episode_videos( - dataset: LeRobotDataset, - new_meta: LeRobotDatasetMetadata, - episode_index: int, - vcodec: str, - pix_fmt: str, - g: int, - crf: int, - fast_decode: int, - temp_dir: Path, - num_image_workers: int = 4, -) -> dict[str, dict]: - """Encode videos for a single episode and return video metadata. 
- - Args: - dataset: Source dataset with images - new_meta: Metadata object for the new video dataset - episode_index: Episode index to process - vcodec: Video codec - pix_fmt: Pixel format - g: Group of pictures size - crf: Constant rate factor - fast_decode: Fast decode tuning - temp_dir: Temporary directory for images - num_image_workers: Number of workers for saving images - - Returns: - Dictionary mapping video keys to their metadata (chunk_index, file_index, timestamps) - """ - hf_dataset = dataset.hf_dataset.with_format(None) - img_keys = [key for key in hf_dataset.features if key.startswith(OBS_IMAGE)] - - video_metadata = {} - fps = int(dataset.fps) # Convert to int for PyAV compatibility - episode_length = dataset.meta.episodes["length"][episode_index] - episode_duration = episode_length / dataset.fps # Use original fps for duration calculation - - for img_key in img_keys: - # Save images temporarily - imgs_dir = temp_dir / f"episode_{episode_index:06d}" / img_key - save_episode_images_for_video(dataset, imgs_dir, img_key, episode_index, num_image_workers) - - # Determine chunk and file indices - # For simplicity, we'll put each episode in its own file - chunk_idx = episode_index // new_meta.chunks_size - file_idx = episode_index % new_meta.chunks_size - - # Create video path in the new dataset structure - video_path = new_meta.root / new_meta.video_path.format( - video_key=img_key, chunk_index=chunk_idx, file_index=file_idx - ) - video_path.parent.mkdir(parents=True, exist_ok=True) - - # Encode video - encode_video_frames( - imgs_dir=imgs_dir, - video_path=video_path, - fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, - overwrite=True, - ) - - # Clean up temporary images - shutil.rmtree(imgs_dir) - - # Store video metadata - video_metadata[img_key] = { - f"videos/{img_key}/chunk_index": chunk_idx, - f"videos/{img_key}/file_index": file_idx, - f"videos/{img_key}/from_timestamp": 0.0, - f"videos/{img_key}/to_timestamp": episode_duration, - } - - return video_metadata - - -def convert_dataset_to_videos( - dataset: LeRobotDataset, - output_dir: Path, - repo_id: str | None = None, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int = 2, - crf: int = 30, - fast_decode: int = 0, - episode_indices: list[int] | None = None, - num_workers: int = 4, -) -> LeRobotDataset: - """Convert image-based dataset to video-based dataset. - - Creates a new LeRobotDataset with videos instead of images, following the proper - LeRobot dataset structure with videos stored in chunked MP4 files. - - Args: - dataset: The source LeRobot dataset with images - output_dir: Directory to save the new video dataset - repo_id: Repository ID for the new dataset (default: original_id + "_video") - vcodec: Video codec (default: libsvtav1) - pix_fmt: Pixel format (default: yuv420p) - g: Group of pictures size (default: 2) - crf: Constant rate factor (default: 30) - fast_decode: Fast decode tuning (default: 0) - episode_indices: List of episode indices to convert (None = all episodes) - num_workers: Number of threads for parallel processing (default: 4) - - Returns: - New LeRobotDataset with videos - """ - # Check that it's an image dataset - if len(dataset.meta.video_keys) > 0: - raise ValueError( - f"This operation is for image datasets only. 
Video dataset provided: {dataset.repo_id}" - ) - - # Get all image keys - hf_dataset = dataset.hf_dataset.with_format(None) - img_keys = [key for key in hf_dataset.features if key.startswith(OBS_IMAGE)] - - if len(img_keys) == 0: - raise ValueError(f"No image keys found in dataset {dataset.repo_id}") - - # Determine which episodes to process - if episode_indices is None: - episode_indices = list(range(dataset.meta.total_episodes)) - - if repo_id is None: - repo_id = f"{dataset.repo_id}_video" - - logging.info( - f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}" - ) - logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}") - - # Create new features dict, converting image features to video features - new_features = {} - for key, value in dataset.meta.features.items(): - if key not in img_keys: - new_features[key] = value - else: - # Convert image key to video format - new_features[key] = value.copy() - new_features[key]["dtype"] = "video" # Change dtype from "image" to "video" - # Video info will be updated after episodes are encoded - - # Create new metadata for video dataset - new_meta = LeRobotDatasetMetadata.create( - repo_id=repo_id, - fps=dataset.meta.fps, - features=new_features, - robot_type=dataset.meta.robot_type, - root=output_dir, - use_videos=True, - chunks_size=dataset.meta.chunks_size, - data_files_size_in_mb=dataset.meta.data_files_size_in_mb, - video_files_size_in_mb=dataset.meta.video_files_size_in_mb, - ) - - # Create temporary directory for image extraction - temp_dir = output_dir / "temp_images" - temp_dir.mkdir(parents=True, exist_ok=True) - - # Process each episode - all_episode_metadata = [] - - try: - for ep_idx in tqdm(episode_indices, desc="Converting episodes to videos"): - # Get episode metadata from source - src_episode = dataset.meta.episodes[ep_idx] - - # Encode videos for this episode - video_metadata = encode_episode_videos( - dataset=dataset, - new_meta=new_meta, - episode_index=ep_idx, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, - temp_dir=temp_dir, - num_image_workers=num_workers, - ) - - # Build episode metadata - episode_meta = { - "episode_index": ep_idx, - "length": src_episode["length"], - "dataset_from_index": ep_idx * src_episode["length"], - "dataset_to_index": (ep_idx + 1) * src_episode["length"], - } - - # Add video metadata - for img_key in img_keys: - episode_meta.update(video_metadata[img_key]) - - # Add data chunk/file info (using same structure as source) - if "data/chunk_index" in src_episode: - episode_meta["data/chunk_index"] = src_episode["data/chunk_index"] - episode_meta["data/file_index"] = src_episode["data/file_index"] - - all_episode_metadata.append(episode_meta) - - # Copy and transform data files (removing image columns) - _copy_data_without_images(dataset, new_meta, episode_indices, img_keys) - - # Save episode metadata - episodes_df = pd.DataFrame(all_episode_metadata) - episodes_path = new_meta.root / "meta" / "episodes" / "chunk-000" / "file-000.parquet" - episodes_path.parent.mkdir(parents=True, exist_ok=True) - episodes_df.to_parquet(episodes_path, index=False) - - # Update metadata info - new_meta.info["total_episodes"] = len(episode_indices) - new_meta.info["total_frames"] = sum(ep["length"] for ep in all_episode_metadata) - new_meta.info["total_tasks"] = dataset.meta.total_tasks - new_meta.info["splits"] = {"train": f"0:{len(episode_indices)}"} - - # Update video info for all image keys (now videos) - # We 
need to manually set video info since update_video_info() checks video_keys first - for img_key in img_keys: - if not new_meta.features[img_key].get("info", None): - video_path = new_meta.root / new_meta.video_path.format( - video_key=img_key, chunk_index=0, file_index=0 - ) - new_meta.info["features"][img_key]["info"] = get_video_info(video_path) - - from lerobot.datasets.utils import write_info - - write_info(new_meta.info, new_meta.root) - - # Copy stats and tasks - if dataset.meta.stats is not None: - # Remove image stats - new_stats = {k: v for k, v in dataset.meta.stats.items() if k not in img_keys} - write_stats(new_stats, new_meta.root) - - if dataset.meta.tasks is not None: - write_tasks(dataset.meta.tasks, new_meta.root) - - finally: - # Clean up temporary directory - if temp_dir.exists(): - shutil.rmtree(temp_dir) - - logging.info(f"✓ Completed converting {dataset.repo_id} to video format") - logging.info(f"New dataset saved to: {output_dir}") - - # Return new dataset - return LeRobotDataset(repo_id=repo_id, root=output_dir) - - -def _copy_data_without_images( - src_dataset: LeRobotDataset, - dst_meta: LeRobotDatasetMetadata, - episode_indices: list[int], - img_keys: list[str], -) -> None: - """Copy data files without image columns. - - Args: - src_dataset: Source dataset - dst_meta: Destination metadata - episode_indices: Episodes to include - img_keys: Image keys to remove - """ - from lerobot.datasets.utils import DATA_DIR - - data_dir = src_dataset.root / DATA_DIR - parquet_files = sorted(data_dir.glob("*/*.parquet")) - - if not parquet_files: - raise ValueError(f"No parquet files found in {data_dir}") - - episode_set = set(episode_indices) - - for src_path in tqdm(parquet_files, desc="Processing data files"): - df = pd.read_parquet(src_path).reset_index(drop=True) - - # Filter to only include selected episodes - df = df[df["episode_index"].isin(episode_set)].copy() - - if len(df) == 0: - continue - - # Remove image columns - columns_to_drop = [col for col in img_keys if col in df.columns] - if columns_to_drop: - df = df.drop(columns=columns_to_drop) - - # Get chunk and file indices from path - relative_path = src_path.relative_to(src_dataset.root) - chunk_dir = relative_path.parts[1] - file_name = relative_path.parts[2] - chunk_idx = int(chunk_dir.split("-")[1]) - file_idx = int(file_name.split("-")[1].split(".")[0]) - - # Write to destination without pandas index - dst_path = dst_meta.root / f"data/chunk-{chunk_idx:03d}/file-{file_idx:03d}.parquet" - dst_path.parent.mkdir(parents=True, exist_ok=True) - df.to_parquet(dst_path, index=False) - - -def handle_convert_to_video(cfg: EditDatasetConfig) -> None: +def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: # Note: Parser may create any config type with the right fields, so we access fields directly # instead of checking isinstance() dataset = LeRobotDataset(cfg.repo_id, root=cfg.root) @@ -664,8 +308,12 @@ def handle_convert_to_video(cfg: EditDatasetConfig) -> None: if cfg.new_repo_id: # Use new_repo_id for both local storage and hub push output_repo_id = cfg.new_repo_id - output_dir = Path(cfg.root) / cfg.new_repo_id if cfg.root else HF_LEROBOT_HOME / cfg.new_repo_id - logging.info(f"Saving to new dataset: {cfg.new_repo_id}") + # Place new dataset as a sibling to the original dataset + # Get the parent of the actual dataset root (not cfg.root which might be the lerobot cache dir) + # Extract just the dataset name (after last slash) for the local directory + local_dir_name = cfg.new_repo_id.split("/")[-1] + 
output_dir = dataset.root.parent / local_dir_name + logging.info(f"Saving to new dataset: {cfg.new_repo_id} at {output_dir}") elif output_dir_config: # Use custom output directory for local-only storage output_dir = Path(output_dir_config) @@ -675,12 +323,15 @@ def handle_convert_to_video(cfg: EditDatasetConfig) -> None: else: # Auto-generate name: append "_video" to original repo_id output_repo_id = f"{cfg.repo_id}_video" - output_dir = Path(cfg.root) / output_repo_id if cfg.root else HF_LEROBOT_HOME / output_repo_id + # Place new dataset as a sibling to the original dataset + # Extract just the dataset name (after last slash) for the local directory + local_dir_name = output_repo_id.split("/")[-1] + output_dir = dataset.root.parent / local_dir_name logging.info(f"Saving to auto-generated location: {output_dir}") logging.info(f"Converting dataset {cfg.repo_id} to video format") - new_dataset = convert_dataset_to_videos( + new_dataset = convert_image_to_video_dataset( dataset=dataset, output_dir=output_dir, repo_id=output_repo_id, @@ -691,6 +342,8 @@ def handle_convert_to_video(cfg: EditDatasetConfig) -> None: fast_decode=getattr(cfg.operation, "fast_decode", 0), episode_indices=getattr(cfg.operation, "episode_indices", None), num_workers=getattr(cfg.operation, "num_workers", 4), + max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None), + max_frames_per_batch=getattr(cfg.operation, "max_frames_per_batch", None), ) logging.info("Video dataset created successfully!") @@ -718,8 +371,8 @@ def edit_dataset(cfg: EditDatasetConfig) -> None: handle_merge(cfg) elif operation_type == "remove_feature": handle_remove_feature(cfg) - elif operation_type == "convert_to_video": - handle_convert_to_video(cfg) + elif operation_type == "convert_image_to_video": + handle_convert_image_to_video(cfg) else: raise ValueError( f"Unknown operation type: {operation_type}\n" diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index d23b9d083..e32b80404 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -177,9 +177,9 @@ def rollout( action = policy.select_action(observation) action = postprocessor(action) - action_transition = {"action": action} + action_transition = {ACTION: action} action_transition = env_postprocessor(action_transition) - action = action_transition["action"] + action = action_transition[ACTION] # Convert to CPU / numpy. action_numpy: np.ndarray = action.to("cpu").numpy() @@ -278,9 +278,16 @@ def eval_policy( raise ValueError("If max_episodes_rendered > 0, videos_dir must be provided.") if not isinstance(policy, PreTrainedPolicy): - raise ValueError( + exc = ValueError( f"Policy of type 'PreTrainedPolicy' is expected, but type '{type(policy)}' was provided." 
) + try: + from peft import PeftModel + + if not isinstance(policy, PeftModel): + raise exc + except ImportError: + raise exc from None start = time.time() policy.eval() @@ -509,7 +516,12 @@ def eval_main(cfg: EvalPipelineConfig): logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") logging.info("Making environment.") - envs = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + envs = make_env( + cfg.env, + n_envs=cfg.eval.batch_size, + use_async_envs=cfg.eval.use_async_envs, + trust_remote_code=cfg.trust_remote_code, + ) logging.info("Making policy.") diff --git a/src/lerobot/scripts/lerobot_find_joint_limits.py b/src/lerobot/scripts/lerobot_find_joint_limits.py index f97c0d820..20bbc8615 100644 --- a/src/lerobot/scripts/lerobot_find_joint_limits.py +++ b/src/lerobot/scripts/lerobot_find_joint_limits.py @@ -44,20 +44,20 @@ import numpy as np from lerobot.model.kinematics import RobotKinematics from lerobot.robots import ( # noqa: F401 RobotConfig, + bi_so_follower, koch_follower, make_robot_from_config, omx_follower, - so100_follower, - so101_follower, + so_follower, ) from lerobot.teleoperators import ( # noqa: F401 TeleoperatorConfig, + bi_so_leader, gamepad, koch_leader, make_teleoperator_from_config, omx_leader, - so100_leader, - so101_leader, + so_leader, ) from lerobot.utils.robot_utils import precise_sleep diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py index 948e92bb8..f03776989 100644 --- a/src/lerobot/scripts/lerobot_record.py +++ b/src/lerobot/scripts/lerobot_record.py @@ -27,6 +27,8 @@ lerobot-record \ --dataset.num_episodes=2 \ --dataset.single_task="Grab the cube" \ --display_data=true + # <- Optional: specify video codec (h264, hevc, libsvtav1). Default is libsvtav1. 
\ + # --dataset.vcodec=h264 \ # <- Teleop optional if you want to teleoperate to record or in between episodes with a policy \ # --teleop.type=so100_leader \ # --teleop.port=/dev/tty.usbmodem58760431551 \ @@ -38,21 +40,23 @@ lerobot-record \ Example recording with bimanual so100: ```shell lerobot-record \ - --robot.type=bi_so100_follower \ - --robot.left_arm_port=/dev/tty.usbmodem5A460851411 \ - --robot.right_arm_port=/dev/tty.usbmodem5A460812391 \ + --robot.type=bi_so_follower \ + --robot.left_arm_config.port=/dev/tty.usbmodem5A460822851 \ + --robot.right_arm_config.port=/dev/tty.usbmodem5A460814411 \ --robot.id=bimanual_follower \ - --robot.cameras='{ - left: {"type": "opencv", "index_or_path": 0, "width": 640, "height": 480, "fps": 30}, - top: {"type": "opencv", "index_or_path": 1, "width": 640, "height": 480, "fps": 30}, - right: {"type": "opencv", "index_or_path": 2, "width": 640, "height": 480, "fps": 30} + --robot.left_arm_config.cameras='{ + wrist: {"type": "opencv", "index_or_path": 1, "width": 640, "height": 480, "fps": 30}, + top: {"type": "opencv", "index_or_path": 3, "width": 640, "height": 480, "fps": 30}, + }' --robot.right_arm_config.cameras='{ + wrist: {"type": "opencv", "index_or_path": 2, "width": 640, "height": 480, "fps": 30}, + front: {"type": "opencv", "index_or_path": 4, "width": 640, "height": 480, "fps": 30}, }' \ - --teleop.type=bi_so100_leader \ - --teleop.left_arm_port=/dev/tty.usbmodem5A460828611 \ - --teleop.right_arm_port=/dev/tty.usbmodem5A460826981 \ + --teleop.type=bi_so_leader \ + --teleop.left_arm_config.port=/dev/tty.usbmodem5A460852721 \ + --teleop.right_arm_config.port=/dev/tty.usbmodem5A460819811 \ --teleop.id=bimanual_leader \ --display_data=true \ - --dataset.repo_id=${HF_USER}/bimanual-so100-handover-cube \ + --dataset.repo_id=${HF_USER}/bimanual-so-handover-cube \ --dataset.num_episodes=25 \ --dataset.single_task="Grab and handover the red cube to the other arm" ``` @@ -69,7 +73,9 @@ from lerobot.cameras import ( # noqa: F401 CameraConfig, # noqa: F401 ) from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig # noqa: F401 +from lerobot.cameras.reachy2_camera.configuration_reachy2_camera import Reachy2CameraConfig # noqa: F401 from lerobot.cameras.realsense.configuration_realsense import RealSenseCameraConfig # noqa: F401 +from lerobot.cameras.zmq.configuration_zmq import ZMQCameraConfig # noqa: F401 from lerobot.configs import parser from lerobot.configs.policies import PreTrainedConfig from lerobot.datasets.image_writer import safe_stop_image_writer @@ -92,25 +98,26 @@ from lerobot.processor.rename_processor import rename_stats from lerobot.robots import ( # noqa: F401 Robot, RobotConfig, - bi_so100_follower, + bi_so_follower, earthrover_mini_plus, hope_jr, koch_follower, make_robot_from_config, omx_follower, - so100_follower, - so101_follower, + reachy2, + so_follower, + unitree_g1, ) from lerobot.teleoperators import ( # noqa: F401 Teleoperator, TeleoperatorConfig, - bi_so100_leader, + bi_so_leader, homunculus, koch_leader, make_teleoperator_from_config, omx_leader, - so100_leader, - so101_leader, + reachy2_teleoperator, + so_leader, ) from lerobot.teleoperators.keyboard.teleop_keyboard import KeyboardTeleop from lerobot.utils.constants import ACTION, OBS_STR @@ -167,6 +174,9 @@ class DatasetRecordConfig: # Number of episodes to record before batch encoding videos # Set to 1 for immediate encoding (default behavior), or higher for batched encoding video_encoding_batch_size: int = 1 + # Video codec for encoding videos. 
Options: 'h264', 'hevc', 'libsvtav1'. + # Use 'h264' for faster encoding on systems where AV1 encoding is CPU-heavy. + vcodec: str = "libsvtav1" # Rename map for the observation to override the image and state keys rename_map: dict[str, str] = field(default_factory=dict) @@ -185,6 +195,12 @@ class RecordConfig: policy: PreTrainedConfig | None = None # Display all cameras on screen display_data: bool = False + # Display data on a remote Rerun server + display_ip: str | None = None + # Port of the remote Rerun server + display_port: int | None = None + # Whether to display compressed images in Rerun + display_compressed_images: bool = False # Use vocal synthesis to read events. play_sounds: bool = True # Resume recording on an existing dataset. @@ -193,8 +209,10 @@ class RecordConfig: def __post_init__(self): # HACK: We parse again the cli args here to get the pretrained path if there was one. policy_path = parser.get_path_arg("policy") + if policy_path: cli_overrides = parser.get_cli_overrides("policy") + self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides) self.policy.pretrained_path = policy_path @@ -259,6 +277,7 @@ def record_loop( control_time_s: int | None = None, single_task: str | None = None, display_data: bool = False, + display_compressed_images: bool = False, ): if dataset is not None and dataset.fps != fps: raise ValueError(f"The dataset fps should be equal to requested fps ({dataset.fps} != {fps}).") @@ -273,8 +292,8 @@ def record_loop( if isinstance( t, ( - so100_leader.SO100Leader - | so101_leader.SO101Leader + so_leader.SO100Leader + | so_leader.SO101Leader | koch_leader.KochLeader | omx_leader.OmxLeader ), @@ -369,10 +388,12 @@ def record_loop( dataset.add_frame(frame) if display_data: - log_rerun_data(observation=obs_processed, action=action_values) + log_rerun_data( + observation=obs_processed, action=action_values, compress_images=display_compressed_images + ) dt_s = time.perf_counter() - start_loop_t - precise_sleep(1 / fps - dt_s) + precise_sleep(max(1 / fps - dt_s, 0.0)) timestamp = time.perf_counter() - start_episode_t @@ -382,7 +403,12 @@ def record(cfg: RecordConfig) -> LeRobotDataset: init_logging() logging.info(pformat(asdict(cfg))) if cfg.display_data: - init_rerun(session_name="recording") + init_rerun(session_name="recording", ip=cfg.display_ip, port=cfg.display_port) + display_compressed_images = ( + True + if (cfg.display_data and cfg.display_ip is not None and cfg.display_port is not None) + else cfg.display_compressed_images + ) robot = make_robot_from_config(cfg.robot) teleop = make_teleoperator_from_config(cfg.teleop) if cfg.teleop is not None else None @@ -413,6 +439,7 @@ def record(cfg: RecordConfig) -> LeRobotDataset: cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, + vcodec=cfg.dataset.vcodec, ) if hasattr(robot, "cameras") and len(robot.cameras) > 0: @@ -434,6 +461,7 @@ def record(cfg: RecordConfig) -> LeRobotDataset: image_writer_processes=cfg.dataset.num_image_writer_processes, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras), batch_encoding_size=cfg.dataset.video_encoding_batch_size, + vcodec=cfg.dataset.vcodec, ) # Load pretrained policy @@ -476,6 +504,7 @@ def record(cfg: RecordConfig) -> LeRobotDataset: control_time_s=cfg.dataset.episode_time_s, single_task=cfg.dataset.single_task, display_data=cfg.display_data, + display_compressed_images=display_compressed_images, ) # Execute a few seconds without recording to 
give time to manually reset the environment @@ -484,6 +513,11 @@ def record(cfg: RecordConfig) -> LeRobotDataset: (recorded_episodes < cfg.dataset.num_episodes - 1) or events["rerecord_episode"] ): log_say("Reset the environment", cfg.play_sounds) + + # reset g1 robot + if robot.name == "unitree_g1": + robot.reset() + record_loop( robot=robot, events=events, diff --git a/src/lerobot/scripts/lerobot_replay.py b/src/lerobot/scripts/lerobot_replay.py index d5808c768..49c06d643 100644 --- a/src/lerobot/scripts/lerobot_replay.py +++ b/src/lerobot/scripts/lerobot_replay.py @@ -29,7 +29,7 @@ lerobot-replay \ Example replay with bimanual so100: ```shell lerobot-replay \ - --robot.type=bi_so100_follower \ + --robot.type=bi_so_follower \ --robot.left_arm_port=/dev/tty.usbmodem5A460851411 \ --robot.right_arm_port=/dev/tty.usbmodem5A460812391 \ --robot.id=bimanual_follower \ @@ -53,14 +53,15 @@ from lerobot.processor import ( from lerobot.robots import ( # noqa: F401 Robot, RobotConfig, - bi_so100_follower, + bi_so_follower, earthrover_mini_plus, hope_jr, koch_follower, make_robot_from_config, omx_follower, - so100_follower, - so101_follower, + reachy2, + so_follower, + unitree_g1, ) from lerobot.utils.constants import ACTION from lerobot.utils.import_utils import register_third_party_plugins @@ -123,7 +124,7 @@ def replay(cfg: ReplayConfig): _ = robot.send_action(processed_action) dt_s = time.perf_counter() - start_episode_t - precise_sleep(1 / dataset.fps - dt_s) + precise_sleep(max(1 / dataset.fps - dt_s, 0.0)) robot.disconnect() diff --git a/src/lerobot/scripts/lerobot_setup_can.py b/src/lerobot/scripts/lerobot_setup_can.py new file mode 100644 index 000000000..55de74724 --- /dev/null +++ b/src/lerobot/scripts/lerobot_setup_can.py @@ -0,0 +1,360 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Setup and debug CAN interfaces for Damiao motors (e.g., OpenArms). + +Examples: + +Setup CAN interfaces with CAN FD: +```shell +lerobot-setup-can --mode=setup --interfaces=can0,can1,can2,can3 +``` + +Test motors on a single interface: +```shell +lerobot-setup-can --mode=test --interfaces=can0 +``` + +Test motors on all interfaces: +```shell +lerobot-setup-can --mode=test --interfaces=can0,can1,can2,can3 +``` + +Speed test: +```shell +lerobot-setup-can --mode=speed --interfaces=can0 +``` +""" + +import subprocess +import sys +import time +from dataclasses import dataclass, field + +import draccus + +from lerobot.utils.import_utils import is_package_available + +MOTOR_NAMES = { + 0x01: "joint_1", + 0x02: "joint_2", + 0x03: "joint_3", + 0x04: "joint_4", + 0x05: "joint_5", + 0x06: "joint_6", + 0x07: "joint_7", + 0x08: "gripper", +} + + +@dataclass +class CANSetupConfig: + mode: str = "test" + interfaces: str = "can0" # Comma-separated, e.g. 
"can0,can1,can2,can3" + bitrate: int = 1000000 + data_bitrate: int = 5000000 + use_fd: bool = True + motor_ids: list[int] = field(default_factory=lambda: list(range(0x01, 0x09))) + timeout: float = 1.0 + speed_iterations: int = 100 + + def get_interfaces(self) -> list[str]: + return [i.strip() for i in self.interfaces.split(",") if i.strip()] + + +def check_interface_status(interface: str) -> tuple[bool, str, bool]: + """Check if CAN interface is UP and configured.""" + try: + result = subprocess.run(["ip", "link", "show", interface], capture_output=True, text=True) # nosec B607 + if result.returncode != 0: + return False, "Interface not found", False + + output = result.stdout + is_up = "UP" in output + is_fd = "fd on" in output.lower() or "canfd" in output.lower() + status = "UP" if is_up else "DOWN" + if is_fd: + status += " (CAN FD)" + + return is_up, status, is_fd + except FileNotFoundError: + return False, "ip command not found", False + + +def setup_interface(interface: str, bitrate: int, data_bitrate: int, use_fd: bool) -> bool: + """Configure a CAN interface.""" + try: + subprocess.run(["sudo", "ip", "link", "set", interface, "down"], check=False, capture_output=True) # nosec B607 + + cmd = ["sudo", "ip", "link", "set", interface, "type", "can", "bitrate", str(bitrate)] + if use_fd: + cmd.extend(["dbitrate", str(data_bitrate), "fd", "on"]) + + result = subprocess.run(cmd, capture_output=True, text=True) # nosec B607 + if result.returncode != 0: + print(f" ✗ Failed to configure: {result.stderr}") + return False + + result = subprocess.run( # nosec B607 + ["sudo", "ip", "link", "set", interface, "up"], capture_output=True, text=True + ) + if result.returncode != 0: + print(f" ✗ Failed to bring up: {result.stderr}") + return False + + return True + except Exception as e: + print(f" ✗ Error: {e}") + return False + + +def test_motor(bus, motor_id: int, timeout: float, use_fd: bool): + """Test a single motor and return responses.""" + import can + + enable_msg = can.Message( + arbitration_id=motor_id, + data=[0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFC], + is_extended_id=False, + is_fd=use_fd, + ) + + try: + bus.send(enable_msg) + except Exception as e: + return None, f"Send error: {e}" + + responses = [] + start_time = time.time() + + while time.time() - start_time < timeout: + msg = bus.recv(timeout=0.1) + if msg: + responses.append((msg.arbitration_id, msg.data.hex(), getattr(msg, "is_fd", False))) + + disable_msg = can.Message( + arbitration_id=motor_id, + data=[0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFD], + is_extended_id=False, + is_fd=use_fd, + ) + try: + bus.send(disable_msg) + except Exception: + print(f"Error sending message to motor 0x{motor_id:02X}") + + return responses, None + + +def test_interface(cfg: CANSetupConfig, interface: str): + """Test all motors on a CAN interface.""" + import can + + is_up, status, _ = check_interface_status(interface) + print(f"\n{interface}: {status}") + + if not is_up: + print(f" ⚠ Interface is not UP. 
Run: lerobot-setup-can --mode=setup --interfaces {interface}") + return {} + + try: + kwargs = {"channel": interface, "interface": "socketcan", "bitrate": cfg.bitrate} + if cfg.use_fd: + kwargs.update({"data_bitrate": cfg.data_bitrate, "fd": True}) + bus = can.interface.Bus(**kwargs) + except Exception as e: + print(f" ✗ Connection failed: {e}") + return {} + + results = {} + try: + while bus.recv(timeout=0.01): + pass + + for motor_id in cfg.motor_ids: + motor_name = MOTOR_NAMES.get(motor_id, f"motor_0x{motor_id:02X}") + responses, error = test_motor(bus, motor_id, cfg.timeout, cfg.use_fd) + + if error: + print(f" Motor 0x{motor_id:02X} ({motor_name}): ✗ {error}") + results[motor_id] = {"found": False, "error": error} + elif responses: + print(f" Motor 0x{motor_id:02X} ({motor_name}): ✓ FOUND") + for resp_id, data, is_fd in responses: + fd_flag = " [FD]" if is_fd else "" + print(f" → Response 0x{resp_id:02X}{fd_flag}: {data}") + results[motor_id] = {"found": True, "responses": responses} + else: + print(f" Motor 0x{motor_id:02X} ({motor_name}): ✗ No response") + results[motor_id] = {"found": False} + + time.sleep(0.05) + finally: + bus.shutdown() + + found = sum(1 for r in results.values() if r.get("found")) + print(f"\n Summary: {found}/{len(cfg.motor_ids)} motors found") + return results + + +def speed_test(cfg: CANSetupConfig, interface: str): + """Test communication speed with motors.""" + import can + + is_up, status, _ = check_interface_status(interface) + if not is_up: + print(f"{interface}: {status} - skipping") + return + + print(f"\n{interface}: Running speed test ({cfg.speed_iterations} iterations)...") + + try: + kwargs = {"channel": interface, "interface": "socketcan", "bitrate": cfg.bitrate} + if cfg.use_fd: + kwargs.update({"data_bitrate": cfg.data_bitrate, "fd": True}) + bus = can.interface.Bus(**kwargs) + except Exception as e: + print(f" ✗ Connection failed: {e}") + return + + responding_motor = None + for motor_id in cfg.motor_ids: + responses, _ = test_motor(bus, motor_id, 0.5, cfg.use_fd) + if responses: + responding_motor = motor_id + break + + if not responding_motor: + print(" ✗ No responding motors found") + bus.shutdown() + return + + print(f" Testing with motor 0x{responding_motor:02X}...") + latencies = [] + + for _ in range(cfg.speed_iterations): + start = time.perf_counter() + msg = can.Message( + arbitration_id=responding_motor, + data=[0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFC], + is_extended_id=False, + is_fd=cfg.use_fd, + ) + bus.send(msg) + resp = bus.recv(timeout=0.1) + if resp: + latencies.append((time.perf_counter() - start) * 1000) + + bus.shutdown() + + if latencies: + avg_latency = sum(latencies) / len(latencies) + hz = 1000.0 / avg_latency if avg_latency > 0 else 0 + print(f" ✓ Success rate: {len(latencies)}/{cfg.speed_iterations}") + print(f" ✓ Avg latency: {avg_latency:.2f} ms") + print(f" ✓ Max frequency: {hz:.1f} Hz") + else: + print(" ✗ No successful responses") + + +def run_setup(cfg: CANSetupConfig): + """Setup CAN interfaces.""" + print("=" * 50) + print("CAN Interface Setup") + print("=" * 50) + print(f"Mode: {'CAN FD' if cfg.use_fd else 'CAN 2.0'}") + print(f"Bitrate: {cfg.bitrate / 1_000_000:.1f} Mbps") + if cfg.use_fd: + print(f"Data bitrate: {cfg.data_bitrate / 1_000_000:.1f} Mbps") + print() + + interfaces = cfg.get_interfaces() + for interface in interfaces: + print(f"Configuring {interface}...") + if setup_interface(interface, cfg.bitrate, cfg.data_bitrate, cfg.use_fd): + is_up, status, _ = check_interface_status(interface) + 
print(f" ✓ {interface}: {status}") + else: + print(f" ✗ {interface}: Failed") + + print("\nSetup complete!") + print("\nNext: Test motors with:") + print(f" lerobot-setup-can --mode=test --interfaces {','.join(interfaces)}") + + +def run_test(cfg: CANSetupConfig): + """Test motors on CAN interfaces.""" + print("=" * 50) + print("CAN Motor Test") + print("=" * 50) + print(f"Testing motors 0x{min(cfg.motor_ids):02X}-0x{max(cfg.motor_ids):02X}") + print(f"Mode: {'CAN FD' if cfg.use_fd else 'CAN 2.0'}") + print() + + interfaces = cfg.get_interfaces() + all_results = {} + for interface in interfaces: + all_results[interface] = test_interface(cfg, interface) + + total_found = sum(sum(1 for r in res.values() if r.get("found")) for res in all_results.values()) + + print("\n" + "=" * 50) + print("Summary") + print("=" * 50) + print(f"Total motors found: {total_found}") + + if total_found == 0: + print("\n⚠ No motors found! Check:") + print(" 1. Motors are powered (24V)") + print(" 2. CAN wiring (CANH, CANL, GND)") + print(" 3. Motor timeout parameter > 0 (use Damiao tools)") + print(" 4. 120Ω termination at both cable ends") + print(f" 5. Interface configured: lerobot-setup-can --mode=setup --interfaces {interfaces[0]}") + + +def run_speed(cfg: CANSetupConfig): + """Run speed tests on CAN interfaces.""" + print("=" * 50) + print("CAN Speed Test") + print("=" * 50) + + for interface in cfg.get_interfaces(): + speed_test(cfg, interface) + + +@draccus.wrap() +def setup_can(cfg: CANSetupConfig): + if not is_package_available("can"): + print("Error: python-can not installed. Install with: pip install python-can") + sys.exit(1) + + if cfg.mode == "setup": + run_setup(cfg) + elif cfg.mode == "test": + run_test(cfg) + elif cfg.mode == "speed": + run_speed(cfg) + else: + print(f"Unknown mode: {cfg.mode}") + print("Available modes: setup, test, speed") + sys.exit(1) + + +def main(): + setup_can() + + +if __name__ == "__main__": + main() diff --git a/src/lerobot/scripts/lerobot_teleoperate.py b/src/lerobot/scripts/lerobot_teleoperate.py index bf722d6f1..18d8863d6 100644 --- a/src/lerobot/scripts/lerobot_teleoperate.py +++ b/src/lerobot/scripts/lerobot_teleoperate.py @@ -33,18 +33,18 @@ Example teleoperation with bimanual so100: ```shell lerobot-teleoperate \ - --robot.type=bi_so100_follower \ - --robot.left_arm_port=/dev/tty.usbmodem5A460851411 \ - --robot.right_arm_port=/dev/tty.usbmodem5A460812391 \ + --robot.type=bi_so_follower \ + --robot.left_arm_config.port=/dev/tty.usbmodem5A460822851 \ + --robot.right_arm_config.port=/dev/tty.usbmodem5A460814411 \ --robot.id=bimanual_follower \ - --robot.cameras='{ - left: {"type": "opencv", "index_or_path": 0, "width": 1920, "height": 1080, "fps": 30}, - top: {"type": "opencv", "index_or_path": 1, "width": 1920, "height": 1080, "fps": 30}, - right: {"type": "opencv", "index_or_path": 2, "width": 1920, "height": 1080, "fps": 30} + --robot.left_arm_config.cameras='{ + wrist: {"type": "opencv", "index_or_path": 1, "width": 640, "height": 480, "fps": 30}, + }' --robot.right_arm_config.cameras='{ + wrist: {"type": "opencv", "index_or_path": 2, "width": 640, "height": 480, "fps": 30}, }' \ - --teleop.type=bi_so100_leader \ - --teleop.left_arm_port=/dev/tty.usbmodem5A460828611 \ - --teleop.right_arm_port=/dev/tty.usbmodem5A460826981 \ + --teleop.type=bi_so_leader \ + --teleop.left_arm_config.port=/dev/tty.usbmodem5A460852721 \ + --teleop.right_arm_config.port=/dev/tty.usbmodem5A460819811 \ --teleop.id=bimanual_leader \ --display_data=true ``` @@ -70,27 +70,27 @@ from 
lerobot.processor import ( from lerobot.robots import ( # noqa: F401 Robot, RobotConfig, - bi_so100_follower, + bi_so_follower, earthrover_mini_plus, hope_jr, koch_follower, make_robot_from_config, omx_follower, - so100_follower, - so101_follower, + reachy2, + so_follower, ) from lerobot.teleoperators import ( # noqa: F401 Teleoperator, TeleoperatorConfig, - bi_so100_leader, + bi_so_leader, gamepad, homunculus, keyboard, koch_leader, make_teleoperator_from_config, omx_leader, - so100_leader, - so101_leader, + reachy2_teleoperator, + so_leader, ) from lerobot.utils.import_utils import register_third_party_plugins from lerobot.utils.robot_utils import precise_sleep @@ -108,6 +108,12 @@ class TeleoperateConfig: teleop_time_s: float | None = None # Display all cameras on screen display_data: bool = False + # Display data on a remote Rerun server + display_ip: str | None = None + # Port of the remote Rerun server + display_port: int | None = None + # Whether to display compressed images in Rerun + display_compressed_images: bool = False def teleop_loop( @@ -119,6 +125,7 @@ def teleop_loop( robot_observation_processor: RobotProcessorPipeline[RobotObservation, RobotObservation], display_data: bool = False, duration: float | None = None, + display_compressed_images: bool = False, ): """ This function continuously reads actions from a teleoperation device, processes them through optional @@ -130,6 +137,7 @@ def teleop_loop( robot: The robot instance being controlled. fps: The target frequency for the control loop in frames per second. display_data: If True, fetches robot observations and displays them in the console and Rerun. + display_compressed_images: If True, compresses images before sending them to Rerun for display. duration: The maximum duration of the teleoperation loop in seconds. If None, the loop runs indefinitely. teleop_action_processor: An optional pipeline to process raw actions from the teleoperator. robot_action_processor: An optional pipeline to process actions before they are sent to the robot. 
@@ -157,7 +165,7 @@ def teleop_loop( # Process action for robot through pipeline robot_action_to_send = robot_action_processor((teleop_action, obs)) - # Send processed action to robot (robot_action_processor.to_output should return dict[str, Any]) + # Send processed action to robot (robot_action_processor.to_output should return RobotAction) _ = robot.send_action(robot_action_to_send) if display_data: @@ -167,6 +175,7 @@ def teleop_loop( log_rerun_data( observation=obs_transition, action=teleop_action, + compress_images=display_compressed_images, ) print("\n" + "-" * (display_len + 10)) @@ -177,7 +186,7 @@ def teleop_loop( move_cursor_up(len(robot_action_to_send) + 3) dt_s = time.perf_counter() - loop_start - precise_sleep(1 / fps - dt_s) + precise_sleep(max(1 / fps - dt_s, 0.0)) loop_s = time.perf_counter() - loop_start print(f"Teleop loop time: {loop_s * 1e3:.2f}ms ({1 / loop_s:.0f} Hz)") move_cursor_up(1) @@ -191,7 +200,12 @@ def teleoperate(cfg: TeleoperateConfig): init_logging() logging.info(pformat(asdict(cfg))) if cfg.display_data: - init_rerun(session_name="teleoperation") + init_rerun(session_name="teleoperation", ip=cfg.display_ip, port=cfg.display_port) + display_compressed_images = ( + True + if (cfg.display_data and cfg.display_ip is not None and cfg.display_port is not None) + else cfg.display_compressed_images + ) teleop = make_teleoperator_from_config(cfg.teleop) robot = make_robot_from_config(cfg.robot) @@ -210,6 +224,7 @@ def teleoperate(cfg: TeleoperateConfig): teleop_action_processor=teleop_action_processor, robot_action_processor=robot_action_processor, robot_observation_processor=robot_observation_processor, + display_compressed_images=display_compressed_images, ) except KeyboardInterrupt: pass diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index 6cf733442..93b99e245 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import dataclasses import logging import time from contextlib import nullcontext @@ -164,6 +165,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): cfg: A `TrainPipelineConfig` object containing all training configurations. accelerator: Optional Accelerator instance. If None, one will be created automatically. """ + cfg.validate() + # Create Accelerator if not provided # It will automatically detect if running in distributed mode or single-process mode # We set step_scheduler_with_optimizer=False to prevent accelerate from adjusting the lr_scheduler steps based on the num_processes @@ -172,7 +175,14 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): from accelerate.utils import DistributedDataParallelKwargs ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) - accelerator = Accelerator(step_scheduler_with_optimizer=False, kwargs_handlers=[ddp_kwargs]) + # Accelerate auto-detects the device based on the available hardware and ignores the policy.device setting. + # Force the device to be CPU when policy.device is set to CPU. 
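+    # Note: passing cpu=True to Accelerator makes Accelerate stay on CPU instead of
+    # auto-selecting CUDA/MPS; otherwise policy.device == "cpu" would be ignored.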
+ force_cpu = cfg.policy.device == "cpu" + accelerator = Accelerator( + step_scheduler_with_optimizer=False, + kwargs_handlers=[ddp_kwargs], + cpu=force_cpu, + ) init_logging(accelerator=accelerator) @@ -180,8 +190,6 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): # When using accelerate, only the main process should log to avoid duplicate outputs is_main_process = accelerator.is_main_process - cfg.validate() - # Only log on main process if is_main_process: logging.info(pformat(cfg.to_dict())) @@ -217,9 +225,8 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): # On real-world data, no need to create an environment as evaluations are done outside train.py, # using the eval.py instead, with gym_dora environment and dora-rs. eval_env = None - if cfg.eval_freq > 0 and cfg.env is not None: - if is_main_process: - logging.info("Creating env") + if cfg.eval_freq > 0 and cfg.env is not None and is_main_process: + logging.info("Creating env") eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) if is_main_process: @@ -230,6 +237,12 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): rename_map=cfg.rename_map, ) + if cfg.peft is not None: + logging.info("Using PEFT! Wrapping model.") + # Convert CLI peft config to dict for overrides + peft_cli_overrides = dataclasses.asdict(cfg.peft) + policy = policy.wrap_with_peft(peft_cli_overrides=peft_cli_overrides) + # Wait for all processes to finish policy creation before continuing accelerator.wait_for_everyone() @@ -502,7 +515,10 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None): if cfg.policy.push_to_hub: unwrapped_policy = accelerator.unwrap_model(policy) - unwrapped_policy.push_model_to_hub(cfg) + if cfg.policy.use_peft: + unwrapped_policy.push_model_to_hub(cfg, peft_model=unwrapped_policy) + else: + unwrapped_policy.push_model_to_hub(cfg) preprocessor.push_to_hub(cfg.policy.repo_id) postprocessor.push_to_hub(cfg.policy.repo_id) diff --git a/src/lerobot/scripts/lerobot_train_tokenizer.py b/src/lerobot/scripts/lerobot_train_tokenizer.py new file mode 100644 index 000000000..1d8f4644b --- /dev/null +++ b/src/lerobot/scripts/lerobot_train_tokenizer.py @@ -0,0 +1,604 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Train FAST tokenizer for action encoding. + +This script: +1. Loads action chunks from LeRobotDataset (with episode sampling) +2. Optionally applies delta transforms (relative vs absolute actions) +3. Extracts specified action dimensions for encoding +4. Applies normalization (MEAN_STD, MIN_MAX, QUANTILES, or other modes) +5. Trains FAST tokenizer (BPE on DCT coefficients) on the action chunks +6. Saves tokenizer to output directory +7. Optionally pushes tokenizer to Hugging Face Hub +8. 
Reports compression statistics + +Example: + +```shell +lerobot-train-tokenizer \ + --repo_id=user/dataset_name \ + --action_horizon=10 \ + --max_episodes=100 \ + --sample_fraction=0.1 \ + --encoded_dims="0:6" \ + --delta_dims="0,1,2,3,4,5" \ + --use_delta_transform=true \ + --state_key="observation.state" \ + --normalization_mode="QUANTILES" \ + --vocab_size=1024 \ + --scale=10.0 \ + --output_dir="./fast_tokenizer_dataset_name" \ + --push_to_hub=true \ + --hub_repo_id="user/fast_tokenizer_dataset_name" \ + --hub_private=false +""" + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING + +import numpy as np +import torch +from huggingface_hub import HfApi + +from lerobot.utils.import_utils import _transformers_available + +if TYPE_CHECKING or _transformers_available: + from transformers import AutoProcessor +else: + AutoProcessor = None + +from lerobot.configs import parser +from lerobot.configs.types import NormalizationMode +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.utils.constants import ACTION, OBS_STATE + + +@dataclass +class TokenizerTrainingConfig: + """Configuration for training FAST tokenizer.""" + + # LeRobot dataset repository ID + repo_id: str + # Root directory for dataset (default: ~/.cache/huggingface/lerobot) + root: str | None = None + # Number of future actions in each chunk + action_horizon: int = 10 + # Max episodes to use (None = all episodes in dataset) + max_episodes: int | None = None + # Fraction of chunks to sample per episode + sample_fraction: float = 0.1 + # Comma-separated dimension ranges to encode (e.g., "0:6,7:23") + encoded_dims: str = "0:6,7:23" + # Comma-separated dimension indices for delta transform (e.g., "0,1,2,3,4,5") + delta_dims: str | None = None + # Whether to apply delta transform (relative actions vs absolute actions) + use_delta_transform: bool = False + # Dataset key for state observations (default: "observation.state") + state_key: str = OBS_STATE + # Normalization mode (MEAN_STD, MIN_MAX, QUANTILES, QUANTILE10, IDENTITY) + normalization_mode: str = "QUANTILES" + # FAST vocabulary size (BPE vocab size) + vocab_size: int = 1024 + # DCT scaling factor (default: 10.0) + scale: float = 10.0 + # Directory to save tokenizer (default: ./fast_tokenizer_{repo_id}) + output_dir: str | None = None + # Whether to push the tokenizer to Hugging Face Hub + push_to_hub: bool = False + # Hub repository ID (e.g., "username/tokenizer-name"). If None, uses output_dir name + hub_repo_id: str | None = None + # Whether to create a private repository on the Hub + hub_private: bool = False + + +def apply_delta_transform(state: np.ndarray, actions: np.ndarray, delta_dims: list[int] | None) -> np.ndarray: + """Apply delta transform to specified dimensions. + + Args: + state: Current state [D] + actions: Future actions [D] + delta_dims: List of dimension indices to apply delta transform to + + Returns: + Transformed actions [D] + """ + if delta_dims is None or len(delta_dims) == 0: + return actions + + delta_actions = actions.copy() + for dim in delta_dims: + delta_actions[dim] = actions[dim] - state[dim] + + return delta_actions + + +def apply_normalization( + data: np.ndarray, + stats: dict[str, np.ndarray], + mode: NormalizationMode, + eps: float = 1e-8, +) -> np.ndarray: + """Apply normalization to data based on the specified mode. 
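+
+    For example, MEAN_STD maps data to (data - mean) / max(std, eps), while QUANTILES
+    clips values to [q01, q99] and rescales the result to [-1, 1] (see the branches below).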
+ + Args: + data: Data to normalize [N, H, D] or [D] + stats: Dictionary of statistics (mean, std, min, max, q01, q99, q10, q90) + mode: Normalization mode to apply + eps: Small epsilon for numerical stability + + Returns: + Normalized data with the same shape as input + """ + if mode == NormalizationMode.IDENTITY: + return data + + if mode == NormalizationMode.MEAN_STD: + mean = stats.get("mean") + std = stats.get("std") + if mean is None or std is None: + raise ValueError("MEAN_STD mode requires 'mean' and 'std' in stats") + return (data - mean) / np.maximum(std, eps) + + if mode == NormalizationMode.MIN_MAX: + min_val = stats.get("min") + max_val = stats.get("max") + if min_val is None or max_val is None: + raise ValueError("MIN_MAX mode requires 'min' and 'max' in stats") + denom = np.maximum(max_val - min_val, eps) + return 2.0 * (data - min_val) / denom - 1.0 + + if mode == NormalizationMode.QUANTILES: + q01 = stats.get("q01") + q99 = stats.get("q99") + if q01 is None or q99 is None: + raise ValueError("QUANTILES mode requires 'q01' and 'q99' in stats") + denom = np.maximum(q99 - q01, eps) + # Clip to quantile range then normalize to [-1, 1] + clipped = np.clip(data, q01, q99) + return 2.0 * (clipped - q01) / denom - 1.0 + + if mode == NormalizationMode.QUANTILE10: + q10 = stats.get("q10") + q90 = stats.get("q90") + if q10 is None or q90 is None: + raise ValueError("QUANTILE10 mode requires 'q10' and 'q90' in stats") + denom = np.maximum(q90 - q10, eps) + # Clip to quantile range then normalize to [-1, 1] + clipped = np.clip(data, q10, q90) + return 2.0 * (clipped - q10) / denom - 1.0 + + raise ValueError(f"Unsupported normalization mode: {mode}") + + +def process_episode(args): + """Process single episode and return action chunks.""" + dataset, ep_idx, action_horizon, delta_dims, sample_fraction, state_key, use_delta_transform = args + + try: + # get episode info + ep_info = dataset.meta.episodes[ep_idx] + from_idx = ep_info["dataset_from_index"] + to_idx = ep_info["dataset_to_index"] + ep_length = to_idx - from_idx + + if ep_length < action_horizon: + return None + + # load all frames in episode + # if dataset has episode filtering, we need to use the mapping + states = [] + actions = [] + + for abs_idx in range(from_idx, to_idx): + # map absolute index to relative index if needed + if dataset._absolute_to_relative_idx is not None: + if abs_idx not in dataset._absolute_to_relative_idx: + # this episode's frames aren't in the filtered dataset + return None + rel_idx = dataset._absolute_to_relative_idx[abs_idx] + else: + rel_idx = abs_idx + + frame = dataset.hf_dataset[rel_idx] + + # get state (could be from observation.state or other state key) + if state_key in frame: + state = ( + frame[state_key].numpy() + if torch.is_tensor(frame[state_key]) + else np.array(frame[state_key]) + ) + else: + # if no state key, use zeros (no delta transform) + state = np.zeros_like( + frame[ACTION].numpy() if torch.is_tensor(frame[ACTION]) else np.array(frame[ACTION]) + ) + + action = frame[ACTION].numpy() if torch.is_tensor(frame[ACTION]) else np.array(frame[ACTION]) + + states.append(state) + actions.append(action) + + states = np.array(states) + actions = np.array(actions) + + # create action chunks (sliding window) + # all actions in a chunk are relative to the FIRST state in that chunk + action_chunks = [] + + for i in range(len(states) - action_horizon + 1): + current_state = states[i] # First state in chunk + future_absolute_actions = actions[i : i + action_horizon] + + if 
use_delta_transform: + # relative actions + delta_chunk = np.zeros_like(future_absolute_actions) + for t in range(action_horizon): + delta_chunk[t] = apply_delta_transform( + current_state, + future_absolute_actions[t], + delta_dims, + ) + action_chunks.append(delta_chunk) + else: + # absolute actions (no delta) + action_chunks.append(future_absolute_actions) + + if len(action_chunks) == 0: + return None + + action_chunks = np.array(action_chunks) + + # sample chunks + if sample_fraction < 1.0: + n_chunks = len(action_chunks) + n_samples = max(1, int(n_chunks * sample_fraction)) + episode_seed = hash(ep_idx) % (2**31) + rng = np.random.RandomState(episode_seed) + indices = rng.choice(n_chunks, size=n_samples, replace=False) + action_chunks = action_chunks[indices] + + return action_chunks + + except Exception as e: + print(f"Error processing episode {ep_idx}: {e}") + import traceback + + traceback.print_exc() + return None + + +def train_fast_tokenizer( + action_chunks: np.ndarray, + vocab_size: int = 1024, + scale: float = 10.0, +) -> AutoProcessor: + """ + Train FAST tokenizer (BPE on DCT coefficients) on action chunks. + + Uses the .fit() method to train a new tokenizer on the provided data. + + Args: + action_chunks: Array of action chunks [N, H, D] where N=num_chunks, H=horizon, D=action_dim + vocab_size: BPE vocabulary size + scale: DCT scaling factor for quantization + + Returns: + Trained FAST tokenizer + """ + print(f"Training FAST tokenizer on {len(action_chunks)} action chunks...") + print(f"Action chunk shape: {action_chunks.shape}") + print(f"Vocab size: {vocab_size}") + print(f"DCT scale: {scale}") + + # download the tokenizer source code (not pretrained weights) + # we'll train a new tokenizer on our own data + base_tokenizer = AutoProcessor.from_pretrained("physical-intelligence/fast", trust_remote_code=True) + + # convert action_chunks array to list of arrays (expected by .fit()) + action_data_list = [action_chunks[i] for i in range(len(action_chunks))] + + # train the new tokenizer on our action data using .fit() + # this trains the BPE tokenizer on DCT coefficients + print("Training new tokenizer (this may take a few minutes)...") + tokenizer = base_tokenizer.fit( + action_data_list, + scale=scale, + vocab_size=vocab_size, + time_horizon=action_chunks.shape[1], # action_horizon + action_dim=action_chunks.shape[2], # encoded dimensions + ) + print("✓ Tokenizer training complete!") + + # validate it works + sample_chunk = action_chunks[0] + encoded = tokenizer(sample_chunk[None])[0] + if isinstance(encoded, list): + encoded = np.array(encoded) + print(f"Sample encoding: {len(encoded)} tokens for chunk shape {sample_chunk.shape}") + + return tokenizer + + +def compute_compression_stats(tokenizer, action_chunks: np.ndarray): + """Compute compression statistics.""" + print("\nComputing compression statistics...") + + # sample for stats (use max 1000 chunks for speed) + sample_size = min(1000, len(action_chunks)) + sample_indices = np.random.RandomState(42).choice(len(action_chunks), size=sample_size, replace=False) + sample_chunks = action_chunks[sample_indices] + + token_lengths = [] + for chunk in sample_chunks: + encoded = tokenizer(chunk[None])[0] + if isinstance(encoded, list): + token_lengths.append(len(encoded)) + else: + token_lengths.append(encoded.shape[0] if hasattr(encoded, "shape") else len(encoded)) + + token_lengths = np.array(token_lengths) + + # compression ratio: (H * D) / avg_tokens + input_size = action_chunks.shape[1] * action_chunks.shape[2] + avg_tokens 
+    avg_tokens = np.mean(token_lengths)
+    compression_ratio = input_size / avg_tokens
+
+    stats = {
+        "compression_ratio": float(compression_ratio),
+        "mean_token_length": float(np.mean(token_lengths)),
+        "p99_token_length": float(np.percentile(token_lengths, 99)),
+        "min_token_length": float(np.min(token_lengths)),
+        "max_token_length": float(np.max(token_lengths)),
+    }
+
+    print("Compression Statistics:")
+    print(f"  Average compression ratio: {stats['compression_ratio']:.2f}x")
+    print(f"  Mean token length: {stats['mean_token_length']:.1f}")
+    print(f"  P99 token length: {stats['p99_token_length']:.0f}")
+    print(f"  Min token length: {stats['min_token_length']:.0f}")
+    print(f"  Max token length: {stats['max_token_length']:.0f}")
+
+    return stats
+
+
+@parser.wrap()
+def train_tokenizer(cfg: TokenizerTrainingConfig):
+    """
+    Train FAST tokenizer for action encoding.
+
+    Args:
+        cfg: TokenizerTrainingConfig dataclass with all configuration parameters
+    """
+    # load dataset
+    print(f"Loading dataset: {cfg.repo_id}")
+    dataset = LeRobotDataset(repo_id=cfg.repo_id, root=cfg.root)
+    print(f"Dataset loaded: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
+
+    # parse normalization mode
+    try:
+        norm_mode = NormalizationMode(cfg.normalization_mode)
+    except ValueError as err:
+        raise ValueError(
+            f"Invalid normalization_mode: {cfg.normalization_mode}. "
+            f"Must be one of: {', '.join([m.value for m in NormalizationMode])}"
+        ) from err
+    print(f"Normalization mode: {norm_mode.value}")
+
+    # parse encoded dimensions
+    encoded_dim_ranges = []
+    for range_str in cfg.encoded_dims.split(","):
+        start, end = map(int, range_str.strip().split(":"))
+        encoded_dim_ranges.append((start, end))
+
+    total_encoded_dims = sum(end - start for start, end in encoded_dim_ranges)
+    print(f"Encoding {total_encoded_dims} dimensions: {cfg.encoded_dims}")
+
+    # parse delta dimensions
+    delta_dim_list = None
+    if cfg.delta_dims is not None and cfg.delta_dims.strip():
+        delta_dim_list = [int(d.strip()) for d in cfg.delta_dims.split(",")]
+        print(f"Delta dimensions: {delta_dim_list}")
+    else:
+        print("No delta dimensions specified")
+
+    print(f"Use delta transform: {cfg.use_delta_transform}")
+    if cfg.use_delta_transform and (delta_dim_list is None or len(delta_dim_list) == 0):
+        print("Warning: use_delta_transform=True but no delta_dims specified. No delta will be applied.")
+
+    print(f"Action horizon: {cfg.action_horizon}")
+    print(f"State key: {cfg.state_key}")
+
+    # determine episodes to process
+    num_episodes = dataset.num_episodes
+    if cfg.max_episodes is not None:
+        num_episodes = min(cfg.max_episodes, num_episodes)
+
+    print(f"Processing {num_episodes} episodes...")
+
+    # process episodes sequentially (to avoid pickling issues with dataset)
+    all_chunks = []
+    for ep_idx in range(num_episodes):
+        if ep_idx % 10 == 0:
+            print(f"  Processing episode {ep_idx}/{num_episodes}...")
+
+        chunks = process_episode(
+            (
+                dataset,
+                ep_idx,
+                cfg.action_horizon,
+                delta_dim_list,
+                cfg.sample_fraction,
+                cfg.state_key,
+                cfg.use_delta_transform,
+            )
+        )
+        if chunks is not None:
+            all_chunks.append(chunks)
+
+    # concatenate all chunks
+    all_chunks = np.concatenate(all_chunks, axis=0)
+    print(f"Collected {len(all_chunks)} action chunks")
+
+    # extract only encoded dimensions FIRST (before normalization)
+    encoded_chunks = []
+    for start, end in encoded_dim_ranges:
+        encoded_chunks.append(all_chunks[:, :, start:end])
+    encoded_chunks = np.concatenate(encoded_chunks, axis=-1)  # [N, H, D_encoded]
+    print(f"Extracted {encoded_chunks.shape[-1]} encoded dimensions")
+
+    # apply normalization to encoded dimensions
+    print("\nBefore normalization - overall stats:")
+    print(f"  Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
+    print(f"  Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")
+
+    # get normalization stats from dataset
+    norm_stats = dataset.meta.stats
+    if norm_stats is not None and ACTION in norm_stats:
+        action_stats = norm_stats[ACTION]
+
+        # build encoded dimension indices
+        encoded_dim_indices = []
+        for start, end in encoded_dim_ranges:
+            encoded_dim_indices.extend(range(start, end))
+        encoded_dim_indices = np.array(encoded_dim_indices)
+
+        # extract stats for encoded dimensions only
+        encoded_stats = {}
+        for stat_name, stat_values in action_stats.items():
+            if isinstance(stat_values, (list, np.ndarray)):
+                stat_array = np.array(stat_values)
+                if len(stat_array) > max(encoded_dim_indices):
+                    encoded_stats[stat_name] = stat_array[encoded_dim_indices]
+
+        if encoded_stats:
+            print(f"\nNormalization stats for encoded dimensions (mode: {norm_mode.value}):")
+            for stat_name, stat_values in encoded_stats.items():
+                print(
+                    f"  {stat_name}: shape={stat_values.shape}, "
+                    f"range=[{np.min(stat_values):.4f}, {np.max(stat_values):.4f}]"
+                )
+
+            # apply normalization based on mode
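+            # eps=1e-8 keeps the denominators finite for constant (zero-range / zero-std) dimensions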
+            try:
+                encoded_chunks = apply_normalization(encoded_chunks, encoded_stats, norm_mode, eps=1e-8)
+                print(f"\nApplied {norm_mode.value} normalization")
+            except ValueError as e:
+                print(f"Warning: {e}. Using raw actions without normalization.")
+
+            print("\nAfter normalization - overall stats:")
+            print(f"  Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
+            print(f"  Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")
+
+            print("\nPer-dimension stats (after normalization):")
+            for d in range(encoded_chunks.shape[-1]):
+                dim_data = encoded_chunks[:, :, d]
+                print(
+                    f"  Dim {d}: min={np.min(dim_data):7.4f}, max={np.max(dim_data):7.4f}, "
+                    f"mean={np.mean(dim_data):7.4f}, std={np.std(dim_data):7.4f}"
+                )
+        else:
+            print("Warning: Could not extract stats for encoded dimensions, using raw actions")
+    else:
+        print("Warning: No normalization stats found in dataset, using raw actions")
+
+    print(f"Encoded chunks shape: {encoded_chunks.shape}")
+
+    # train FAST tokenizer
+    tokenizer = train_fast_tokenizer(
+        encoded_chunks,
+        vocab_size=cfg.vocab_size,
+        scale=cfg.scale,
+    )
+
+    # compute compression statistics
+    compression_stats = compute_compression_stats(tokenizer, encoded_chunks)
+
+    # save tokenizer
+    output_dir = cfg.output_dir
+    if output_dir is None:
+        output_dir = f"fast_tokenizer_{cfg.repo_id.replace('/', '_')}"
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    tokenizer.save_pretrained(output_path)
+
+    # save metadata
+    metadata = {
+        "repo_id": cfg.repo_id,
+        "vocab_size": cfg.vocab_size,
+        "scale": cfg.scale,
+        "encoded_dims": cfg.encoded_dims,
+        "encoded_dim_ranges": encoded_dim_ranges,
+        "total_encoded_dims": total_encoded_dims,
+        "delta_dims": cfg.delta_dims,
+        "delta_dim_list": delta_dim_list,
+        "use_delta_transform": cfg.use_delta_transform,
+        "state_key": cfg.state_key,
+        "normalization_mode": norm_mode.value,
+        "action_horizon": cfg.action_horizon,
+        "num_training_chunks": len(encoded_chunks),
+        "compression_stats": compression_stats,
+    }
+
+    with open(output_path / "metadata.json", "w") as f:
+        json.dump(metadata, f, indent=2)
+
+    print(f"\nSaved FAST tokenizer to {output_path}")
+    print(f"Metadata: {json.dumps(metadata, indent=2)}")
+
+    # push to Hugging Face Hub if requested
+    if cfg.push_to_hub:
+        # determine the hub repository ID
+        hub_repo_id = cfg.hub_repo_id
+        if hub_repo_id is None:
+            hub_repo_id = output_path.name
+            print(f"\nNo hub_repo_id provided, using: {hub_repo_id}")
+
+        print(f"\nPushing tokenizer to Hugging Face Hub: {hub_repo_id}")
+        print(f"  Private: {cfg.hub_private}")
+
+        try:
+            # use the tokenizer's push_to_hub method
+            tokenizer.push_to_hub(
+                repo_id=hub_repo_id,
+                private=cfg.hub_private,
+                commit_message=f"Upload FAST tokenizer trained on {cfg.repo_id}",
+            )
+
+            # also upload the metadata.json file separately
+            api = HfApi()
+            api.upload_file(
+                path_or_fileobj=str(output_path / "metadata.json"),
+                path_in_repo="metadata.json",
+                repo_id=hub_repo_id,
+                repo_type="model",
+                commit_message="Upload tokenizer metadata",
+            )
+
+            print(f"Successfully pushed tokenizer to: https://huggingface.co/{hub_repo_id}")
+        except Exception as e:
+            print(f"Error pushing to hub: {e}")
+            print("  Make sure you're logged in with `huggingface-cli login`")
+
+
+def main():
+    """CLI entry point that parses arguments and runs the tokenizer training."""
+    train_tokenizer()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/lerobot/teleoperators/so100_leader/__init__.py b/src/lerobot/teleoperators/bi_so_leader/__init__.py
similarity index 78%
rename from src/lerobot/teleoperators/so100_leader/__init__.py
rename to src/lerobot/teleoperators/bi_so_leader/__init__.py
index 747416be2..b902270f9 100644
--- a/src/lerobot/teleoperators/so100_leader/__init__.py
+++ b/src/lerobot/teleoperators/bi_so_leader/__init__.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python

-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,5 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .config_so100_leader import SO100LeaderConfig
-from .so100_leader import SO100Leader
+from .bi_so_leader import BiSOLeader, BiSOLeaderConfig
diff --git a/src/lerobot/teleoperators/bi_so100_leader/bi_so100_leader.py b/src/lerobot/teleoperators/bi_so_leader/bi_so_leader.py
similarity index 61%
rename from src/lerobot/teleoperators/bi_so100_leader/bi_so100_leader.py
rename to src/lerobot/teleoperators/bi_so_leader/bi_so_leader.py
index 769669655..90bf2a92d 100644
--- a/src/lerobot/teleoperators/bi_so100_leader/bi_so100_leader.py
+++ b/src/lerobot/teleoperators/bi_so_leader/bi_so_leader.py
@@ -17,47 +17,51 @@
 import logging
 from functools import cached_property

-from lerobot.teleoperators.so100_leader.config_so100_leader import SO100LeaderConfig
-from lerobot.teleoperators.so100_leader.so100_leader import SO100Leader
+from lerobot.teleoperators.so_leader import SOLeaderTeleopConfig
+from lerobot.utils.decorators import check_if_not_connected

+from ..so_leader import SOLeader
 from ..teleoperator import Teleoperator
-from .config_bi_so100_leader import BiSO100LeaderConfig
+from .config_bi_so_leader import BiSOLeaderConfig

 logger = logging.getLogger(__name__)


-class BiSO100Leader(Teleoperator):
+class BiSOLeader(Teleoperator):
     """
-    [Bimanual SO-100 Leader Arms](https://github.com/TheRobotStudio/SO-ARM100) designed by TheRobotStudio
-    This bimanual leader arm can also be easily adapted to use SO-101 leader arms, just replace the SO100Leader class with SO101Leader and SO100LeaderConfig with SO101LeaderConfig.
+    [Bimanual SO Leader Arms](https://github.com/TheRobotStudio/SO-ARM100) designed by TheRobotStudio
     """

-    config_class = BiSO100LeaderConfig
-    name = "bi_so100_leader"
+    config_class = BiSOLeaderConfig
+    name = "bi_so_leader"

-    def __init__(self, config: BiSO100LeaderConfig):
+    def __init__(self, config: BiSOLeaderConfig):
         super().__init__(config)
         self.config = config

-        left_arm_config = SO100LeaderConfig(
+        left_arm_config = SOLeaderTeleopConfig(
             id=f"{config.id}_left" if config.id else None,
             calibration_dir=config.calibration_dir,
-            port=config.left_arm_port,
+            port=config.left_arm_config.port,
         )

-        right_arm_config = SO100LeaderConfig(
+        right_arm_config = SOLeaderTeleopConfig(
             id=f"{config.id}_right" if config.id else None,
             calibration_dir=config.calibration_dir,
-            port=config.right_arm_port,
+            port=config.right_arm_config.port,
         )

-        self.left_arm = SO100Leader(left_arm_config)
-        self.right_arm = SO100Leader(right_arm_config)
+        self.left_arm = SOLeader(left_arm_config)
+        self.right_arm = SOLeader(right_arm_config)

     @cached_property
     def action_features(self) -> dict[str, type]:
-        return {f"left_{motor}.pos": float for motor in self.left_arm.bus.motors} | {
-            f"right_{motor}.pos": float for motor in self.right_arm.bus.motors
+        left_arm_features = self.left_arm.action_features
+        right_arm_features = self.right_arm.action_features
+
+        return {
+            **{f"left_{k}": v for k, v in left_arm_features.items()},
+            **{f"right_{k}": v for k, v in right_arm_features.items()},
         }

     @cached_property
@@ -88,6 +92,7 @@ class BiSO100Leader(Teleoperator):
         self.left_arm.setup_motors()
         self.right_arm.setup_motors()

+    @check_if_not_connected
     def get_action(self) -> dict[str, float]:
         action_dict = {}

@@ -102,19 +107,8 @@ class BiSO100Leader(Teleoperator):
         return action_dict

     def send_feedback(self, feedback: dict[str, float]) -> None:
-        # Remove "left_" prefix
-        left_feedback = {
-            key.removeprefix("left_"): value for key, value in feedback.items() if key.startswith("left_")
-        }
-        # Remove "right_" prefix
-        right_feedback = {
-            key.removeprefix("right_"): value for key, value in feedback.items() if key.startswith("right_")
-        }
-
-        if left_feedback:
-            self.left_arm.send_feedback(left_feedback)
-        if right_feedback:
-            self.right_arm.send_feedback(right_feedback)
+        # TODO: Implement force feedback
+        raise NotImplementedError

     def disconnect(self) -> None:
         self.left_arm.disconnect()
diff --git a/src/lerobot/teleoperators/so101_leader/config_so101_leader.py b/src/lerobot/teleoperators/bi_so_leader/config_bi_so_leader.py
similarity index 71%
rename from src/lerobot/teleoperators/so101_leader/config_so101_leader.py
rename to src/lerobot/teleoperators/bi_so_leader/config_bi_so_leader.py
index 8d91c32df..c2f23c617 100644
--- a/src/lerobot/teleoperators/so101_leader/config_so101_leader.py
+++ b/src/lerobot/teleoperators/bi_so_leader/config_bi_so_leader.py
@@ -16,13 +16,15 @@

 from dataclasses import dataclass

+from lerobot.teleoperators.so_leader import SOLeaderConfig
+
 from ..config import TeleoperatorConfig


-@TeleoperatorConfig.register_subclass("so101_leader")
+@TeleoperatorConfig.register_subclass("bi_so_leader")
 @dataclass
-class SO101LeaderConfig(TeleoperatorConfig):
-    # Port to connect to the arm
-    port: str
+class BiSOLeaderConfig(TeleoperatorConfig):
+    """Configuration class for Bi SO Leader teleoperators."""

-    use_degrees: bool = False
+    left_arm_config: SOLeaderConfig
+    right_arm_config: SOLeaderConfig
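+
+    # note: BiSOLeader reads only `port` from each arm config and derives the per-arm
+    # ids as "<id>_left" / "<id>_right" (see bi_so_leader.py above)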
diff --git a/src/lerobot/teleoperators/gamepad/teleop_gamepad.py b/src/lerobot/teleoperators/gamepad/teleop_gamepad.py
index c7072f4a7..69cb0f971 100644
--- a/src/lerobot/teleoperators/gamepad/teleop_gamepad.py
+++ b/src/lerobot/teleoperators/gamepad/teleop_gamepad.py
@@ -20,6 +20,9 @@ from typing import Any

 import numpy as np

+from lerobot.processor import RobotAction
+from lerobot.utils.decorators import check_if_not_connected
+
 from ..teleoperator import Teleoperator
 from ..utils import TeleopEvents
 from .configuration_gamepad import GamepadTeleopConfig
@@ -83,7 +86,8 @@ class GamepadTeleop(Teleoperator):
         self.gamepad = Gamepad()
         self.gamepad.start()

-    def get_action(self) -> dict[str, Any]:
+    @check_if_not_connected
+    def get_action(self) -> RobotAction:
         # Update the controller to get fresh inputs
         self.gamepad.update()
@@ -156,6 +160,7 @@
         self.gamepad.stop()
         self.gamepad = None

+    @property
     def is_connected(self) -> bool:
         """Check if gamepad is connected."""
         return self.gamepad is not None
diff --git a/src/lerobot/teleoperators/homunculus/homunculus_arm.py b/src/lerobot/teleoperators/homunculus/homunculus_arm.py
index 43116f5c0..178eed544 100644
--- a/src/lerobot/teleoperators/homunculus/homunculus_arm.py
+++ b/src/lerobot/teleoperators/homunculus/homunculus_arm.py
@@ -22,7 +22,7 @@ from pprint import pformat
 import serial

 from lerobot.motors.motors_bus import MotorCalibration, MotorNormMode
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
 from lerobot.utils.utils import enter_pressed, move_cursor_up

 from ..teleoperator import Teleoperator
@@ -93,10 +93,8 @@
         with self.serial_lock:
             return self.serial.is_open and self.thread.is_alive()

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
         if not self.serial.is_open:
             self.serial.open()
         self.thread.start()
@@ -299,6 +297,7 @@
             except Exception as e:
                 logger.debug(f"Error reading frame in background thread for {self}: {e}")

+    @check_if_not_connected
     def get_action(self) -> dict[str, float]:
         joint_positions = self._read()
         return {f"{joint}.pos": pos for joint, pos in joint_positions.items()}
@@ -306,10 +305,8 @@
     def send_feedback(self, feedback: dict[str, float]) -> None:
         raise NotImplementedError

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            DeviceNotConnectedError(f"{self} is not connected.")
-
         self.stop_event.set()
         self.thread.join(timeout=1)
         self.serial.close()
diff --git a/src/lerobot/teleoperators/homunculus/homunculus_glove.py b/src/lerobot/teleoperators/homunculus/homunculus_glove.py
index fefeec1e8..c4393d660 100644
--- a/src/lerobot/teleoperators/homunculus/homunculus_glove.py
+++ b/src/lerobot/teleoperators/homunculus/homunculus_glove.py
@@ -24,7 +24,7 @@
 import serial

 from lerobot.motors import MotorCalibration
 from lerobot.motors.motors_bus import MotorNormMode
 from lerobot.teleoperators.homunculus.joints_translation import homunculus_glove_to_hope_jr_hand
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
 from lerobot.utils.utils import enter_pressed, move_cursor_up

 from ..teleoperator import Teleoperator
@@ -119,10 +119,8 @@
         with self.serial_lock:
             return self.serial.is_open and self.thread.is_alive()

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
         if not self.serial.is_open:
             self.serial.open()
         self.thread.start()
@@ -325,6 +323,7 @@
             except Exception as e:
                 logger.debug(f"Error reading frame in background thread for {self}: {e}")

+    @check_if_not_connected
     def get_action(self) -> dict[str, float]:
         joint_positions = self._read()
         return homunculus_glove_to_hope_jr_hand(
@@ -334,10 +333,8 @@
     def send_feedback(self, feedback: dict[str, float]) -> None:
         raise NotImplementedError

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            DeviceNotConnectedError(f"{self} is not connected.")
-
         self.stop_event.set()
         self.thread.join(timeout=1)
         self.serial.close()
diff --git a/src/lerobot/teleoperators/keyboard/teleop_keyboard.py b/src/lerobot/teleoperators/keyboard/teleop_keyboard.py
index ec8ea18f4..919f463d3 100644
--- a/src/lerobot/teleoperators/keyboard/teleop_keyboard.py
+++ b/src/lerobot/teleoperators/keyboard/teleop_keyboard.py
@@ -21,7 +21,8 @@
 import time
 from queue import Queue
 from typing import Any

-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.processor import RobotAction
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected

 from ..teleoperator import Teleoperator
 from ..utils import TeleopEvents
@@ -85,12 +86,8 @@ class KeyboardTeleop(Teleoperator):
     def is_calibrated(self) -> bool:
         pass

+    @check_if_already_connected
     def connect(self) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(
-                "Keyboard is already connected. Do not run `robot.connect()` twice."
-            )
-
         if PYNPUT_AVAILABLE:
             logging.info("pynput is available - enabling local keyboard listener.")
             self.listener = keyboard.Listener(
@@ -124,14 +121,10 @@ class KeyboardTeleop(Teleoperator):
     def configure(self):
         pass

-    def get_action(self) -> dict[str, Any]:
+    @check_if_not_connected
+    def get_action(self) -> RobotAction:
         before_read_t = time.perf_counter()

-        if not self.is_connected:
-            raise DeviceNotConnectedError(
-                "KeyboardTeleop is not connected. You need to run `connect()` before `get_action()`."
-            )
-
         self._drain_pressed_keys()

         # Generate action based on current key states
@@ -143,11 +136,8 @@ class KeyboardTeleop(Teleoperator):
     def send_feedback(self, feedback: dict[str, Any]) -> None:
         pass

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(
-                "KeyboardTeleop is not connected. You need to run `robot.connect()` before `disconnect()`."
-            )

         if self.listener is not None:
             self.listener.stop()
@@ -181,12 +171,8 @@ class KeyboardEndEffectorTeleop(KeyboardTeleop):
             "names": {"delta_x": 0, "delta_y": 1, "delta_z": 2},
         }

-    def get_action(self) -> dict[str, Any]:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(
-                "KeyboardTeleop is not connected. You need to run `connect()` before `get_action()`."
-            )
-
+    @check_if_not_connected
+    def get_action(self) -> RobotAction:
         self._drain_pressed_keys()

         delta_x = 0.0
         delta_y = 0.0
@@ -374,20 +360,16 @@ class KeyboardRoverTeleop(KeyboardTeleop):
                 # Only remove key if it's being released
                 self.current_pressed.pop(key_char, None)

-    def get_action(self) -> dict[str, Any]:
+    @check_if_not_connected
+    def get_action(self) -> RobotAction:
         """
         Get the current action based on pressed keys.

         Returns:
-            dict with 'linear.vel' and 'angular.vel' keys
+            RobotAction with 'linear.vel' and 'angular.vel' keys
         """
         before_read_t = time.perf_counter()

-        if not self.is_connected:
-            raise DeviceNotConnectedError(
-                "KeyboardRoverTeleop is not connected. You need to run `connect()` before `get_action()`."
-            )
-
         self._drain_pressed_keys()

         linear_velocity = 0.0
diff --git a/src/lerobot/teleoperators/koch_leader/koch_leader.py b/src/lerobot/teleoperators/koch_leader/koch_leader.py
index 0409f2e57..87084b6b9 100644
--- a/src/lerobot/teleoperators/koch_leader/koch_leader.py
+++ b/src/lerobot/teleoperators/koch_leader/koch_leader.py
@@ -23,7 +23,7 @@ from lerobot.motors.dynamixel import (
     DynamixelMotorsBus,
     OperatingMode,
 )
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected

 from ..teleoperator import Teleoperator
 from .config_koch_leader import KochLeaderConfig
@@ -69,10 +69,8 @@
     def is_connected(self) -> bool:
         return self.bus.is_connected

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
         self.bus.connect()
         if not self.is_calibrated and calibrate:
             logger.info(
@@ -161,10 +159,8 @@
             self.bus.setup_motor(motor)
             print(f"'{motor}' motor id set to {self.bus.motors[motor].id}")

+    @check_if_not_connected
     def get_action(self) -> dict[str, float]:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(f"{self} is not connected.")
-
         start = time.perf_counter()
         action = self.bus.sync_read("Present_Position")
         action = {f"{motor}.pos": val for motor, val in action.items()}
@@ -176,9 +172,7 @@
         # TODO(rcadene, aliberts): Implement force feedback
         raise NotImplementedError

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(f"{self} is not connected.")
-
         self.bus.disconnect()
         logger.info(f"{self} disconnected.")
diff --git a/src/lerobot/teleoperators/omx_leader/config_omx_leader.py b/src/lerobot/teleoperators/omx_leader/config_omx_leader.py
index 3c0420ab2..a0eca38f7 100644
--- a/src/lerobot/teleoperators/omx_leader/config_omx_leader.py
+++ b/src/lerobot/teleoperators/omx_leader/config_omx_leader.py
@@ -27,4 +27,4 @@ class OmxLeaderConfig(TeleoperatorConfig):

     # Sets the arm in torque mode with the gripper motor set to this value. This makes it possible to squeeze
     # the gripper and have it spring back to an open position on its own.
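+    # (editorial note, assumption) 60.0 appears to be retuned together with the inverted
+    # gripper drive mode and Homing_Offset=100 configured in omx_leader.py below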
-    gripper_open_pos: float = 37.0
+    gripper_open_pos: float = 60.0
diff --git a/src/lerobot/teleoperators/omx_leader/omx_leader.py b/src/lerobot/teleoperators/omx_leader/omx_leader.py
index c0e49b558..4264b0485 100644
--- a/src/lerobot/teleoperators/omx_leader/omx_leader.py
+++ b/src/lerobot/teleoperators/omx_leader/omx_leader.py
@@ -23,7 +23,7 @@ from lerobot.motors.dynamixel import (
     DynamixelMotorsBus,
     OperatingMode,
 )
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected

 from ..teleoperator import Teleoperator
 from .config_omx_leader import OmxLeaderConfig
@@ -68,10 +68,8 @@
     def is_connected(self) -> bool:
         return self.bus.is_connected

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
         self.bus.connect()
         if not self.is_calibrated and calibrate:
             logger.info(
@@ -105,7 +103,7 @@
             self.calibration[motor] = MotorCalibration(
                 id=m.id,
                 drive_mode=drive_modes[motor],
-                homing_offset=0,
+                homing_offset=0 if motor != "gripper" else 100,
                 range_min=0,
                 range_max=4095,
             )
@@ -125,12 +123,20 @@
             # point
             self.bus.write("Operating_Mode", motor, OperatingMode.EXTENDED_POSITION.value)

+            if motor == "gripper":
+                self.bus.write("Drive_Mode", motor, DriveMode.INVERTED.value)
+            else:
+                self.bus.write("Drive_Mode", motor, DriveMode.NON_INVERTED.value)
+
         # Use 'position control current based' for gripper to be limited by the limit of the current.
         # For the follower gripper, it means it can grasp an object without forcing too much even tho,
         # its goal position is a complete grasp (both gripper fingers are ordered to join and reach a touch).
         # For the leader gripper, it means we can use it as a physical trigger, since we can force with our finger
         # to make it move, and it will move back to its original target position when we release the force.
         self.bus.write("Operating_Mode", "gripper", OperatingMode.CURRENT_POSITION.value)
+        self.bus.write("Current_Limit", "gripper", 100)
+        self.bus.write("Goal_Current", "gripper", 100)
+        self.bus.write("Homing_Offset", "gripper", 100)

         # Set gripper's goal pos in current position mode so that we can use it as a trigger.
         self.bus.enable_torque("gripper")

         if self.is_calibrated:
@@ -142,10 +148,8 @@
             self.bus.setup_motor(motor)
             print(f"'{motor}' motor id set to {self.bus.motors[motor].id}")

+    @check_if_not_connected
     def get_action(self) -> dict[str, float]:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(f"{self} is not connected.")
-
         start = time.perf_counter()
         action = self.bus.sync_read("Present_Position")
         action = {f"{motor}.pos": val for motor, val in action.items()}
@@ -157,9 +161,7 @@
         # TODO(rcadene, aliberts): Implement force feedback
         raise NotImplementedError

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(f"{self} is not connected.")
-
         self.bus.disconnect()
         logger.info(f"{self} disconnected.")
diff --git a/src/lerobot/teleoperators/phone/teleop_phone.py b/src/lerobot/teleoperators/phone/teleop_phone.py
index 91e613190..221ee8083 100644
--- a/src/lerobot/teleoperators/phone/teleop_phone.py
+++ b/src/lerobot/teleoperators/phone/teleop_phone.py
@@ -28,7 +28,7 @@ from teleop import Teleop

 from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS
 from lerobot.teleoperators.teleoperator import Teleoperator
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
 from lerobot.utils.rotation import Rotation

 logger = logging.getLogger(__name__)
@@ -81,10 +81,8 @@ class IOSPhone(BasePhone, Teleoperator):
     def is_connected(self) -> bool:
         return self._group is not None

+    @check_if_already_connected
     def connect(self) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
         logger.info("Connecting to IPhone, make sure to open the HEBI Mobile I/O app.")
         lookup = hebi.Lookup()
         time.sleep(2.0)
@@ -164,6 +162,7 @@ class IOSPhone(BasePhone, Teleoperator):
             pos = ar_pos - rot.apply(self.config.camera_offset)
         return True, pos, rot, pose

+    @check_if_not_connected
     def get_action(self) -> dict:
         has_pose, raw_position, raw_rotation, fb_pose = self._read_current_pose()
         if not has_pose or not self.is_calibrated:
@@ -204,10 +203,8 @@ class IOSPhone(BasePhone, Teleoperator):
             "phone.enabled": self._enabled,
         }

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(f"{self} is not connected.")
-
         self._group = None
@@ -227,10 +224,8 @@ class AndroidPhone(BasePhone, Teleoperator):
     def is_connected(self) -> bool:
         return self._teleop is not None

+    @check_if_already_connected
     def connect(self) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
         logger.info("Starting teleop stream for Android...")
         self._teleop = Teleop()
         self._teleop.subscribe(self._android_callback)
@@ -318,6 +313,7 @@ class AndroidPhone(BasePhone, Teleoperator):
             self._latest_pose = pose
             self._latest_message = message

+    @check_if_not_connected
     def get_action(self) -> dict:
         ok, raw_pos, raw_rot, pose = self._read_current_pose()
         if not ok or not self.is_calibrated:
@@ -350,10 +346,8 @@ class AndroidPhone(BasePhone, Teleoperator):
             "phone.enabled": self._enabled,
         }

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            raise DeviceNotConnectedError(f"{self} is not connected.")
-
         self._teleop = None
         if self._teleop_thread and self._teleop_thread.is_alive():
             self._teleop_thread.join(timeout=1.0)
diff --git a/src/lerobot/teleoperators/reachy2_teleoperator/reachy2_teleoperator.py b/src/lerobot/teleoperators/reachy2_teleoperator/reachy2_teleoperator.py
index 5a427dd71..db076b20f 100644
--- a/src/lerobot/teleoperators/reachy2_teleoperator/reachy2_teleoperator.py
+++ b/src/lerobot/teleoperators/reachy2_teleoperator/reachy2_teleoperator.py
@@ -13,11 +13,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations

 import logging
 import time
+from typing import TYPE_CHECKING

-from reachy2_sdk import ReachySDK
+from lerobot.utils.import_utils import _reachy2_sdk_available
+
+if TYPE_CHECKING or _reachy2_sdk_available:
+    from reachy2_sdk import ReachySDK
+else:
+    ReachySDK = None
+
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
+from lerobot.utils.errors import DeviceNotConnectedError

 from ..teleoperator import Teleoperator
 from .config_reachy2_teleoperator import Reachy2TeleoperatorConfig
@@ -75,6 +85,7 @@ class Reachy2Teleoperator(Teleoperator):

     def __init__(self, config: Reachy2TeleoperatorConfig):
         super().__init__(config)
+        self.config = config

         self.reachy: None | ReachySDK = None

@@ -116,10 +127,12 @@ class Reachy2Teleoperator(Teleoperator):
     def is_connected(self) -> bool:
         return self.reachy.is_connected() if self.reachy is not None else False

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
         self.reachy = ReachySDK(self.config.ip_address)
+
         if not self.is_connected:
-            raise ConnectionError()
+            raise DeviceNotConnectedError()
         logger.info(f"{self} connected.")

     @property
@@ -132,26 +145,25 @@ class Reachy2Teleoperator(Teleoperator):
     def configure(self) -> None:
         pass

+    @check_if_not_connected
     def get_action(self) -> dict[str, float]:
         start = time.perf_counter()

-        if self.reachy and self.is_connected:
-            if self.config.use_present_position:
-                joint_action = {
-                    k: self.reachy.joints[v].present_position for k, v in self.joints_dict.items()
-                }
-            else:
-                joint_action = {k: self.reachy.joints[v].goal_position for k, v in self.joints_dict.items()}
+        joint_action: dict[str, float] = {}
+        vel_action: dict[str, float] = {}

-            if not self.config.with_mobile_base:
-                dt_ms = (time.perf_counter() - start) * 1e3
-                logger.debug(f"{self} read action: {dt_ms:.1f}ms")
-                return joint_action
-
-            if self.config.use_present_position:
-                vel_action = {k: self.reachy.mobile_base.odometry[v] for k, v in REACHY2_VEL.items()}
-            else:
-                vel_action = {k: self.reachy.mobile_base.last_cmd_vel[v] for k, v in REACHY2_VEL.items()}
+        if self.config.use_present_position:
+            joint_action = {k: self.reachy.joints[v].present_position for k, v in self.joints_dict.items()}
+        else:
+            joint_action = {k: self.reachy.joints[v].goal_position for k, v in self.joints_dict.items()}

+        if not self.config.with_mobile_base:
+            dt_ms = (time.perf_counter() - start) * 1e3
+            logger.debug(f"{self} read action: {dt_ms:.1f}ms")
+            return joint_action
+
+        if self.config.use_present_position:
+            vel_action = {k: self.reachy.mobile_base.odometry[v] for k, v in REACHY2_VEL.items()}
+        else:
+            vel_action = {k: self.reachy.mobile_base.last_cmd_vel[v] for k, v in REACHY2_VEL.items()}
         dt_ms = (time.perf_counter() - start) * 1e3
         logger.debug(f"{self} read action: {dt_ms:.1f}ms")
         return {**joint_action, **vel_action}
@@ -160,5 +172,5 @@
         raise NotImplementedError

     def disconnect(self) -> None:
-        if self.reachy and self.is_connected:
+        if self.is_connected:
             self.reachy.disconnect()
diff --git a/src/lerobot/teleoperators/so100_leader/so100_leader.py b/src/lerobot/teleoperators/so100_leader/so100_leader.py
deleted file mode 100644
index edcfe53e6..000000000
--- a/src/lerobot/teleoperators/so100_leader/so100_leader.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import time
-
-from lerobot.motors import Motor, MotorCalibration, MotorNormMode
-from lerobot.motors.feetech import (
-    FeetechMotorsBus,
-    OperatingMode,
-)
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
-
-from ..teleoperator import Teleoperator
-from .config_so100_leader import SO100LeaderConfig
-
-logger = logging.getLogger(__name__)
-
-
-class SO100Leader(Teleoperator):
-    """
-    [SO-100 Leader Arm](https://github.com/TheRobotStudio/SO-ARM100) designed by TheRobotStudio
-    """
-
-    config_class = SO100LeaderConfig
-    name = "so100_leader"
-
-    def __init__(self, config: SO100LeaderConfig):
-        super().__init__(config)
-        self.config = config
-        self.bus = FeetechMotorsBus(
-            port=self.config.port,
-            motors={
-                "shoulder_pan": Motor(1, "sts3215", MotorNormMode.RANGE_M100_100),
-                "shoulder_lift": Motor(2, "sts3215", MotorNormMode.RANGE_M100_100),
-                "elbow_flex": Motor(3, "sts3215", MotorNormMode.RANGE_M100_100),
-                "wrist_flex": Motor(4, "sts3215", MotorNormMode.RANGE_M100_100),
-                "wrist_roll": Motor(5, "sts3215", MotorNormMode.RANGE_M100_100),
-                "gripper": Motor(6, "sts3215", MotorNormMode.RANGE_0_100),
-            },
-            calibration=self.calibration,
-        )
-
-    @property
-    def action_features(self) -> dict[str, type]:
-        return {f"{motor}.pos": float for motor in self.bus.motors}
-
-    @property
-    def feedback_features(self) -> dict[str, type]:
-        return {}
-
-    @property
-    def is_connected(self) -> bool:
-        return self.bus.is_connected
-
-    def connect(self, calibrate: bool = True) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
-        self.bus.connect()
-        if not self.is_calibrated and calibrate:
-            logger.info(
-                "Mismatch between calibration values in the motor and the calibration file or no calibration file found"
-            )
-            self.calibrate()
-
-        self.configure()
-        logger.info(f"{self} connected.")
-
-    @property
-    def is_calibrated(self) -> bool:
-        return self.bus.is_calibrated
-
-    def calibrate(self) -> None:
-        if self.calibration:
-            # Calibration file exists, ask user whether to use it or run new calibration
-            user_input = input(
-                f"Press ENTER to use provided calibration file associated with the id {self.id}, or type 'c' and press ENTER to run calibration: "
-            )
-            if user_input.strip().lower() != "c":
-                logger.info(f"Writing calibration file associated with the id {self.id} to the motors")
-                self.bus.write_calibration(self.calibration)
-                return
-
-        logger.info(f"\nRunning calibration of {self}")
-        self.bus.disable_torque()
-        for motor in self.bus.motors:
-            self.bus.write("Operating_Mode", motor, OperatingMode.POSITION.value)
-
-        input(f"Move {self} to the middle of its range of motion and press ENTER....")
-        homing_offsets = self.bus.set_half_turn_homings()
-
-        full_turn_motor = "wrist_roll"
-        unknown_range_motors = [motor for motor in self.bus.motors if motor != full_turn_motor]
-        print(
-            f"Move all joints except '{full_turn_motor}' sequentially through their "
-            "entire ranges of motion.\nRecording positions. Press ENTER to stop..."
-        )
-        range_mins, range_maxes = self.bus.record_ranges_of_motion(unknown_range_motors)
-        range_mins[full_turn_motor] = 0
-        range_maxes[full_turn_motor] = 4095
-
-        self.calibration = {}
-        for motor, m in self.bus.motors.items():
-            self.calibration[motor] = MotorCalibration(
-                id=m.id,
-                drive_mode=0,
-                homing_offset=homing_offsets[motor],
-                range_min=range_mins[motor],
-                range_max=range_maxes[motor],
-            )
-
-        self.bus.write_calibration(self.calibration)
-        self._save_calibration()
-        print(f"Calibration saved to {self.calibration_fpath}")
-
-    def configure(self) -> None:
-        self.bus.disable_torque()
-        self.bus.configure_motors()
-        for motor in self.bus.motors:
-            self.bus.write("Operating_Mode", motor, OperatingMode.POSITION.value)
-
-    def setup_motors(self) -> None:
-        for motor in reversed(self.bus.motors):
-            input(f"Connect the controller board to the '{motor}' motor only and press enter.")
-            self.bus.setup_motor(motor)
-            print(f"'{motor}' motor id set to {self.bus.motors[motor].id}")
-
-    def get_action(self) -> dict[str, float]:
-        start = time.perf_counter()
-        action = self.bus.sync_read("Present_Position")
-        action = {f"{motor}.pos": val for motor, val in action.items()}
-        dt_ms = (time.perf_counter() - start) * 1e3
-        logger.debug(f"{self} read action: {dt_ms:.1f}ms")
-        return action
-
-    def send_feedback(self, feedback: dict[str, float]) -> None:
-        # TODO(rcadene, aliberts): Implement force feedback
-        raise NotImplementedError
-
-    def disconnect(self) -> None:
-        if not self.is_connected:
-            DeviceNotConnectedError(f"{self} is not connected.")
-
-        self.bus.disconnect()
-        logger.info(f"{self} disconnected.")
diff --git a/src/lerobot/robots/so101_follower/__init__.py b/src/lerobot/teleoperators/so_leader/__init__.py
similarity index 69%
rename from src/lerobot/robots/so101_follower/__init__.py
rename to src/lerobot/teleoperators/so_leader/__init__.py
index 9ff2baf45..e5aaa31b6 100644
--- a/src/lerobot/robots/so101_follower/__init__.py
+++ b/src/lerobot/teleoperators/so_leader/__init__.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python

-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,5 +14,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .config_so101_follower import SO101FollowerConfig
-from .so101_follower import SO101Follower
+from .config_so_leader import (
+    SO100LeaderConfig,
+    SO101LeaderConfig,
+    SOLeaderConfig,
+    SOLeaderTeleopConfig,
+)
+from .so_leader import SO100Leader, SO101Leader, SOLeader
diff --git a/src/lerobot/teleoperators/so100_leader/config_so100_leader.py b/src/lerobot/teleoperators/so_leader/config_so_leader.py
similarity index 60%
rename from src/lerobot/teleoperators/so100_leader/config_so100_leader.py
rename to src/lerobot/teleoperators/so_leader/config_so_leader.py
index a97949b7e..dd55196d7 100644
--- a/src/lerobot/teleoperators/so100_leader/config_so100_leader.py
+++ b/src/lerobot/teleoperators/so_leader/config_so_leader.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python

-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,12 +15,28 @@
 # limitations under the License.

 from dataclasses import dataclass
+from typing import TypeAlias

 from ..config import TeleoperatorConfig


-@TeleoperatorConfig.register_subclass("so100_leader")
 @dataclass
-class SO100LeaderConfig(TeleoperatorConfig):
+class SOLeaderConfig:
+    """Base configuration class for SO Leader teleoperators."""
+
     # Port to connect to the arm
     port: str
+
+    # Whether to use degrees for angles
+    use_degrees: bool = False
+
+
+@TeleoperatorConfig.register_subclass("so101_leader")
+@TeleoperatorConfig.register_subclass("so100_leader")
+@dataclass
+class SOLeaderTeleopConfig(TeleoperatorConfig, SOLeaderConfig):
+    pass
+
+
+SO100LeaderConfig: TypeAlias = SOLeaderTeleopConfig
+SO101LeaderConfig: TypeAlias = SOLeaderTeleopConfig
diff --git a/src/lerobot/teleoperators/so_leader/so100.md b/src/lerobot/teleoperators/so_leader/so100.md
new file mode 120000
index 000000000..ad1154e75
--- /dev/null
+++ b/src/lerobot/teleoperators/so_leader/so100.md
@@ -0,0 +1 @@
+../../../../docs/source/so100.mdx
\ No newline at end of file
diff --git a/src/lerobot/teleoperators/so_leader/so101.md b/src/lerobot/teleoperators/so_leader/so101.md
new file mode 120000
index 000000000..27b892660
--- /dev/null
+++ b/src/lerobot/teleoperators/so_leader/so101.md
@@ -0,0 +1 @@
+../../../../docs/source/so101.mdx
\ No newline at end of file
diff --git a/src/lerobot/teleoperators/so101_leader/so101_leader.py b/src/lerobot/teleoperators/so_leader/so_leader.py
similarity index 82%
rename from src/lerobot/teleoperators/so101_leader/so101_leader.py
rename to src/lerobot/teleoperators/so_leader/so_leader.py
index be804bf70..a10e3a61f 100644
--- a/src/lerobot/teleoperators/so101_leader/so101_leader.py
+++ b/src/lerobot/teleoperators/so_leader/so_leader.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python

-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,29 +16,28 @@

 import logging
 import time
+from typing import TypeAlias

 from lerobot.motors import Motor, MotorCalibration, MotorNormMode
 from lerobot.motors.feetech import (
     FeetechMotorsBus,
     OperatingMode,
 )
-from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected

 from ..teleoperator import Teleoperator
-from .config_so101_leader import SO101LeaderConfig
+from .config_so_leader import SOLeaderTeleopConfig

 logger = logging.getLogger(__name__)


-class SO101Leader(Teleoperator):
-    """
-    SO-101 Leader Arm designed by TheRobotStudio and Hugging Face.
-    """
+class SOLeader(Teleoperator):
+    """Generic SO leader base for SO-100/101/10X teleoperators."""

-    config_class = SO101LeaderConfig
-    name = "so101_leader"
+    config_class = SOLeaderTeleopConfig
+    name = "so_leader"

-    def __init__(self, config: SO101LeaderConfig):
+    def __init__(self, config: SOLeaderTeleopConfig):
         super().__init__(config)
         self.config = config
         norm_mode_body = MotorNormMode.DEGREES if config.use_degrees else MotorNormMode.RANGE_M100_100
@@ -67,10 +66,8 @@
     def is_connected(self) -> bool:
         return self.bus.is_connected

+    @check_if_already_connected
     def connect(self, calibrate: bool = True) -> None:
-        if self.is_connected:
-            raise DeviceAlreadyConnectedError(f"{self} already connected")
-
         self.bus.connect()
         if not self.is_calibrated and calibrate:
             logger.info(
@@ -104,11 +101,15 @@
         input(f"Move {self} to the middle of its range of motion and press ENTER....")
         homing_offsets = self.bus.set_half_turn_homings()

+        full_turn_motor = "wrist_roll"
+        unknown_range_motors = [motor for motor in self.bus.motors if motor != full_turn_motor]
         print(
-            "Move all joints sequentially through their entire ranges "
-            "of motion.\nRecording positions. Press ENTER to stop..."
+            f"Move all joints except '{full_turn_motor}' sequentially through their "
+            "entire ranges of motion.\nRecording positions. Press ENTER to stop..."
         )
-        range_mins, range_maxes = self.bus.record_ranges_of_motion()
+        range_mins, range_maxes = self.bus.record_ranges_of_motion(unknown_range_motors)
+        range_mins[full_turn_motor] = 0
+        range_maxes[full_turn_motor] = 4095

         self.calibration = {}
         for motor, m in self.bus.motors.items():
@@ -136,6 +137,7 @@
             self.bus.setup_motor(motor)
             print(f"'{motor}' motor id set to {self.bus.motors[motor].id}")

+    @check_if_not_connected
     def get_action(self) -> dict[str, float]:
         start = time.perf_counter()
         action = self.bus.sync_read("Present_Position")
@@ -145,12 +147,14 @@
         return action

     def send_feedback(self, feedback: dict[str, float]) -> None:
-        # TODO(rcadene, aliberts): Implement force feedback
+        # TODO: Implement force feedback
         raise NotImplementedError

+    @check_if_not_connected
     def disconnect(self) -> None:
-        if not self.is_connected:
-            DeviceNotConnectedError(f"{self} is not connected.")
-
         self.bus.disconnect()
         logger.info(f"{self} disconnected.")
+
+
+SO100Leader: TypeAlias = SOLeader
+SO101Leader: TypeAlias = SOLeader
diff --git a/src/lerobot/teleoperators/teleoperator.py b/src/lerobot/teleoperators/teleoperator.py
index 95020a962..847b88b7f 100644
--- a/src/lerobot/teleoperators/teleoperator.py
+++ b/src/lerobot/teleoperators/teleoperator.py
@@ -20,6 +20,7 @@ from typing import Any

 import draccus

 from lerobot.motors.motors_bus import MotorCalibration
+from lerobot.processor import RobotAction
 from lerobot.utils.constants import HF_LEROBOT_CALIBRATION, TELEOPERATORS

 from .config import TeleoperatorConfig
@@ -57,6 +58,32 @@ class Teleoperator(abc.ABC):
     def __str__(self) -> str:
         return f"{self.id} {self.__class__.__name__}"

+    def __enter__(self):
+        """
+        Context manager entry.
+        Automatically connects to the teleoperator.
+        """
+        self.connect()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """
+        Context manager exit.
+        Automatically disconnects, ensuring resources are released even on error.
+        """
+        self.disconnect()
+
+    def __del__(self) -> None:
+        """
+        Destructor safety net.
+        Attempts to disconnect if the object is garbage collected without cleanup.
+        """
+        try:
+            if self.is_connected:
+                self.disconnect()
+        except Exception:  # nosec B110
+            pass
+
     @property
     @abc.abstractmethod
     def action_features(self) -> dict:
@@ -150,12 +177,12 @@ class Teleoperator(abc.ABC):
         pass

     @abc.abstractmethod
-    def get_action(self) -> dict[str, Any]:
+    def get_action(self) -> RobotAction:
         """
         Retrieve the current action from the teleoperator.

         Returns:
-            dict[str, Any]: A flat dictionary representing the teleoperator's current actions. Its
-                structure should match :pymeth:`observation_features`.
+            RobotAction: A flat dictionary representing the teleoperator's current actions. Its
+                structure should match :pymeth:`action_features`.
         """
         pass
diff --git a/src/lerobot/teleoperators/utils.py b/src/lerobot/teleoperators/utils.py
index fb29ed3a9..3ccacdaf4 100644
--- a/src/lerobot/teleoperators/utils.py
+++ b/src/lerobot/teleoperators/utils.py
@@ -46,11 +46,11 @@ def make_teleoperator_from_config(config: TeleoperatorConfig) -> Teleoperator:

         return OmxLeader(config)
     elif config.type == "so100_leader":
-        from .so100_leader import SO100Leader
+        from .so_leader import SO100Leader

         return SO100Leader(config)
     elif config.type == "so101_leader":
-        from .so101_leader import SO101Leader
+        from .so_leader import SO101Leader

         return SO101Leader(config)
     elif config.type == "mock_teleop":
@@ -73,10 +73,10 @@ def make_teleoperator_from_config(config: TeleoperatorConfig) -> Teleoperator:
         from .homunculus import HomunculusArm

         return HomunculusArm(config)
-    elif config.type == "bi_so100_leader":
-        from .bi_so100_leader import BiSO100Leader
+    elif config.type == "bi_so_leader":
+        from .bi_so_leader import BiSOLeader

-        return BiSO100Leader(config)
+        return BiSOLeader(config)
     elif config.type == "reachy2_teleoperator":
         from .reachy2_teleoperator import Reachy2Teleoperator
diff --git a/src/lerobot/utils/constants.py b/src/lerobot/utils/constants.py
index dfa10b2e5..43a61b4f7 100644
--- a/src/lerobot/utils/constants.py
+++ b/src/lerobot/utils/constants.py
@@ -28,9 +28,13 @@ OBS_LANGUAGE_TOKENS = OBS_LANGUAGE + ".tokens"
 OBS_LANGUAGE_ATTENTION_MASK = OBS_LANGUAGE + ".attention_mask"

 ACTION = "action"
+ACTION_PREFIX = ACTION + "."
+ACTION_TOKENS = ACTION + ".tokens"
+ACTION_TOKEN_MASK = ACTION + ".token_mask"
 REWARD = "next.reward"
 TRUNCATED = "next.truncated"
 DONE = "next.done"
+INFO = "info"

 ROBOTS = "robots"
 TELEOPERATORS = "teleoperators"
diff --git a/src/lerobot/utils/decorators.py b/src/lerobot/utils/decorators.py
new file mode 100644
index 000000000..8fc2f9a07
--- /dev/null
+++ b/src/lerobot/utils/decorators.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import wraps
+
+from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
+
+
+def check_if_not_connected(func):
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        if not self.is_connected:
+            raise DeviceNotConnectedError(
+                f"{self.__class__.__name__} is not connected. Run `.connect()` first."
+            )
+        return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+def check_if_already_connected(func):
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        if self.is_connected:
+            raise DeviceAlreadyConnectedError(f"{self.__class__.__name__} is already connected.")
+        return func(self, *args, **kwargs)
+
+    return wrapper
diff --git a/src/lerobot/utils/import_utils.py b/src/lerobot/utils/import_utils.py
index 3a01aee88..c33a73589 100644
--- a/src/lerobot/utils/import_utils.py
+++ b/src/lerobot/utils/import_utils.py
@@ -21,12 +21,23 @@ from typing import Any

 from draccus.choice_types import ChoiceRegistry


-def is_package_available(pkg_name: str, return_version: bool = False) -> tuple[bool, str] | bool:
-    """Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/utils/import_utils.py
-    Check if the package spec exists and grab its version to avoid importing a local directory.
-    **Note:** this doesn't work for all packages.
+def is_package_available(
+    pkg_name: str, import_name: str | None = None, return_version: bool = False
+) -> tuple[bool, str] | bool:
     """
-    package_exists = importlib.util.find_spec(pkg_name) is not None
+    Check if the package spec exists and grab its version to avoid importing a local directory.
+
+    Args:
+        pkg_name: The name of the package as installed via pip (e.g. "python-can").
+        import_name: The actual name used to import the package (e.g. "can").
+            Defaults to pkg_name if not provided.
+        return_version: Whether to return the version string.
+    """
+    if import_name is None:
+        import_name = pkg_name
+
+    # Check if the module spec exists using the import name
+    package_exists = importlib.util.find_spec(import_name) is not None
     package_version = "N/A"
     if package_exists:
         try:
@@ -37,7 +48,7 @@
             # Fallback method: Only for "torch" and versions containing "dev"
             if pkg_name == "torch":
                 try:
-                    package = importlib.import_module(pkg_name)
+                    package = importlib.import_module(import_name)
                     temp_version = getattr(package, "__version__", "N/A")
                     # Check if the version contains "dev"
                     if "dev" in temp_version:
@@ -48,9 +59,6 @@
                 except ImportError:
                     # If the package can't be imported, it's not available
                     package_exists = False
-            elif pkg_name == "grpc":
-                package = importlib.import_module(pkg_name)
-                package_version = getattr(package, "__version__", "N/A")
             else:
                 # For packages other than "torch", don't attempt the fallback and set as not available
                 package_exists = False
@@ -63,6 +71,9 @@

 _transformers_available = is_package_available("transformers")
 _peft_available = is_package_available("peft")
+_scipy_available = is_package_available("scipy")
+_reachy2_sdk_available = is_package_available("reachy2_sdk")
+_can_available = is_package_available("python-can", "can")


 def make_device_from_device_class(config: ChoiceRegistry) -> Any:
diff --git a/src/lerobot/utils/train_utils.py b/src/lerobot/utils/train_utils.py
index 3ebe31971..d8481f4b9 100644
--- a/src/lerobot/utils/train_utils.py
+++ b/src/lerobot/utils/train_utils.py
@@ -99,6 +99,10 @@ def save_checkpoint(
     pretrained_dir = checkpoint_dir / PRETRAINED_MODEL_DIR
     policy.save_pretrained(pretrained_dir)
     cfg.save_pretrained(pretrained_dir)
+    if cfg.peft is not None:
+        # When using PEFT, policy.save_pretrained will only write the adapter weights + config, not the
+        # policy config which we need for loading the model. In this case we'll write it ourselves.
+        policy.config.save_pretrained(pretrained_dir)
     if preprocessor is not None:
         preprocessor.save_pretrained(pretrained_dir)
     if postprocessor is not None:
diff --git a/src/lerobot/utils/visualization_utils.py b/src/lerobot/utils/visualization_utils.py
index 991b10247..31ca8d247 100644
--- a/src/lerobot/utils/visualization_utils.py
+++ b/src/lerobot/utils/visualization_utils.py
@@ -14,21 +14,34 @@

 import numbers
 import os
-from typing import Any

 import numpy as np
 import rerun as rr

-from .constants import OBS_PREFIX, OBS_STR
+from lerobot.processor import RobotAction, RobotObservation
+
+from .constants import ACTION, ACTION_PREFIX, OBS_PREFIX, OBS_STR


-def init_rerun(session_name: str = "lerobot_control_loop") -> None:
-    """Initializes the Rerun SDK for visualizing the control loop."""
+def init_rerun(
+    session_name: str = "lerobot_control_loop", ip: str | None = None, port: int | None = None
+) -> None:
+    """
+    Initializes the Rerun SDK for visualizing the control loop.
+
+    Args:
+        session_name: Name of the Rerun session.
+        ip: Optional IP for connecting to a Rerun server.
+        port: Optional port for connecting to a Rerun server.
+    """
     batch_size = os.getenv("RERUN_FLUSH_NUM_BYTES", "8000")
     os.environ["RERUN_FLUSH_NUM_BYTES"] = batch_size
     rr.init(session_name)

     memory_limit = os.getenv("LEROBOT_RERUN_MEMORY_LIMIT", "10%")
-    rr.spawn(memory_limit=memory_limit)
+    if ip and port:
+        rr.connect_grpc(url=f"rerun+http://{ip}:{port}/proxy")
+    else:
+        rr.spawn(memory_limit=memory_limit)


 def _is_scalar(x):
@@ -38,8 +51,9 @@

 def log_rerun_data(
-    observation: dict[str, Any] | None = None,
-    action: dict[str, Any] | None = None,
+    observation: RobotObservation | None = None,
+    action: RobotAction | None = None,
+    compress_images: bool = False,
 ) -> None:
     """
     Logs observation and action data to Rerun for real-time visualization.
@@ -48,7 +62,7 @@
     to the Rerun viewer. It handles different data types appropriately:
     - Scalars values (floats, ints) are logged as `rr.Scalars`.
     - 3D NumPy arrays that resemble images (e.g., with 1, 3, or 4 channels first) are transposed
-      from CHW to HWC format and logged as `rr.Image`.
+      from CHW to HWC format, (optionally) compressed to JPEG and logged as `rr.Image` or `rr.EncodedImage`.
     - 1D NumPy arrays are logged as a series of individual scalars, with each element indexed.
     - Other multi-dimensional arrays are flattened and logged as individual scalars.
@@ -57,6 +71,7 @@
     Args:
         observation: An optional dictionary containing observation data to log.
         action: An optional dictionary containing action data to log.
+        compress_images: Whether to compress images before logging to save bandwidth & memory in exchange for cpu and quality.
     """
     if observation:
         for k, v in observation.items():
@@ -75,13 +90,14 @@
                 for i, vi in enumerate(arr):
                     rr.log(f"{key}_{i}", rr.Scalars(float(vi)))
             else:
-                rr.log(key, rr.Image(arr), static=True)
+                img_entity = rr.Image(arr).compress() if compress_images else rr.Image(arr)
+                rr.log(key, entity=img_entity, static=True)

     if action:
         for k, v in action.items():
             if v is None:
                 continue
-            key = k if str(k).startswith("action.") else f"action.{k}"
+            key = k if str(k).startswith(ACTION_PREFIX) else f"{ACTION}.{k}"

             if _is_scalar(v):
                 rr.log(key, rr.Scalars(float(v)))
diff --git a/tests/async_inference/test_e2e.py b/tests/async_inference/test_e2e.py
index 11941ce32..54ca29b48 100644
--- a/tests/async_inference/test_e2e.py
+++ b/tests/async_inference/test_e2e.py
@@ -144,12 +144,18 @@ def test_async_inference_e2e(monkeypatch):
     client = RobotClient(client_config)
     assert client.start(), "Client failed initial handshake with the server"

-    # Track action chunks received without modifying RobotClient
-    action_chunks_received = {"count": 0}
+    # Track action chunks received and verify device type
+    action_chunks_received = {"count": 0, "actions_on_cpu": True}
     original_aggregate = client._aggregate_action_queues

     def counting_aggregate(*args, **kwargs):
         action_chunks_received["count"] += 1
+        # Check that all received actions are on CPU
+        if args:
+            for timed_action in args[0]:  # args[0] is the list of TimedAction
+                action_tensor = timed_action.get_action()
+                if action_tensor.device.type != "cpu":
+                    action_chunks_received["actions_on_cpu"] = False
         return original_aggregate(*args, **kwargs)

     monkeypatch.setattr(client, "_aggregate_action_queues", counting_aggregate)
diff --git a/tests/async_inference/test_policy_server.py b/tests/async_inference/test_policy_server.py
index 29583d4fa..c3ee37c8f 100644
--- a/tests/async_inference/test_policy_server.py
+++ b/tests/async_inference/test_policy_server.py
@@ -62,7 +62,7 @@ class MockPolicy:


 @pytest.fixture
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def policy_server():
     """Fresh `PolicyServer` instance with a stubbed-out policy model."""
     # Import only when the test actually runs (after decorator check)
diff --git a/tests/cameras/test_reachy2_camera.py b/tests/cameras/test_reachy2_camera.py
index 0b38e8b0b..14774bf38 100644
--- a/tests/cameras/test_reachy2_camera.py
+++ b/tests/cameras/test_reachy2_camera.py
@@ -20,6 +20,8 @@ from unittest.mock import MagicMock, patch
 import numpy as np
 import pytest

+pytest.importorskip("reachy2_sdk")
+
 from lerobot.cameras.reachy2_camera import Reachy2Camera, Reachy2CameraConfig
 from lerobot.utils.errors import DeviceNotConnectedError
@@ -127,24 +129,12 @@ def test_async_read(camera):

     try:
         img = camera.async_read()
-        assert camera.thread is not None
-        assert camera.thread.is_alive()
         assert isinstance(img, np.ndarray)
     finally:
         if camera.is_connected:
             camera.disconnect()


-def test_async_read_timeout(camera):
-    camera.connect()
-    try:
-        with pytest.raises(TimeoutError):
-            camera.async_read(timeout_ms=0)
-    finally:
-        if camera.is_connected:
-            camera.disconnect()
-
-
 def test_read_before_connect(camera):
     with pytest.raises(DeviceNotConnectedError):
         _ = camera.read()
diff --git a/tests/conftest.py b/tests/conftest.py
index 6f4cfdff0..0cc577600 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -28,7 +28,6 @@ pytest_plugins = [
     "tests.fixtures.files",
     "tests.fixtures.hub",
     "tests.fixtures.optimizers",
-    "tests.plugins.reachy2_sdk",
 ]
diff --git a/tests/datasets/test_aggregate.py b/tests/datasets/test_aggregate.py
b/tests/datasets/test_aggregate.py
index b710a3a4b..031c29d60 100644
--- a/tests/datasets/test_aggregate.py
+++ b/tests/datasets/test_aggregate.py
@@ -16,6 +16,7 @@

 from unittest.mock import patch

+import datasets
 import torch

 from lerobot.datasets.aggregate import aggregate_datasets
@@ -380,3 +381,147 @@ def test_video_timestamps_regression(tmp_path, lerobot_dataset_factory):
         for key in aggr_ds.meta.video_keys:
             assert key in item, f"Video key {key} missing from item {i}"
             assert item[key].shape[0] == 3, f"Expected 3 channels for video key {key}"
+
+
+def assert_image_schema_preserved(aggr_ds):
+    """Assert that the HuggingFace Image feature schema is preserved in aggregated parquet files.
+
+    This verifies the fix for a bug where image columns were written with a generic
+    struct schema {'bytes': Value('binary'), 'path': Value('string')} instead of
+    the proper Image() feature type, causing the HuggingFace Hub viewer to display
+    raw dict objects instead of image thumbnails.
+    """
+    image_keys = aggr_ds.meta.image_keys
+    if not image_keys:
+        return
+
+    # Check that parquet files have proper Image schema
+    data_dir = aggr_ds.root / "data"
+    parquet_files = list(data_dir.rglob("*.parquet"))
+    assert len(parquet_files) > 0, "No parquet files found in aggregated dataset"
+
+    for parquet_file in parquet_files:
+        # Load with HuggingFace datasets to check schema
+        ds = datasets.Dataset.from_parquet(str(parquet_file))
+
+        for image_key in image_keys:
+            feature = ds.features.get(image_key)
+            assert feature is not None, f"Image key '{image_key}' not found in parquet schema"
+            assert isinstance(feature, datasets.Image), (
+                f"Image key '{image_key}' should have Image() feature type, "
+                f"but got {type(feature).__name__}: {feature}. "
+                "This indicates image schema was not preserved during aggregation."
+            )
+
+
+def assert_image_frames_integrity(aggr_ds, ds_0, ds_1):
+    """Assert that image frames are correctly preserved after aggregation."""
+    image_keys = aggr_ds.meta.image_keys
+    if not image_keys:
+        return
+
+    def images_equal(img1, img2):
+        return torch.allclose(img1, img2)
+
+    # Test the section corresponding to the first dataset (ds_0)
+    for i in range(len(ds_0)):
+        assert aggr_ds[i]["index"] == i, (
+            f"Frame index at position {i} should be {i}, but got {aggr_ds[i]['index']}"
+        )
+        for key in image_keys:
+            assert images_equal(aggr_ds[i][key], ds_0[i][key]), (
+                f"Image frames at position {i} should be equal between aggregated and ds_0"
+            )
+
+    # Test the section corresponding to the second dataset (ds_1)
+    for i in range(len(ds_0), len(ds_0) + len(ds_1)):
+        assert aggr_ds[i]["index"] == i, (
+            f"Frame index at position {i} should be {i}, but got {aggr_ds[i]['index']}"
+        )
+        for key in image_keys:
+            assert images_equal(aggr_ds[i][key], ds_1[i - len(ds_0)][key]), (
+                f"Image frames at position {i} should be equal between aggregated and ds_1"
+            )
+
+
+def test_aggregate_image_datasets(tmp_path, lerobot_dataset_factory):
+    """Test aggregation of image-based datasets preserves HuggingFace Image schema.
+
+    This test specifically verifies that:
+    1. Image-based datasets can be aggregated correctly
+    2. The HuggingFace Image() feature type is preserved in parquet files
+    3. Image data integrity is maintained across aggregation
+    4. Images can be properly decoded after aggregation
+
+    This catches the bug where to_parquet_with_hf_images() was not passing
+    the features schema, causing image columns to be written as generic
+    struct types instead of Image() types.
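+
+    A quick manual check along the same lines (sketch only; the parquet path and
+    image key here are hypothetical):
+
+        import datasets
+        ds = datasets.Dataset.from_parquet("image_aggr/data/chunk-000/file-000.parquet")
+        assert isinstance(ds.features["observation.image"], datasets.Image)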
+ """ + ds_0_num_frames = 50 + ds_1_num_frames = 75 + ds_0_num_episodes = 2 + ds_1_num_episodes = 3 + + # Create two image-based datasets (use_videos=False) + ds_0 = lerobot_dataset_factory( + root=tmp_path / "image_0", + repo_id=f"{DUMMY_REPO_ID}_image_0", + total_episodes=ds_0_num_episodes, + total_frames=ds_0_num_frames, + use_videos=False, # Image-based dataset + ) + ds_1 = lerobot_dataset_factory( + root=tmp_path / "image_1", + repo_id=f"{DUMMY_REPO_ID}_image_1", + total_episodes=ds_1_num_episodes, + total_frames=ds_1_num_frames, + use_videos=False, # Image-based dataset + ) + + # Verify source datasets have image keys + assert len(ds_0.meta.image_keys) > 0, "ds_0 should have image keys" + assert len(ds_1.meta.image_keys) > 0, "ds_1 should have image keys" + + # Aggregate the datasets + aggregate_datasets( + repo_ids=[ds_0.repo_id, ds_1.repo_id], + roots=[ds_0.root, ds_1.root], + aggr_repo_id=f"{DUMMY_REPO_ID}_image_aggr", + aggr_root=tmp_path / "image_aggr", + ) + + # Load the aggregated dataset + with ( + patch("lerobot.datasets.lerobot_dataset.get_safe_version") as mock_get_safe_version, + patch("lerobot.datasets.lerobot_dataset.snapshot_download") as mock_snapshot_download, + ): + mock_get_safe_version.return_value = "v3.0" + mock_snapshot_download.return_value = str(tmp_path / "image_aggr") + aggr_ds = LeRobotDataset(f"{DUMMY_REPO_ID}_image_aggr", root=tmp_path / "image_aggr") + + # Verify aggregated dataset has image keys + assert len(aggr_ds.meta.image_keys) > 0, "Aggregated dataset should have image keys" + assert aggr_ds.meta.image_keys == ds_0.meta.image_keys, "Image keys should match source datasets" + + # Run standard aggregation assertions + expected_total_episodes = ds_0_num_episodes + ds_1_num_episodes + expected_total_frames = ds_0_num_frames + ds_1_num_frames + + assert_episode_and_frame_counts(aggr_ds, expected_total_episodes, expected_total_frames) + assert_dataset_content_integrity(aggr_ds, ds_0, ds_1) + assert_metadata_consistency(aggr_ds, ds_0, ds_1) + assert_episode_indices_updated_correctly(aggr_ds, ds_0, ds_1) + + # Image-specific assertions + assert_image_schema_preserved(aggr_ds) + assert_image_frames_integrity(aggr_ds, ds_0, ds_1) + + # Verify images can be accessed and have correct shape + sample_item = aggr_ds[0] + for image_key in aggr_ds.meta.image_keys: + img = sample_item[image_key] + assert isinstance(img, torch.Tensor), f"Image {image_key} should be a tensor" + assert img.dim() == 3, f"Image {image_key} should have 3 dimensions (C, H, W)" + assert img.shape[0] == 3, f"Image {image_key} should have 3 channels" + + assert_dataset_iteration_works(aggr_ds) diff --git a/tests/datasets/test_dataset_tools.py b/tests/datasets/test_dataset_tools.py index 3a4516fc8..35a369de9 100644 --- a/tests/datasets/test_dataset_tools.py +++ b/tests/datasets/test_dataset_tools.py @@ -29,7 +29,7 @@ from lerobot.datasets.dataset_tools import ( remove_feature, split_dataset, ) -from lerobot.scripts.lerobot_edit_dataset import convert_dataset_to_videos +from lerobot.scripts.lerobot_edit_dataset import convert_image_to_video_dataset @pytest.fixture @@ -1050,7 +1050,7 @@ def test_modify_features_preserves_file_structure(sample_dataset, tmp_path): assert "reward" in modified_dataset.meta.features -def test_convert_dataset_to_videos(tmp_path): +def test_convert_image_to_video_dataset(tmp_path): """Test converting lerobot/pusht_image dataset to video format.""" from lerobot.datasets.lerobot_dataset import LeRobotDataset @@ -1071,7 +1071,7 @@ def 
test_convert_dataset_to_videos(tmp_path): assert "observation.image" in source_dataset.meta.features # Convert to video dataset (only first 2 episodes for speed) - video_dataset = convert_dataset_to_videos( + video_dataset = convert_image_to_video_dataset( dataset=source_dataset, output_dir=output_dir, repo_id="lerobot/pusht_video", @@ -1113,7 +1113,7 @@ def test_convert_dataset_to_videos(tmp_path): shutil.rmtree(output_dir) -def test_convert_dataset_to_videos_subset_episodes(tmp_path): +def test_convert_image_to_video_dataset_subset_episodes(tmp_path): """Test converting only specific episodes from lerobot/pusht_image to video format.""" from lerobot.datasets.lerobot_dataset import LeRobotDataset @@ -1132,7 +1132,7 @@ def test_convert_dataset_to_videos_subset_episodes(tmp_path): # Convert only episode 0 to video (subset of loaded episodes) episode_indices = [0] - video_dataset = convert_dataset_to_videos( + video_dataset = convert_image_to_video_dataset( dataset=source_dataset, output_dir=output_dir, repo_id="lerobot/pusht_video_subset", diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 38fdc358d..27c51b3c4 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -31,8 +31,10 @@ from lerobot.configs.train import TrainPipelineConfig from lerobot.datasets.factory import make_dataset from lerobot.datasets.image_writer import image_array_to_pil_image from lerobot.datasets.lerobot_dataset import ( + VALID_VIDEO_CODECS, LeRobotDataset, MultiLeRobotDataset, + _encode_video_worker, ) from lerobot.datasets.utils import ( DEFAULT_CHUNK_SIZE, @@ -350,6 +352,65 @@ def test_image_array_to_pil_image_wrong_range_float_0_255(): image_array_to_pil_image(image) +def test_tmp_image_deletion(tmp_path, empty_lerobot_dataset_factory): + """Verify temporary image directories are removed for image features after saving episode.""" + # Image feature: images should be deleted after saving episode + image_key = "image" + features_image = { + image_key: {"dtype": "image", "shape": DUMMY_CHW, "names": ["channels", "height", "width"]} + } + ds_img = empty_lerobot_dataset_factory(root=tmp_path / "img", features=features_image) + ds_img.add_frame({"image": np.random.rand(*DUMMY_CHW), "task": "Dummy task"}) + ds_img.save_episode() + img_dir = ds_img._get_image_file_dir(0, image_key) + assert not img_dir.exists(), "Temporary image directory should be removed for image features" + + +def test_tmp_video_deletion(tmp_path, empty_lerobot_dataset_factory): + """Verify temporary image directories are removed for video encoding when `batch_encoding_size == 1`.""" + # Video feature: when batch_encoding_size == 1 temporary images should be deleted + vid_key = "video" + features_video = { + vid_key: {"dtype": "video", "shape": DUMMY_CHW, "names": ["channels", "height", "width"]} + } + + ds_vid = empty_lerobot_dataset_factory(root=tmp_path / "vid", features=features_video) + ds_vid.batch_encoding_size = 1 + ds_vid.add_frame({vid_key: np.random.rand(*DUMMY_CHW), "task": "Dummy task"}) + ds_vid.save_episode() + vid_img_dir = ds_vid._get_image_file_dir(0, vid_key) + assert not vid_img_dir.exists(), ( + "Temporary image directory should be removed when batch_encoding_size == 1" + ) + + +def test_tmp_mixed_deletion(tmp_path, empty_lerobot_dataset_factory): + """Verify temporary image directories are removed appropriately when both image and video features are present.""" + image_key = "image" + vid_key = "video" + features_mixed = { + image_key: {"dtype": "image", "shape": 
DUMMY_CHW, "names": ["channels", "height", "width"]}, + vid_key: {"dtype": "video", "shape": DUMMY_HWC, "names": ["height", "width", "channels"]}, + } + ds_mixed = empty_lerobot_dataset_factory( + root=tmp_path / "mixed", features=features_mixed, batch_encoding_size=2 + ) + ds_mixed.add_frame( + { + "image": np.random.rand(*DUMMY_CHW), + "video": np.random.rand(*DUMMY_HWC), + "task": "Dummy task", + } + ) + ds_mixed.save_episode() + img_dir = ds_mixed._get_image_file_dir(0, image_key) + vid_img_dir = ds_mixed._get_image_file_dir(0, vid_key) + assert not img_dir.exists(), "Temporary image directory should be removed for image features" + assert vid_img_dir.exists(), ( + "Temporary image directory should not be removed for video features when batch_encoding_size == 2" + ) + + # TODO(aliberts): # - [ ] test various attributes & state from init and create # - [ ] test init with episodes and check num_frames @@ -1292,3 +1353,300 @@ def test_frames_in_current_file_calculation(tmp_path, empty_lerobot_dataset_fact frame = loaded_dataset[idx] expected_ep = idx // frames_per_episode assert frame["episode_index"].item() == expected_ep + + +def test_encode_video_worker_forwards_vcodec(tmp_path): + """Test that _encode_video_worker correctly forwards the vcodec parameter to encode_video_frames.""" + from unittest.mock import patch + + from lerobot.datasets.utils import DEFAULT_IMAGE_PATH + + # Create the expected directory structure + video_key = "observation.images.laptop" + episode_index = 0 + frame_index = 0 + + fpath = DEFAULT_IMAGE_PATH.format( + image_key=video_key, episode_index=episode_index, frame_index=frame_index + ) + img_dir = tmp_path / Path(fpath).parent + img_dir.mkdir(parents=True, exist_ok=True) + + # Create a dummy image file + dummy_img = Image.new("RGB", (64, 64), color="red") + dummy_img.save(img_dir / "frame-000000.png") + + # Track what vcodec was passed to encode_video_frames + captured_kwargs = {} + + def mock_encode_video_frames(imgs_dir, video_path, fps, **kwargs): + captured_kwargs.update(kwargs) + # Create a dummy output file so the worker doesn't fail + Path(video_path).parent.mkdir(parents=True, exist_ok=True) + Path(video_path).touch() + + with patch("lerobot.datasets.lerobot_dataset.encode_video_frames", side_effect=mock_encode_video_frames): + # Test with h264 codec + _encode_video_worker(video_key, episode_index, tmp_path, fps=30, vcodec="h264") + + assert "vcodec" in captured_kwargs + assert captured_kwargs["vcodec"] == "h264" + + +def test_encode_video_worker_default_vcodec(tmp_path): + """Test that _encode_video_worker uses libsvtav1 as the default codec.""" + from unittest.mock import patch + + from lerobot.datasets.utils import DEFAULT_IMAGE_PATH + + # Create the expected directory structure + video_key = "observation.images.laptop" + episode_index = 0 + frame_index = 0 + + fpath = DEFAULT_IMAGE_PATH.format( + image_key=video_key, episode_index=episode_index, frame_index=frame_index + ) + img_dir = tmp_path / Path(fpath).parent + img_dir.mkdir(parents=True, exist_ok=True) + + # Create a dummy image file + dummy_img = Image.new("RGB", (64, 64), color="red") + dummy_img.save(img_dir / "frame-000000.png") + + # Track what vcodec was passed to encode_video_frames + captured_kwargs = {} + + def mock_encode_video_frames(imgs_dir, video_path, fps, **kwargs): + captured_kwargs.update(kwargs) + # Create a dummy output file so the worker doesn't fail + Path(video_path).parent.mkdir(parents=True, exist_ok=True) + Path(video_path).touch() + + with 
patch("lerobot.datasets.lerobot_dataset.encode_video_frames", side_effect=mock_encode_video_frames): + # Test with default codec (no vcodec specified) + _encode_video_worker(video_key, episode_index, tmp_path, fps=30) + + assert "vcodec" in captured_kwargs + assert captured_kwargs["vcodec"] == "libsvtav1" + + +def test_lerobot_dataset_vcodec_validation(): + """Test that LeRobotDataset validates the vcodec parameter.""" + # Test that invalid vcodec raises ValueError + with pytest.raises(ValueError, match="Invalid vcodec"): + LeRobotDataset.__new__(LeRobotDataset) # bypass __init__ to test validation directly + # Actually test via create since it's easier + LeRobotDataset.create( + repo_id="test/invalid_codec", + fps=30, + features={"observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}}, + vcodec="invalid_codec", + ) + + +def test_valid_video_codecs_constant(): + """Test that VALID_VIDEO_CODECS contains the expected codecs.""" + assert "h264" in VALID_VIDEO_CODECS + assert "hevc" in VALID_VIDEO_CODECS + assert "libsvtav1" in VALID_VIDEO_CODECS + assert len(VALID_VIDEO_CODECS) == 3 + + +def test_delta_timestamps_with_episodes_filter(tmp_path, empty_lerobot_dataset_factory): + """Regression test for bug where delta_timestamps incorrectly marked all frames as padded when using episodes filter. + + The bug occurred because _get_query_indices was using the relative index (idx) in the filtered dataset + instead of the absolute index when comparing against episode boundaries (ep_start, ep_end). + """ + features = { + "observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}, + "action": {"dtype": "float32", "shape": (2,), "names": ["vx", "vy"]}, + } + + dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features, use_videos=False) + + # Create 3 episodes with 10 frames each + frames_per_episode = 10 + for ep_idx in range(3): + for frame_idx in range(frames_per_episode): + dataset.add_frame( + { + "observation.state": torch.tensor([ep_idx, frame_idx], dtype=torch.float32), + "action": torch.randn(2), + "task": f"task_{ep_idx}", + } + ) + dataset.save_episode() + dataset.finalize() + + # Load only episode 1 (middle episode) with delta_timestamps + delta_ts = {"observation.state": [0.0]} # Just the current frame + filtered_dataset = LeRobotDataset( + dataset.repo_id, + root=dataset.root, + episodes=[1], + delta_timestamps=delta_ts, + ) + + # Verify the filtered dataset has the correct length + assert len(filtered_dataset) == frames_per_episode + + # Check that no frames are marked as padded (since delta=0 should always be valid) + for idx in range(len(filtered_dataset)): + frame = filtered_dataset[idx] + assert frame["observation.state_is_pad"].item() is False, f"Frame {idx} incorrectly marked as padded" + # Verify we're getting data from episode 1 + assert frame["episode_index"].item() == 1 + + +def test_delta_timestamps_padding_at_episode_boundaries(tmp_path, empty_lerobot_dataset_factory): + """Test that delta_timestamps correctly marks padding at episode boundaries when using episodes filter.""" + features = { + "observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}, + "action": {"dtype": "float32", "shape": (2,), "names": ["vx", "vy"]}, + } + + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "test", features=features, use_videos=False, fps=10 + ) + + # Create 3 episodes with 5 frames each + frames_per_episode = 5 + for ep_idx in range(3): + for frame_idx in range(frames_per_episode): + 
dataset.add_frame( + { + "observation.state": torch.tensor([ep_idx, frame_idx], dtype=torch.float32), + "action": torch.randn(2), + "task": f"task_{ep_idx}", + } + ) + dataset.save_episode() + dataset.finalize() + + # Load only episode 1 with delta_timestamps that go beyond episode boundaries + # fps=10, so 0.1s = 1 frame offset + delta_ts = {"observation.state": [-0.2, -0.1, 0.0, 0.1, 0.2]} # -2, -1, 0, +1, +2 frames + filtered_dataset = LeRobotDataset( + dataset.repo_id, + root=dataset.root, + episodes=[1], + delta_timestamps=delta_ts, + tolerance_s=0.04, # Slightly less than half a frame at 10fps + ) + + assert len(filtered_dataset) == frames_per_episode + + # Check padding at the start of the episode (first frame) + first_frame = filtered_dataset[0] + is_pad = first_frame["observation.state_is_pad"].tolist() + # At frame 0 of episode 1: delta -2 and -1 should be padded, 0, +1, +2 should not + assert is_pad == [True, True, False, False, False], f"First frame padding incorrect: {is_pad}" + + # Check middle frame (no padding expected) + mid_frame = filtered_dataset[2] + is_pad = mid_frame["observation.state_is_pad"].tolist() + assert is_pad == [False, False, False, False, False], f"Middle frame padding incorrect: {is_pad}" + + # Check padding at the end of the episode (last frame) + last_frame = filtered_dataset[4] + is_pad = last_frame["observation.state_is_pad"].tolist() + # At frame 4 of episode 1: delta -2, -1, 0 should not be padded, +1, +2 should be + assert is_pad == [False, False, False, True, True], f"Last frame padding incorrect: {is_pad}" + + +def test_delta_timestamps_multiple_episodes_filter(tmp_path, empty_lerobot_dataset_factory): + """Test delta_timestamps with multiple non-consecutive episodes selected.""" + features = { + "observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}, + } + + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "test", features=features, use_videos=False, fps=10 + ) + + # Create 5 episodes with 5 frames each + frames_per_episode = 5 + for ep_idx in range(5): + for frame_idx in range(frames_per_episode): + dataset.add_frame( + { + "observation.state": torch.tensor([ep_idx, frame_idx], dtype=torch.float32), + "task": f"task_{ep_idx}", + } + ) + dataset.save_episode() + dataset.finalize() + + # Load episodes 1 and 3 (non-consecutive) + delta_ts = {"observation.state": [0.0]} + filtered_dataset = LeRobotDataset( + dataset.repo_id, + root=dataset.root, + episodes=[1, 3], + delta_timestamps=delta_ts, + ) + + assert len(filtered_dataset) == 2 * frames_per_episode + + # All frames should have valid (non-padded) data for delta=0 + for idx in range(len(filtered_dataset)): + frame = filtered_dataset[idx] + assert frame["observation.state_is_pad"].item() is False + + # Verify we're getting the correct episodes + episode_indices = [filtered_dataset[i]["episode_index"].item() for i in range(len(filtered_dataset))] + expected_episodes = [1] * frames_per_episode + [3] * frames_per_episode + assert episode_indices == expected_episodes + + +def test_delta_timestamps_query_returns_correct_values(tmp_path, empty_lerobot_dataset_factory): + """Test that delta_timestamps returns the correct observation values, not just correct padding.""" + features = { + "observation.state": {"dtype": "float32", "shape": (1,), "names": ["x"]}, + } + + dataset = empty_lerobot_dataset_factory( + root=tmp_path / "test", features=features, use_videos=False, fps=10 + ) + + # Create 2 episodes with known values + # Episode 0: frames with values 0, 1, 2, 3, 4 + 
# Episode 1: frames with values 10, 11, 12, 13, 14 + frames_per_episode = 5 + for ep_idx in range(2): + for frame_idx in range(frames_per_episode): + value = ep_idx * 10 + frame_idx + dataset.add_frame( + { + "observation.state": torch.tensor([value], dtype=torch.float32), + "task": f"task_{ep_idx}", + } + ) + dataset.save_episode() + dataset.finalize() + + # Load episode 1 with delta that looks at previous frame + delta_ts = {"observation.state": [-0.1, 0.0]} # Previous frame and current frame + filtered_dataset = LeRobotDataset( + dataset.repo_id, + root=dataset.root, + episodes=[1], + delta_timestamps=delta_ts, + tolerance_s=0.04, + ) + + # Check frame 2 of episode 1 (which has absolute index 7, value 12) + frame = filtered_dataset[2] + state_values = frame["observation.state"].tolist() + # Should get [11, 12] - the previous and current values within episode 1 + assert state_values == [11.0, 12.0], f"Expected [11.0, 12.0], got {state_values}" + + # Check first frame - previous frame should be clamped to episode start (padded) + first_frame = filtered_dataset[0] + state_values = first_frame["observation.state"].tolist() + is_pad = first_frame["observation.state_is_pad"].tolist() + # Previous frame is outside episode, so it's clamped to first frame and marked as padded + assert state_values == [10.0, 10.0], f"Expected [10.0, 10.0], got {state_values}" + assert is_pad == [True, False], f"Expected [True, False], got {is_pad}" diff --git a/tests/mocks/mock_robot.py b/tests/mocks/mock_robot.py index b0513fd38..f69a2c02a 100644 --- a/tests/mocks/mock_robot.py +++ b/tests/mocks/mock_robot.py @@ -17,12 +17,12 @@ import random from dataclasses import dataclass, field from functools import cached_property -from typing import Any from lerobot.cameras import CameraConfig, make_cameras_from_configs from lerobot.motors.motors_bus import Motor, MotorNormMode +from lerobot.processor import RobotAction, RobotObservation from lerobot.robots import Robot, RobotConfig -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected from tests.mocks.mock_motors_bus import MockMotorsBus @@ -98,10 +98,8 @@ class MockRobot(Robot): def is_connected(self) -> bool: return self._is_connected + @check_if_already_connected def connect(self, calibrate: bool = True) -> None: - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") - self._is_connected = True if calibrate: self.calibrate() @@ -110,19 +108,15 @@ class MockRobot(Robot): def is_calibrated(self) -> bool: return self._is_calibrated + @check_if_not_connected def calibrate(self) -> None: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self._is_calibrated = True def configure(self) -> None: pass - def get_observation(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def get_observation(self) -> RobotObservation: if self.config.random_values: return {f"{motor}.pos": random.uniform(-100, 100) for motor in self.motors} else: @@ -130,14 +124,10 @@ class MockRobot(Robot): f"{motor}.pos": val for motor, val in zip(self.motors, self.config.static_values, strict=True) } - def send_action(self, action: dict[str, Any]) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def send_action(self, action: RobotAction) 
-> RobotAction: return action + @check_if_not_connected def disconnect(self) -> None: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self._is_connected = False diff --git a/tests/mocks/mock_teleop.py b/tests/mocks/mock_teleop.py index 71b49947c..89174dadf 100644 --- a/tests/mocks/mock_teleop.py +++ b/tests/mocks/mock_teleop.py @@ -19,8 +19,9 @@ from dataclasses import dataclass from functools import cached_property from typing import Any +from lerobot.processor import RobotAction from lerobot.teleoperators import Teleoperator, TeleoperatorConfig -from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError +from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected @TeleoperatorConfig.register_subclass("mock_teleop") @@ -67,10 +68,8 @@ class MockTeleop(Teleoperator): def is_connected(self) -> bool: return self._is_connected + @check_if_already_connected def connect(self, calibrate: bool = True) -> None: - if self.is_connected: - raise DeviceAlreadyConnectedError(f"{self} already connected") - self._is_connected = True if calibrate: self.calibrate() @@ -79,19 +78,15 @@ class MockTeleop(Teleoperator): def is_calibrated(self) -> bool: return self._is_calibrated + @check_if_not_connected def calibrate(self) -> None: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self._is_calibrated = True def configure(self) -> None: pass - def get_action(self) -> dict[str, Any]: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - + @check_if_not_connected + def get_action(self) -> RobotAction: if self.config.random_values: return {f"{motor}.pos": random.uniform(-100, 100) for motor in self.motors} else: @@ -99,12 +94,9 @@ class MockTeleop(Teleoperator): f"{motor}.pos": val for motor, val in zip(self.motors, self.config.static_values, strict=True) } - def send_feedback(self, feedback: dict[str, Any]) -> None: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") + @check_if_not_connected + def send_feedback(self, feedback: dict[str, Any]) -> None: ... + @check_if_not_connected def disconnect(self) -> None: - if not self.is_connected: - raise DeviceNotConnectedError(f"{self} is not connected.") - self._is_connected = False diff --git a/tests/plugins/reachy2_sdk.py b/tests/plugins/reachy2_sdk.py deleted file mode 100644 index 457fcf0f9..000000000 --- a/tests/plugins/reachy2_sdk.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import types -from unittest.mock import MagicMock - - -def _install_reachy2_sdk_stub(): - sdk = types.ModuleType("reachy2_sdk") - sdk.__path__ = [] - sdk.ReachySDK = MagicMock(name="ReachySDK") - - media = types.ModuleType("reachy2_sdk.media") - media.__path__ = [] - camera = types.ModuleType("reachy2_sdk.media.camera") - camera.CameraView = MagicMock(name="CameraView") - camera_manager = types.ModuleType("reachy2_sdk.media.camera_manager") - camera_manager.CameraManager = MagicMock(name="CameraManager") - - sdk.media = media - media.camera = camera - media.camera_manager = camera_manager - - # Register in sys.modules - sys.modules.setdefault("reachy2_sdk", sdk) - sys.modules.setdefault("reachy2_sdk.media", media) - sys.modules.setdefault("reachy2_sdk.media.camera", camera) - sys.modules.setdefault("reachy2_sdk.media.camera_manager", camera_manager) - - -def pytest_sessionstart(session): - _install_reachy2_sdk_stub() diff --git a/tests/policies/pi0_fast/test_pi0_fast_original_vs_lerobot.py b/tests/policies/pi0_fast/test_pi0_fast_original_vs_lerobot.py new file mode 100644 index 000000000..9ebc4ba89 --- /dev/null +++ b/tests/policies/pi0_fast/test_pi0_fast_original_vs_lerobot.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
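+
+# The EXPECTED_* constants below are reference values captured from the original
+# PI0Fast implementation; the tests compare LeRobot outputs against them.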
+ +"""Test script to verify PI0Fast policy integration with LeRobot vs the original implementation""" +# ruff: noqa: E402 + +import os +import random +from copy import deepcopy +from typing import Any + +import numpy as np +import pytest +import torch + +pytest.importorskip("transformers") +pytest.importorskip("scipy") +pytestmark = pytest.mark.skipif( + os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true", + reason="This test requires accepting the model license", +) + +from lerobot.policies.pi0_fast.configuration_pi0_fast import PI0FastConfig +from lerobot.policies.pi0_fast.modeling_pi0_fast import PI0FastPolicy +from lerobot.policies.pi0_fast.processor_pi0_fast import make_pi0_fast_pre_post_processors +from lerobot.processor import PolicyAction, PolicyProcessorPipeline # noqa: E402 +from lerobot.utils.constants import ( + ACTION_TOKEN_MASK, + ACTION_TOKENS, + OBS_IMAGES, + OBS_LANGUAGE_ATTENTION_MASK, + OBS_LANGUAGE_TOKENS, + OBS_STATE, +) # noqa: E402 +from tests.utils import require_cuda # noqa: E402 + +# Constants +DUMMY_ACTION_DIM = 7 +DUMMY_STATE_DIM = 20 +IMAGE_HEIGHT = 224 +IMAGE_WIDTH = 224 +NUM_VIEWS = 2 # Number of camera views +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +MODEL_PATH_LEROBOT = "lerobot/pi0fast-base" + +# Expected action token shape: (batch_size, max_decoding_steps) +EXPECTED_ACTION_TOKENS_SHAPE = (1, 2) + +# Expected first 5 action tokens (for reproducibility check) +EXPECTED_ACTION_TOKENS_FIRST_5 = torch.tensor([255657, 255362]) + +# Expected actions after detokenization +EXPECTED_ACTIONS_SHAPE = (1, 2, 32) # (batch_size, n_action_steps, action_dim) +EXPECTED_ACTIONS_MEAN = 0.04419417306780815 +EXPECTED_ACTIONS_STD = 0.26231569051742554 +EXPECTED_ACTIONS_FIRST_5 = torch.tensor([0.0000, 1.4849, 0.0000, 0.0000, 0.0000]) + + +def set_seed_all(seed: int): + """Set random seed for all RNG sources to ensure reproducibility.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + # Set deterministic behavior + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + +def instantiate_lerobot_pi0_fast( + from_pretrained: bool = False, + model_path: str = MODEL_PATH_LEROBOT, +) -> tuple[ + Any, # Policy + PolicyProcessorPipeline[dict[str, Any], dict[str, Any]], + PolicyProcessorPipeline[PolicyAction, PolicyAction], +]: + """Instantiate LeRobot PI0Fast policy with preprocessor and postprocessor.""" + if from_pretrained: + policy = PI0FastPolicy.from_pretrained( + pretrained_name_or_path=model_path, + strict=True, + ) + policy.config.validate_action_token_prefix = False + policy.config.max_action_tokens = 2 + policy.config.max_decoding_steps = 2 + policy.config.chunk_size = 2 + policy.config.n_action_steps = 2 + else: + config = PI0FastConfig( + n_action_steps=2, + max_action_dim=DUMMY_ACTION_DIM, + max_state_dim=DUMMY_STATE_DIM, + device=DEVICE, + validate_action_token_prefix=False, + max_action_tokens=2, + max_decoding_steps=2, + chunk_size=2, + ) + policy = PI0FastPolicy(config) + + policy.to(DEVICE) + policy.config.device = DEVICE + preprocessor, postprocessor = make_pi0_fast_pre_post_processors( + config=policy.config, + dataset_stats=None, # Pass None for dataset_stats to disable normalization + ) + + return policy, preprocessor, postprocessor + + +def create_dummy_data(device=DEVICE): + """Create dummy data for testing both 
implementations.""" + batch_size = 1 + prompt = "Pick up the red block and place it in the bin" + + # Create random RGB images in [0, 255] uint8 range (as PIL images would be) + # Then convert to [0, 1] float32 range for LeRobot + def fake_rgb(h, w): + arr = np.random.randint(0, 255, (h, w, 3), dtype=np.uint8) + t = torch.from_numpy(arr).permute(2, 0, 1) # CHW + return t + + batch = { + f"{OBS_IMAGES}.base_0_rgb": torch.stack( + [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)] + ).to(device), + f"{OBS_IMAGES}.left_wrist_0_rgb": torch.stack( + [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)] + ).to(device), + f"{OBS_IMAGES}.right_wrist_0_rgb": torch.stack( + [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)] + ).to(device), + OBS_STATE: torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device), + "task": [prompt for _ in range(batch_size)], + } + + return batch + + +# Pytest fixtures +@pytest.fixture(scope="module") +def pi0_fast_components(): + """Fixture to instantiate and provide all PI0Fast components for tests.""" + print(f"\nTesting with DEVICE='{DEVICE}'") + print("\n[Setup] Instantiating LeRobot PI0Fast policy...") + policy_obj, preprocessor_obj, postprocessor_obj = instantiate_lerobot_pi0_fast(from_pretrained=True) + print("Model loaded successfully") + yield policy_obj, preprocessor_obj, postprocessor_obj + + +@pytest.fixture(scope="module") +def policy(pi0_fast_components): + """Fixture to provide the PI0Fast policy for tests.""" + return pi0_fast_components[0] + + +@pytest.fixture(scope="module") +def preprocessor(pi0_fast_components): + """Fixture to provide the PI0Fast preprocessor for tests.""" + return pi0_fast_components[1] + + +@require_cuda +def test_pi0_fast_preprocessor_alignment(policy, preprocessor): + """Test that LeRobot PI0Fast preprocessor produces expected outputs.""" + print("\n" + "=" * 80) + print("Test: PI0Fast Preprocessor Outputs") + print("=" * 80) + + set_seed_all(42) + + print("\nCreating dummy data...") + batch = create_dummy_data() + + print("\n[LeRobot] Preprocessing...") + lerobot_observation = preprocessor(deepcopy(batch)) + + print("\nVerifying preprocessor outputs:") + print("-" * 80) + + # Expected keys from PI0Fast preprocessing + expected_keys = [ + "observation.images.base_0_rgb", + "observation.images.left_wrist_0_rgb", + "observation.images.right_wrist_0_rgb", + "observation.state", + "observation.language_tokens", + "observation.language_attention_mask", + ] + + for key in expected_keys: + if key in lerobot_observation: + shape = tuple(lerobot_observation[key].shape) + print(f"\nKey: {key}") + print(f"Shape: {shape}") + print(f"Dtype: {lerobot_observation[key].dtype}") + else: + print(f"\nKey '{key}' not found in inputs!") + + # Check language tokens shape + if "observation.language_tokens" in lerobot_observation: + lang_tokens = lerobot_observation["observation.language_tokens"] + print(f"\nLanguage tokens shape: {lang_tokens.shape}") + # Should have batch dimension and max_length from tokenizer + assert lang_tokens.dim() == 2, f"Expected 2D tensor, got {lang_tokens.dim()}D" + + print("\nPreprocessor outputs verified!") + + +@require_cuda +def test_pi0_fast_action_generation(policy, preprocessor): + """Test PI0Fast LeRobot implementation generates expected actions.""" + print("\n" + "=" * 80) + print("Test: PI0Fast Action Generation Against Expected Values") + print("=" * 80) + + set_seed_all(42) + + print("\nCreating dummy data...") + batch = create_dummy_data() + + 
print("\n[LeRobot] Running inference...") + lerobot_observation = preprocessor(deepcopy(batch)) + + # Reset seed for inference + torch.manual_seed(42) + with torch.no_grad(): + lerobot_actions = policy.predict_action_chunk(lerobot_observation) + lerobot_actions = lerobot_actions.float().cpu() + + print(f"LeRobot actions shape: {lerobot_actions.shape}") + print(f"LeRobot actions mean: {lerobot_actions.mean().item():.6f}") + print(f"LeRobot actions std: {lerobot_actions.std().item():.6f}") + print(f"LeRobot actions first 5: {lerobot_actions[0, 0, :5]}") + + print("\nExpected values (from original PI0Fast):") + print(f"Expected actions shape: {EXPECTED_ACTIONS_SHAPE}") + print(f"Expected actions mean: {EXPECTED_ACTIONS_MEAN:.6f}") + print(f"Expected actions std: {EXPECTED_ACTIONS_STD:.6f}") + print(f"Expected actions first 5: {EXPECTED_ACTIONS_FIRST_5}") + + print("\nAction Comparison:") + print("-" * 80) + + # Compare shapes + actual_shape = tuple(lerobot_actions.shape) + print(f"Actual shape: {actual_shape}") + + assert actual_shape == EXPECTED_ACTIONS_SHAPE, ( + f"Shape mismatch: {actual_shape} vs {EXPECTED_ACTIONS_SHAPE}" + ) + print(f"Shape matches: {actual_shape}") + + # Compare statistics + actual_mean = lerobot_actions.mean().item() + actual_std = lerobot_actions.std().item() + + print(f"\nMean: {actual_mean:.6f} (expected: {EXPECTED_ACTIONS_MEAN:.6f})") + print(f"Std: {actual_std:.6f} (expected: {EXPECTED_ACTIONS_STD:.6f})") + + # Compare first 5 actions + actual_first_5 = lerobot_actions[0, 0, :5] + print("\nFirst 5 actions comparison:") + print(f" Actual: {actual_first_5}") + print(f" Expected: {EXPECTED_ACTIONS_FIRST_5}") + + first_5_diff = torch.abs(actual_first_5 - EXPECTED_ACTIONS_FIRST_5) + print(f" Max diff: {first_5_diff.max().item():.6e}") + print(f" Mean diff: {first_5_diff.mean().item():.6e}") + + # Check with different tolerances + tolerances = [1e-5, 1e-4, 1e-3, 1e-2] + for tol in tolerances: + is_close = torch.allclose(actual_first_5, EXPECTED_ACTIONS_FIRST_5, atol=tol) + status = "Success" if is_close else "Failure" + print(f"{status}: First 5 actions close (atol={tol}): {is_close}") + + # Assert with reasonable tolerance + tolerance = 1e-3 + assert torch.allclose(actual_first_5, EXPECTED_ACTIONS_FIRST_5, atol=tolerance), ( + f"First 5 actions differ by more than tolerance ({tolerance})" + ) + print(f"\nSuccess: Actions match expected values within tolerance ({tolerance})!") + + print("\nAction generation test completed (values printed for reference)!") + + +@require_cuda +def test_pi0_fast_inference_reproducibility(policy, preprocessor): + """Test that PI0Fast inference is reproducible with the same seed.""" + print("\n" + "=" * 80) + print("Test: PI0Fast Inference Reproducibility") + print("=" * 80) + + print("\nCreating dummy data...") + batch = create_dummy_data() + + # First inference + print("\n[Run 1] Running inference...") + set_seed_all(42) + lerobot_observation = preprocessor(deepcopy(batch)) + with torch.no_grad(): + actions_1 = policy.predict_action_chunk(lerobot_observation) + actions_1 = actions_1.float().cpu() + + # Second inference with same seed + print("\n[Run 2] Running inference with same seed...") + set_seed_all(42) + lerobot_observation = preprocessor(deepcopy(batch)) + with torch.no_grad(): + actions_2 = policy.predict_action_chunk(lerobot_observation) + actions_2 = actions_2.float().cpu() + + print("\nComparing two runs:") + print("-" * 80) + if torch.allclose(actions_1, actions_2, atol=1e-8): + print("Inference is perfectly reproducible!") + 
else: + diff = torch.abs(actions_1 - actions_2) + print("Small differences detected:") + print(f" Max diff: {diff.max().item():.6e}") + print(f" Mean diff: {diff.mean().item():.6e}") + + assert torch.allclose(actions_1, actions_2, atol=1e-6), "Inference should be reproducible!" + + print("\nInference is reproducible!") + + +@require_cuda +def test_pi0_fast_forward_pass_logits(policy, preprocessor): + """Test PI0Fast forward pass and compare logits against expected values.""" + print("\n" + "=" * 80) + print("Test: PI0Fast Forward Pass Logits") + print("=" * 80) + + set_seed_all(42) + + print("\nCreating dummy data with action tokens...") + batch = create_dummy_data() + + # Preprocess the batch + lerobot_observation = preprocessor(deepcopy(batch)) + + # For forward pass, we need action tokens + # Create dummy action tokens for testing + batch_size = 1 + max_action_tokens = policy.config.max_action_tokens + + # Create dummy action tokens (in practice, these come from the FAST tokenizer) + dummy_action_tokens = torch.randint( + 0, 1000, (batch_size, max_action_tokens), dtype=torch.long, device=DEVICE + ) + dummy_action_masks = torch.ones(batch_size, max_action_tokens, dtype=torch.bool, device=DEVICE) + + # Add action tokens to the observation + lerobot_observation[ACTION_TOKENS] = dummy_action_tokens + lerobot_observation[ACTION_TOKEN_MASK] = dummy_action_masks + + print("\n[LeRobot] Running forward pass...") + policy.train() + with torch.no_grad(): + loss, loss_dict = policy.forward(lerobot_observation) + + print(f"Loss: {loss.item():.6f}") + print(f"FAST Loss: {loss_dict['ce_loss']:.6f}") + + print("\nForward pass completed successfully!") + print(f"Loss value: {loss.item():.6f}") + + # The loss should be a positive value + assert loss.item() > 0, "Loss should be positive" + assert not torch.isnan(loss), "Loss should not be NaN" + assert not torch.isinf(loss), "Loss should not be infinite" + + print("\nForward pass test passed!") + + +@require_cuda +def test_pi0_fast_action_token_sampling(policy, preprocessor): + """Test PI0Fast action token sampling (autoregressive decoding).""" + print("\n" + "=" * 80) + print("Test: PI0Fast Action Token Sampling") + print("=" * 80) + + set_seed_all(42) + + print("\nCreating dummy data...") + batch = create_dummy_data() + + print("\n[LeRobot] Preprocessing...") + lerobot_observation = preprocessor(deepcopy(batch)) + + # Prepare inputs for model + images, img_masks = policy._preprocess_images(lerobot_observation) + tokens = lerobot_observation[OBS_LANGUAGE_TOKENS] + masks = lerobot_observation[OBS_LANGUAGE_ATTENTION_MASK] + + print("\n[LeRobot] Sampling action tokens...") + torch.manual_seed(42) + with torch.no_grad(): + action_tokens = policy.model.sample_actions_fast( + images, + img_masks, + tokens, + masks, + max_decoding_steps=2, + temperature=0.0, # Greedy decoding for reproducibility + ) + + print(f"Action tokens shape: {action_tokens.shape}") + print(f"Action tokens first 10: {action_tokens[0, :10].tolist()}") + + print("\nExpected values (from original PI0Fast):") + print(f"Expected shape: {EXPECTED_ACTION_TOKENS_SHAPE}") + print(f"Expected first 5: {EXPECTED_ACTION_TOKENS_FIRST_5.tolist()}") + + # Verify shape + actual_shape = tuple(action_tokens.shape) + print(f"\nActual shape: {actual_shape}") + + assert actual_shape == EXPECTED_ACTION_TOKENS_SHAPE, ( + f"Shape mismatch: {actual_shape} vs {EXPECTED_ACTION_TOKENS_SHAPE}" + ) + + # Compare first 5 tokens + actual_first_5 = action_tokens[0, :5].cpu() + assert torch.equal(actual_first_5, 
EXPECTED_ACTION_TOKENS_FIRST_5), ( + f"First 5 tokens mismatch: {actual_first_5} vs {EXPECTED_ACTION_TOKENS_FIRST_5}" + ) + + print("\nAction token sampling test completed!") + + +@require_cuda +def test_pi0_fast_detokenization(policy, preprocessor): + """Test PI0Fast action detokenization (FAST decoding).""" + print("\n" + "=" * 80) + print("Test: PI0Fast Action Detokenization") + print("=" * 80) + + set_seed_all(42) + + print("\nCreating dummy data...") + batch = create_dummy_data() + + print("\n[LeRobot] Preprocessing...") + lerobot_observation = preprocessor(deepcopy(batch)) + + # Prepare inputs for model + images, img_masks = policy._preprocess_images(lerobot_observation) + tokens = lerobot_observation[OBS_LANGUAGE_TOKENS] + masks = lerobot_observation[OBS_LANGUAGE_ATTENTION_MASK] + + print("\n[LeRobot] Sampling action tokens...") + torch.manual_seed(42) + with torch.no_grad(): + action_tokens = policy.model.sample_actions_fast( + images, + img_masks, + tokens, + masks, + max_decoding_steps=2, + temperature=0.0, + ) + + print(f"Action tokens shape: {action_tokens.shape}") + + # Detokenize + print("\n[LeRobot] Detokenizing action tokens...") + action_horizon = policy.config.n_action_steps + action_dim = policy.config.output_features["action"].shape[0] + + try: + continuous_actions = policy.detokenize_actions( + action_tokens, action_horizon=action_horizon, action_dim=action_dim + ) + print(f"Continuous actions shape: {continuous_actions.shape}") + print(f"Continuous actions mean: {continuous_actions.mean().item():.6f}") + print(f"Continuous actions std: {continuous_actions.std().item():.6f}") + print(f"Continuous actions first 5: {continuous_actions[0, 0, :5]}") + print("\nDetokenization successful!") + except Exception as e: + print(f"\nDetokenization failed with error: {e}") + print("This may be expected if the action tokens are not valid FAST tokens.") + print("The test will pass as long as the sampling works correctly.") diff --git a/tests/policies/rtc/test_training_time_rtc.py b/tests/policies/rtc/test_training_time_rtc.py new file mode 100644 index 000000000..876b82a61 --- /dev/null +++ b/tests/policies/rtc/test_training_time_rtc.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
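+
+# Background: a sampled delay d marks the first d steps of an action chunk as an
+# already-executed prefix (flow-matching time forced to 0.0); only the remaining
+# postfix is still denoised.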
+ +"""Tests for training-time RTC helpers.""" + +import torch + +from lerobot.configs.types import RTCTrainingDelayDistribution +from lerobot.policies.rtc.configuration_rtc import RTCTrainingConfig +from lerobot.policies.rtc.training_time import apply_rtc_training_time, sample_rtc_delay + + +def test_rtc_training_config_defaults(): + config = RTCTrainingConfig() + assert config.enabled is False + assert config.min_delay == 0 + assert config.max_delay == 0 + assert config.delay_distribution == RTCTrainingDelayDistribution.UNIFORM + assert config.exp_decay == 1.0 + + +def test_sample_rtc_delay_uniform_range(): + cfg = RTCTrainingConfig(enabled=True, min_delay=1, max_delay=4) + delays = sample_rtc_delay(cfg, batch_size=100, device=torch.device("cpu")) + assert delays.min().item() >= 1 + assert delays.max().item() <= 4 + + +def test_apply_rtc_training_time_prefix_mask(): + time = torch.tensor([0.5]) + delays = torch.tensor([2]) + time_tokens, postfix_mask = apply_rtc_training_time(time, delays, seq_len=4) + assert time_tokens.shape == (1, 4) + assert postfix_mask.shape == (1, 4) + # Delay=2 means the first two steps are prefix (time forced to 0.0) and only the last two are postfix. + assert torch.allclose(time_tokens[0], torch.tensor([0.0, 0.0, 0.5, 0.5])) + assert torch.equal(postfix_mask[0], torch.tensor([False, False, True, True])) diff --git a/tests/processor/test_pipeline.py b/tests/processor/test_pipeline.py index 134228c05..58a83fe69 100644 --- a/tests/processor/test_pipeline.py +++ b/tests/processor/test_pipeline.py @@ -17,7 +17,7 @@ import json import tempfile from collections.abc import Callable -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import Any @@ -1884,7 +1884,7 @@ class FeatureContractAddStep(ProcessorStep): """Adds a PolicyFeature""" key: str = "a" - value: PolicyFeature = PolicyFeature(type=FeatureType.STATE, shape=(1,)) + value: PolicyFeature = field(default_factory=lambda: PolicyFeature(type=FeatureType.STATE, shape=(1,))) def __call__(self, transition: EnvTransition) -> EnvTransition: return transition diff --git a/tests/rl/test_actor.py b/tests/rl/test_actor.py index ec67f1889..54e4d2870 100644 --- a/tests/rl/test_actor.py +++ b/tests/rl/test_actor.py @@ -64,7 +64,7 @@ def close_service_stub(channel, server): server.stop(None) -@require_package("grpc") +@require_package("grpcio", "grpc") def test_establish_learner_connection_success(): from lerobot.rl.actor import establish_learner_connection @@ -81,7 +81,7 @@ def test_establish_learner_connection_success(): close_service_stub(channel, server) -@require_package("grpc") +@require_package("grpcio", "grpc") def test_establish_learner_connection_failure(): from lerobot.rl.actor import establish_learner_connection @@ -100,7 +100,7 @@ def test_establish_learner_connection_failure(): close_service_stub(channel, server) -@require_package("grpc") +@require_package("grpcio", "grpc") def test_push_transitions_to_transport_queue(): from lerobot.rl.actor import push_transitions_to_transport_queue from lerobot.transport.utils import bytes_to_transitions @@ -135,7 +135,7 @@ def test_push_transitions_to_transport_queue(): assert_transitions_equal(deserialized_transition, transitions[i]) -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(3) # force cross-platform watchdog def test_transitions_stream(): from lerobot.rl.actor import transitions_stream @@ -167,7 +167,7 @@ def test_transitions_stream(): assert streamed_data[2].data == 
b"transition_data_3" -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(3) # force cross-platform watchdog def test_interactions_stream(): from lerobot.rl.actor import interactions_stream diff --git a/tests/rl/test_actor_learner.py b/tests/rl/test_actor_learner.py index 5d95dee04..e13862d82 100644 --- a/tests/rl/test_actor_learner.py +++ b/tests/rl/test_actor_learner.py @@ -88,7 +88,7 @@ def cfg(): return cfg -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(10) # force cross-platform watchdog def test_end_to_end_transitions_flow(cfg): from lerobot.rl.actor import ( @@ -150,7 +150,7 @@ def test_end_to_end_transitions_flow(cfg): assert_transitions_equal(transition, input_transitions[i]) -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(10) def test_end_to_end_interactions_flow(cfg): from lerobot.rl.actor import ( @@ -223,7 +223,7 @@ def test_end_to_end_interactions_flow(cfg): assert received == expected -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.parametrize("data_size", ["small", "large"]) @pytest.mark.timeout(10) def test_end_to_end_parameters_flow(cfg, data_size): diff --git a/tests/rl/test_learner_service.py b/tests/rl/test_learner_service.py index e0f0292be..d967388f0 100644 --- a/tests/rl/test_learner_service.py +++ b/tests/rl/test_learner_service.py @@ -39,7 +39,7 @@ def learner_service_stub(): close_learner_service_stub(channel, server) -@require_package("grpc") +@require_package("grpcio", "grpc") def create_learner_service_stub( shutdown_event: Event, parameters_queue: Queue, @@ -75,7 +75,7 @@ def create_learner_service_stub( return services_pb2_grpc.LearnerServiceStub(channel), channel, server -@require_package("grpc") +@require_package("grpcio", "grpc") def close_learner_service_stub(channel, server): channel.close() server.stop(None) @@ -91,7 +91,7 @@ def test_ready_method(learner_service_stub): assert response == services_pb2.Empty() -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(3) # force cross-platform watchdog def test_send_interactions(): from lerobot.transport import services_pb2 @@ -135,7 +135,7 @@ def test_send_interactions(): assert interactions == [b"123", b"4", b"5", b"678"] -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(3) # force cross-platform watchdog def test_send_transitions(): from lerobot.transport import services_pb2 @@ -181,7 +181,7 @@ def test_send_transitions(): assert transitions == [b"transition_1transition_2transition_3", b"batch_1batch_2"] -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(3) # force cross-platform watchdog def test_send_transitions_empty_stream(): from lerobot.transport import services_pb2 @@ -209,7 +209,7 @@ def test_send_transitions_empty_stream(): assert transitions_queue.empty() -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(10) # force cross-platform watchdog def test_stream_parameters(): import time @@ -267,7 +267,7 @@ def test_stream_parameters(): assert time_diff == pytest.approx(seconds_between_pushes, abs=0.1) -@require_package("grpc") +@require_package("grpcio", "grpc") @pytest.mark.timeout(3) # force cross-platform watchdog def test_stream_parameters_with_shutdown(): from lerobot.transport import services_pb2 @@ -319,7 +319,7 @@ def test_stream_parameters_with_shutdown(): assert received_params == [b"param_batch_1", b"stop"] -@require_package("grpc") 
+@require_package("grpcio", "grpc") @pytest.mark.timeout(3) # force cross-platform watchdog def test_stream_parameters_waits_and_retries_on_empty_queue(): import threading diff --git a/tests/robots/test_reachy2.py b/tests/robots/test_reachy2.py index 94152ea38..d3c44bf5a 100644 --- a/tests/robots/test_reachy2.py +++ b/tests/robots/test_reachy2.py @@ -19,6 +19,8 @@ from unittest.mock import MagicMock, patch import numpy as np import pytest +pytest.importorskip("reachy2_sdk") + from lerobot.robots.reachy2 import ( REACHY2_ANTENNAS_JOINTS, REACHY2_L_ARM_JOINTS, diff --git a/tests/robots/test_so100_follower.py b/tests/robots/test_so100_follower.py index d76b9591a..b61d0ca01 100644 --- a/tests/robots/test_so100_follower.py +++ b/tests/robots/test_so100_follower.py @@ -19,7 +19,7 @@ from unittest.mock import MagicMock, patch import pytest -from lerobot.robots.so100_follower import ( +from lerobot.robots.so_follower import ( SO100Follower, SO100FollowerConfig, ) @@ -66,7 +66,7 @@ def follower(): with ( patch( - "lerobot.robots.so100_follower.so100_follower.FeetechMotorsBus", + "lerobot.robots.so_follower.so_follower.FeetechMotorsBus", side_effect=_bus_side_effect, ), patch.object(SO100Follower, "configure", lambda self: None), diff --git a/tests/test_cli_peft.py b/tests/test_cli_peft.py new file mode 100644 index 000000000..42fef4741 --- /dev/null +++ b/tests/test_cli_peft.py @@ -0,0 +1,235 @@ +import importlib +import os +from unittest.mock import MagicMock, patch + +import pytest +from safetensors.torch import load_file + +from .utils import require_package + +# Skip this entire module in CI +pytestmark = pytest.mark.skipif( + os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true", + reason="This test requires peft and is very slow, not meant for CI", +) + + +def run_command(cmd, module, args): + module = importlib.import_module(f"lerobot.scripts.{module}") + with patch("sys.argv", [cmd] + args): + module.main() + + +def lerobot_train(args): + return run_command(cmd="lerobot-train", module="lerobot_train", args=args) + + +def lerobot_record(args): + return run_command(cmd="lerobot-record", module="lerobot_record", args=args) + + +def resolve_model_id_for_peft_training(policy_type): + """PEFT training needs pretrained models, this finds the pretrained model of a policy type for PEFT training.""" + if policy_type == "smolvla": + return "lerobot/smolvla_base" + + raise ValueError(f"No pretrained model known for {policy_type}. 
+
+
+@pytest.mark.parametrize("policy_type", ["smolvla"])
+@require_package("peft")
+def test_peft_training_push_to_hub_works(policy_type, tmp_path):
+    """Ensure that push-to-hub stores only the PEFT adapter, not the full model weights."""
+    output_dir = tmp_path / f"output_{policy_type}"
+    upload_folder_contents = set()
+
+    model_id = resolve_model_id_for_peft_training(policy_type)
+
+    def mock_upload_folder(*args, **kwargs):
+        folder_path = kwargs["folder_path"]
+        # we include more than is actually uploaded since we ignore {allow,ignore}_patterns of upload_folder()
+        upload_folder_contents.update(os.listdir(folder_path))
+        return MagicMock()
+
+    with (
+        patch("huggingface_hub.HfApi.create_repo"),
+        patch("huggingface_hub.HfApi.upload_folder", mock_upload_folder),
+    ):
+        lerobot_train(
+            [
+                f"--policy.path={model_id}",
+                "--policy.push_to_hub=true",
+                "--policy.repo_id=foo/bar",
+                "--policy.input_features=null",
+                "--policy.output_features=null",
+                "--peft.method=LORA",
+                "--dataset.repo_id=lerobot/pusht",
+                "--dataset.episodes=[0, 1]",
+                "--steps=1",
+                f"--output_dir={output_dir}",
+            ]
+        )
+
+    assert "adapter_model.safetensors" in upload_folder_contents
+    assert "config.json" in upload_folder_contents
+    assert "adapter_config.json" in upload_folder_contents
+
+
+@pytest.mark.parametrize("policy_type", ["smolvla"])
+@require_package("peft")
+def test_peft_training_works(policy_type, tmp_path):
+    """Check whether the standard case of fine-tuning a (partially) pre-trained policy with PEFT works."""
+    output_dir = tmp_path / f"output_{policy_type}"
+    model_id = resolve_model_id_for_peft_training(policy_type)
+
+    lerobot_train(
+        [
+            f"--policy.path={model_id}",
+            "--policy.push_to_hub=false",
+            "--policy.input_features=null",
+            "--policy.output_features=null",
+            "--peft.method=LORA",
+            "--dataset.repo_id=lerobot/pusht",
+            "--dataset.episodes=[0, 1]",
+            "--steps=1",
+            f"--output_dir={output_dir}",
+        ]
+    )
+
+    policy_dir = output_dir / "checkpoints" / "last" / "pretrained_model"
+
+    for file in ["adapter_config.json", "adapter_model.safetensors", "config.json"]:
+        assert (policy_dir / file).exists()
+
+    # This is the default case where we fine-tune a pre-trained policy on new data.
+    # We assume that we target policy-specific modules but fully fine-tune action and state projections,
+    # so these must be part of the trained state dict.
+    state_dict = load_file(policy_dir / "adapter_model.safetensors")
+
+    adapted_keys = [
+        "state_proj",
+        "action_in_proj",
+        "action_out_proj",
+        "action_time_mlp_in",
+        "action_time_mlp_out",
+    ]
+
+    found_keys = [
+        module_key
+        for module_key in adapted_keys
+        for state_dict_key in state_dict
+        if f".{module_key}." in state_dict_key
+        params_total = sum(p.numel() for p in policy.parameters())
+        params_trainable = sum(p.numel() for p in policy.parameters() if p.requires_grad)
+
+        assert params_total > params_trainable
+
+        return train_metrics, {}
+
+    with patch("lerobot.scripts.lerobot_train.update_policy", dummy_update_policy):
+        lerobot_train(
+            [
+                f"--policy.path={model_id}",
+                "--policy.push_to_hub=false",
+                "--policy.input_features=null",
+                "--policy.output_features=null",
+                "--peft.method=LORA",
+                "--dataset.repo_id=lerobot/pusht",
+                "--dataset.episodes=[0, 1]",
+                "--steps=1",
+                f"--output_dir={output_dir}",
+            ]
+        )
+
+
+class DummyRobot:
+    name = "dummy"
+    cameras = []
+    action_features = {"foo": 1.0, "bar": 2.0}
+    observation_features = {"obs1": 1.0, "obs2": 2.0}
+    is_connected = True
+
+    def connect(self, *args):
+        pass
+
+    def disconnect(self):
+        pass
+
+
+def dummy_make_robot_from_config(*args, **kwargs):
+    return DummyRobot()
+
+
+@pytest.mark.parametrize("policy_type", ["smolvla"])
+@require_package("peft")
+def test_peft_record_loads_policy(policy_type, tmp_path):
+    """Train a policy with PEFT and attempt to load it with `lerobot-record`."""
+    from peft import PeftModel
+
+    output_dir = tmp_path / f"output_{policy_type}"
+    model_id = resolve_model_id_for_peft_training(policy_type)
+
+    lerobot_train(
+        [
+            f"--policy.path={model_id}",
+            "--policy.push_to_hub=false",
+            "--policy.input_features=null",
+            "--policy.output_features=null",
+            "--peft.method=LORA",
+            "--dataset.repo_id=lerobot/pusht",
+            "--dataset.episodes=[0, 1]",
+            "--steps=1",
+            f"--output_dir={output_dir}",
+        ]
+    )
+
+    policy_dir = output_dir / "checkpoints" / "last" / "pretrained_model"
+    dataset_dir = tmp_path / "eval_pusht"
+    single_task = "move the table"
+    loaded_policy = None
+
+    def dummy_record_loop(*args, **kwargs):
+        nonlocal loaded_policy
+
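+        # `record_loop` may also be invoked without a dataset (e.g. during a
+        # reset phase), so only capture the policy on the call that records.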
+ patch("lerobot.scripts.lerobot_record.record_loop", dummy_record_loop), + # disable speech output + patch("lerobot.utils.utils.say"), + ): + lerobot_record( + [ + f"--policy.path={policy_dir}", + "--robot.type=so101_follower", + "--robot.port=/dev/null", + "--dataset.repo_id=lerobot/eval_pusht", + f'--dataset.single_task="{single_task}"', + f"--dataset.root={dataset_dir}", + "--dataset.push_to_hub=false", + ] + ) + + assert isinstance(loaded_policy, PeftModel) diff --git a/tests/transport/test_transport_utils.py b/tests/transport/test_transport_utils.py index 52825a24e..63632a8f4 100644 --- a/tests/transport/test_transport_utils.py +++ b/tests/transport/test_transport_utils.py @@ -26,7 +26,7 @@ from lerobot.utils.transition import Transition from tests.utils import require_cuda, require_package -@require_package("grpc") +@require_package("grpcio", "grpc") def test_bytes_buffer_size_empty_buffer(): from lerobot.transport.utils import bytes_buffer_size @@ -37,7 +37,7 @@ def test_bytes_buffer_size_empty_buffer(): assert buffer.tell() == 0 -@require_package("grpc") +@require_package("grpcio", "grpc") def test_bytes_buffer_size_small_buffer(): from lerobot.transport.utils import bytes_buffer_size @@ -47,7 +47,7 @@ def test_bytes_buffer_size_small_buffer(): assert buffer.tell() == 0 -@require_package("grpc") +@require_package("grpcio", "grpc") def test_bytes_buffer_size_large_buffer(): from lerobot.transport.utils import CHUNK_SIZE, bytes_buffer_size @@ -58,7 +58,7 @@ def test_bytes_buffer_size_large_buffer(): assert buffer.tell() == 0 -@require_package("grpc") +@require_package("grpcio", "grpc") def test_send_bytes_in_chunks_empty_data(): from lerobot.transport.utils import send_bytes_in_chunks, services_pb2 @@ -68,7 +68,7 @@ def test_send_bytes_in_chunks_empty_data(): assert len(chunks) == 0 -@require_package("grpc") +@require_package("grpcio", "grpc") def test_single_chunk_small_data(): from lerobot.transport.utils import send_bytes_in_chunks, services_pb2 @@ -82,7 +82,7 @@ def test_single_chunk_small_data(): assert chunks[0].transfer_state == services_pb2.TransferState.TRANSFER_END -@require_package("grpc") +@require_package("grpcio", "grpc") def test_not_silent_mode(): from lerobot.transport.utils import send_bytes_in_chunks, services_pb2 @@ -94,7 +94,7 @@ def test_not_silent_mode(): assert chunks[0].data == b"Some data" -@require_package("grpc") +@require_package("grpcio", "grpc") def test_send_bytes_in_chunks_large_data(): from lerobot.transport.utils import CHUNK_SIZE, send_bytes_in_chunks, services_pb2 @@ -111,7 +111,7 @@ def test_send_bytes_in_chunks_large_data(): assert chunks[2].transfer_state == services_pb2.TransferState.TRANSFER_END -@require_package("grpc") +@require_package("grpcio", "grpc") def test_send_bytes_in_chunks_large_data_with_exact_chunk_size(): from lerobot.transport.utils import CHUNK_SIZE, send_bytes_in_chunks, services_pb2 @@ -124,7 +124,7 @@ def test_send_bytes_in_chunks_large_data_with_exact_chunk_size(): assert chunks[0].transfer_state == services_pb2.TransferState.TRANSFER_END -@require_package("grpc") +@require_package("grpcio", "grpc") def test_receive_bytes_in_chunks_empty_data(): from lerobot.transport.utils import receive_bytes_in_chunks @@ -138,7 +138,7 @@ def test_receive_bytes_in_chunks_empty_data(): assert queue.empty() -@require_package("grpc") +@require_package("grpcio", "grpc") def test_receive_bytes_in_chunks_single_chunk(): from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2 @@ -157,7 +157,7 @@ def 
+@require_package("grpcio", "grpc")
 def test_bytes_buffer_size_empty_buffer():
     from lerobot.transport.utils import bytes_buffer_size
@@ -37,7 +37,7 @@
     assert buffer.tell() == 0
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_bytes_buffer_size_small_buffer():
     from lerobot.transport.utils import bytes_buffer_size
@@ -47,7 +47,7 @@
     assert buffer.tell() == 0
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_bytes_buffer_size_large_buffer():
     from lerobot.transport.utils import CHUNK_SIZE, bytes_buffer_size
@@ -58,7 +58,7 @@
     assert buffer.tell() == 0
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_send_bytes_in_chunks_empty_data():
     from lerobot.transport.utils import send_bytes_in_chunks, services_pb2
@@ -68,7 +68,7 @@
     assert len(chunks) == 0
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_single_chunk_small_data():
     from lerobot.transport.utils import send_bytes_in_chunks, services_pb2
@@ -82,7 +82,7 @@
     assert chunks[0].transfer_state == services_pb2.TransferState.TRANSFER_END
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_not_silent_mode():
     from lerobot.transport.utils import send_bytes_in_chunks, services_pb2
@@ -94,7 +94,7 @@
     assert chunks[0].data == b"Some data"
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_send_bytes_in_chunks_large_data():
     from lerobot.transport.utils import CHUNK_SIZE, send_bytes_in_chunks, services_pb2
@@ -111,7 +111,7 @@
     assert chunks[2].transfer_state == services_pb2.TransferState.TRANSFER_END
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_send_bytes_in_chunks_large_data_with_exact_chunk_size():
     from lerobot.transport.utils import CHUNK_SIZE, send_bytes_in_chunks, services_pb2
@@ -124,7 +124,7 @@
     assert chunks[0].transfer_state == services_pb2.TransferState.TRANSFER_END
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_empty_data():
     from lerobot.transport.utils import receive_bytes_in_chunks
@@ -138,7 +138,7 @@
     assert queue.empty()
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_single_chunk():
     from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2
@@ -157,7 +157,7 @@ def test_receive_bytes_in_chunks_single_chunk():
     assert queue.empty()
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_single_not_end_chunk():
     from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2
@@ -175,7 +175,7 @@ def test_receive_bytes_in_chunks_single_not_end_chunk():
     assert queue.empty()
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_multiple_chunks():
     from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2
@@ -199,7 +199,7 @@ def test_receive_bytes_in_chunks_multiple_chunks():
     assert queue.empty()
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_multiple_messages():
     from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2
@@ -235,7 +235,7 @@
     assert queue.empty()
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_shutdown_during_receive():
     from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2
@@ -259,7 +259,7 @@
     assert queue.empty()
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_only_begin_chunk():
     from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2
@@ -279,7 +279,7 @@
     assert queue.empty()
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_missing_begin():
     from lerobot.transport.utils import receive_bytes_in_chunks, services_pb2
@@ -303,7 +303,7 @@
 # Tests for state_to_bytes and bytes_to_state_dict
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_state_to_bytes_empty_dict():
     from lerobot.transport.utils import bytes_to_state_dict, state_to_bytes
@@ -314,7 +314,7 @@
     assert reconstructed == state_dict
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_bytes_to_state_dict_empty_data():
     from lerobot.transport.utils import bytes_to_state_dict
@@ -323,7 +323,7 @@
         bytes_to_state_dict(b"")
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_state_to_bytes_simple_dict():
     from lerobot.transport.utils import bytes_to_state_dict, state_to_bytes
@@ -347,7 +347,7 @@
         assert torch.allclose(state_dict[key], reconstructed[key])
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_state_to_bytes_various_dtypes():
     from lerobot.transport.utils import bytes_to_state_dict, state_to_bytes
@@ -372,7 +372,7 @@
         assert torch.allclose(state_dict[key], reconstructed[key])
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_bytes_to_state_dict_invalid_data():
     from lerobot.transport.utils import bytes_to_state_dict
@@ -382,7 +382,7 @@
 @require_cuda
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_state_to_bytes_various_dtypes_cuda():
     from lerobot.transport.utils import bytes_to_state_dict, state_to_bytes
@@ -407,7 +407,7 @@
     assert torch.allclose(state_dict[key], reconstructed[key])
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_python_object_to_bytes_none():
     from lerobot.transport.utils import bytes_to_python_object, python_object_to_bytes
@@ -439,7 +439,7 @@
         (1, 2, 3),
     ],
 )
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_python_object_to_bytes_simple_types(obj):
     from lerobot.transport.utils import bytes_to_python_object, python_object_to_bytes
@@ -450,7 +450,7 @@
     assert type(reconstructed) is type(obj)
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_python_object_to_bytes_with_tensors():
     from lerobot.transport.utils import bytes_to_python_object, python_object_to_bytes
@@ -475,7 +475,7 @@
     assert torch.equal(obj["nested"]["tensor2"], reconstructed["nested"]["tensor2"])
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_transitions_to_bytes_empty_list():
     from lerobot.transport.utils import bytes_to_transitions, transitions_to_bytes
@@ -487,7 +487,7 @@
     assert isinstance(reconstructed, list)
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_transitions_to_bytes_single_transition():
     from lerobot.transport.utils import bytes_to_transitions, transitions_to_bytes
@@ -509,7 +509,7 @@
     assert_transitions_equal(transitions[0], reconstructed[0])
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def assert_transitions_equal(t1: Transition, t2: Transition):
     """Helper to assert two transitions are equal."""
     assert_observation_equal(t1["state"], t2["state"])
@@ -519,7 +519,7 @@
     assert_observation_equal(t1["next_state"], t2["next_state"])
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def assert_observation_equal(o1: dict, o2: dict):
     """Helper to assert two observations are equal."""
     assert set(o1.keys()) == set(o2.keys())
@@ -527,7 +527,7 @@
         assert torch.allclose(o1[key], o2[key])
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_transitions_to_bytes_multiple_transitions():
     from lerobot.transport.utils import bytes_to_transitions, transitions_to_bytes
@@ -551,7 +551,7 @@
         assert_transitions_equal(original, reconstructed_item)
 
-@require_package("grpc")
+@require_package("grpcio", "grpc")
 def test_receive_bytes_in_chunks_unknown_state():
     from lerobot.transport.utils import receive_bytes_in_chunks
diff --git a/tests/utils.py b/tests/utils.py
index 800b7d4b3..38841db02 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -167,7 +167,7 @@ def require_package_arg(func):
     return wrapper
 
 
-def require_package(package_name):
+def require_package(package_name, import_name=None):
     """
     Decorator that skips the test if the specified package is not installed.
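+    `import_name` is the module to import when it differs from the name of the
+    installed distribution (e.g. the grpcio distribution is imported as grpc).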
""" @@ -175,7 +175,7 @@ def require_package(package_name): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): - if not is_package_available(package_name): + if not is_package_available(pkg_name=package_name, import_name=import_name): pytest.skip(f"{package_name} not installed") return func(*args, **kwargs) diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 892503e97..4791caf58 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -82,6 +82,20 @@ def test_save_checkpoint(mock_save_training_state, tmp_path, optimizer): mock_save_training_state.assert_called_once() +@patch("lerobot.utils.train_utils.save_training_state") +def test_save_checkpoint_peft(mock_save_training_state, tmp_path, optimizer): + policy = Mock() + policy.config = Mock() + policy.config.save_pretrained = Mock() + cfg = Mock() + cfg.use_peft = True + save_checkpoint(tmp_path, 10, cfg, policy, optimizer) + policy.save_pretrained.assert_called_once() + cfg.save_pretrained.assert_called_once() + policy.config.save_pretrained.assert_called_once() + mock_save_training_state.assert_called_once() + + def test_save_training_state(tmp_path, optimizer, scheduler): save_training_state(tmp_path, 10, optimizer, scheduler) assert (tmp_path / TRAINING_STATE_DIR).is_dir() diff --git a/tests/utils/test_visualization_utils.py b/tests/utils/test_visualization_utils.py index f573de166..408f636cb 100644 --- a/tests/utils/test_visualization_utils.py +++ b/tests/utils/test_visualization_utils.py @@ -41,7 +41,10 @@ def mock_rerun(monkeypatch): def __init__(self, arr): self.arr = arr - def dummy_log(key, obj, **kwargs): + def dummy_log(key, obj=None, **kwargs): + # Accept either positional `obj` or keyword `entity` and record remaining kwargs. + if obj is None and "entity" in kwargs: + obj = kwargs.pop("entity") calls.append((key, obj, kwargs)) dummy_rr = SimpleNamespace(