mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-12 15:19:43 +00:00
Compare commits
129 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3dbf463f3b | |||
| 6139b133ca | |||
| bc7740a15d | |||
| 85de893fa7 | |||
| 03e009db78 | |||
| a4c66e530b | |||
| 0794b4ba8f | |||
| a225127527 | |||
| e489ba24fc | |||
| eb85d2d541 | |||
| d324ffe810 | |||
| d7537c85c5 | |||
| 1a24f770d3 | |||
| b2a32b8076 | |||
| cb296ee58f | |||
| 428ea89ff2 | |||
| b9cb947bd2 | |||
| 8440c561ae | |||
| 11cefed08a | |||
| 7bfedd1388 | |||
| 8c95a71c94 | |||
| 1d048c7e2b | |||
| 419305a4c2 | |||
| 23d04ca6fd | |||
| 753b996cda | |||
| 099f3ba4d7 | |||
| 3f3d08e5a8 | |||
| 9e1a67c862 | |||
| 54c38627bd | |||
| f0ef3717ca | |||
| bd8e1ccf70 | |||
| 6243b70239 | |||
| cc4377c346 | |||
| 50e13da845 | |||
| 7cca09d3da | |||
| ddfd573853 | |||
| 6d87b52abd | |||
| e10e352ca0 | |||
| d755df4461 | |||
| 0840dd6b94 | |||
| 9c4c988573 | |||
| f51e9fc1e4 | |||
| 802d73edec | |||
| 3dd51c335e | |||
| b8e36bb2f3 | |||
| 2f3b1b3db8 | |||
| 8f33a1392e | |||
| f3dc152e94 | |||
| 673e18ab46 | |||
| f4683127b2 | |||
| bdac9d7df8 | |||
| c71b30a06c | |||
| 238ea68382 | |||
| 2abb20262e | |||
| ef3b299145 | |||
| c95d151913 | |||
| 699191f81f | |||
| 6d9271940b | |||
| c24cbaacf9 | |||
| 0e7bfa5624 | |||
| fdcbd1a936 | |||
| dd39503802 | |||
| db17c08f6e | |||
| b1ab9b9c46 | |||
| 5c6714bc1b | |||
| 7581adf4cd | |||
| e9384ec834 | |||
| df52c559d3 | |||
| a011accb7f | |||
| b98ccdde3a | |||
| 473528cb14 | |||
| 713efb6427 | |||
| e268ec1ec5 | |||
| d75f3f8915 | |||
| 2cf13d9d63 | |||
| 8755bd0637 | |||
| 634e3924b8 | |||
| f5f9833540 | |||
| e2b47a142a | |||
| 2a3444a8dd | |||
| 3e5f31e0be | |||
| d653f96420 | |||
| 2b90763597 | |||
| 632c778a2b | |||
| 77dbc951cd | |||
| 5b9f98169c | |||
| 2128dece82 | |||
| b575632f4f | |||
| 8a2f5aa6cb | |||
| 25ecd16b67 | |||
| 1e049fbef7 | |||
| afe2c4d3aa | |||
| 534e143b0c | |||
| 23382c0ac5 | |||
| 4eda54c7fe | |||
| a632dd3af4 | |||
| e4a1b27fd3 | |||
| 71f359ca6e | |||
| 51dfee43f4 | |||
| 1f74982469 | |||
| 8e3a1e8945 | |||
| 43c335d0d7 | |||
| dd4ef1383f | |||
| f3823e8bcd | |||
| c398a146b3 | |||
| 9b47c5fac9 | |||
| 56dbeed89f | |||
| 67b1a9eeb1 | |||
| 86e0ee787d | |||
| ba968e84f1 | |||
| d49d3390f6 | |||
| f1ac454800 | |||
| 10cfc17705 | |||
| 5524a0d7a7 | |||
| 3a16a002f8 | |||
| 3b2a4f548c | |||
| b92dc82ddd | |||
| 103230c64c | |||
| cdacc090cd | |||
| 6f856016c5 | |||
| adabb37af6 | |||
| 55e19ff9a7 | |||
| 22714af08d | |||
| 46ebcc2f7d | |||
| a0d5a088e3 | |||
| 34499cbc1b | |||
| 8b9fada80f | |||
| ab97d5c019 | |||
| 14a7a4d7d4 |
@@ -44,7 +44,7 @@ permissions:
|
||||
# Sets up the environment variables
|
||||
env:
|
||||
UV_VERSION: "0.8.0"
|
||||
PYTHON_VERSION: "3.10"
|
||||
PYTHON_VERSION: "3.12"
|
||||
|
||||
# Ensures that only the latest commit for a PR or branch is built, canceling older runs.
|
||||
concurrency:
|
||||
@@ -91,6 +91,7 @@ jobs:
|
||||
run: uv sync --extra "test"
|
||||
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
uv run hf auth whoami
|
||||
|
||||
@@ -37,7 +37,7 @@ permissions:
|
||||
# Sets up the environment variables
|
||||
env:
|
||||
UV_VERSION: "0.8.0"
|
||||
PYTHON_VERSION: "3.10"
|
||||
PYTHON_VERSION: "3.12"
|
||||
DOCKER_IMAGE_NAME: huggingface/lerobot-gpu
|
||||
|
||||
# Ensures that only the latest action is built, canceling older runs.
|
||||
@@ -89,6 +89,7 @@ jobs:
|
||||
run: uv sync --extra all # TODO(Steven): Make flash-attn optional
|
||||
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
uv run hf auth whoami
|
||||
@@ -181,11 +182,12 @@ jobs:
|
||||
working-directory: /lerobot
|
||||
steps:
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
hf auth whoami
|
||||
- name: Fix ptxas permissions
|
||||
run: chmod +x /lerobot/.venv/lib/python3.10/site-packages/triton/backends/nvidia/bin/ptxas
|
||||
run: chmod +x /lerobot/.venv/lib/python3.12/site-packages/triton/backends/nvidia/bin/ptxas
|
||||
- name: Run pytest on GPU
|
||||
run: pytest tests -vv --maxfail=10
|
||||
- name: Run end-to-end tests
|
||||
|
||||
@@ -28,7 +28,7 @@ on:
|
||||
# Sets up the environment variables
|
||||
env:
|
||||
UV_VERSION: "0.8.0"
|
||||
PYTHON_VERSION: "3.10"
|
||||
PYTHON_VERSION: "3.12"
|
||||
DOCKER_IMAGE_NAME_CPU: huggingface/lerobot-cpu:latest
|
||||
DOCKER_IMAGE_NAME_GPU: huggingface/lerobot-gpu:latest
|
||||
|
||||
@@ -132,6 +132,7 @@ jobs:
|
||||
working-directory: /lerobot
|
||||
steps:
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
hf auth whoami
|
||||
@@ -164,6 +165,7 @@ jobs:
|
||||
working-directory: /lerobot
|
||||
steps:
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
hf auth whoami
|
||||
@@ -197,6 +199,7 @@ jobs:
|
||||
working-directory: /lerobot
|
||||
steps:
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
hf auth whoami
|
||||
@@ -206,5 +209,4 @@ jobs:
|
||||
python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'Number of GPUs: {torch.cuda.device_count()}')"
|
||||
|
||||
- name: Run multi-GPU training tests
|
||||
# TODO(Steven): Investigate why motors tests are failing in multi-GPU setup
|
||||
run: pytest tests -vv --maxfail=10 --ignore=tests/motors/
|
||||
run: pytest -vv tests/training/
|
||||
|
||||
@@ -50,7 +50,7 @@ jobs:
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: '3.10'
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Run pre-commit hooks
|
||||
uses: pre-commit/action@v3.0.1 # zizmor: ignore[unpinned-uses]
|
||||
|
||||
@@ -22,7 +22,7 @@ on:
|
||||
# Sets up the environment variables
|
||||
env:
|
||||
UV_VERSION: "0.8.0"
|
||||
PYTHON_VERSION: "3.10"
|
||||
PYTHON_VERSION: "3.12"
|
||||
|
||||
jobs:
|
||||
# This job builds the Python package and publishes it to PyPI
|
||||
@@ -45,7 +45,7 @@ jobs:
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: '3.10'
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract Version
|
||||
id: extract_info
|
||||
@@ -83,14 +83,6 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Remove Tags with Git dependencies
|
||||
# TODO(Steven): Temporary patch to remove pi from PyPi 0.4.0 release due to its reliance on git dependencies.
|
||||
run: |
|
||||
echo "::info:: Checking for Git dependencies to remove from pyproject.toml..."
|
||||
grep -E '@ git\+https|lerobot\[pi\]' pyproject.toml | sed 's/^/::warning:: Removing line: /' || true
|
||||
sed -E -i '/@ git\+https|lerobot\[pi\]/d' pyproject.toml
|
||||
echo "::info:: Git dependencies removed. Proceeding with build."
|
||||
|
||||
- name: Install build dependencies
|
||||
run: python -m pip install build
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ permissions:
|
||||
# Sets up the environment variables
|
||||
env:
|
||||
UV_VERSION: "0.8.0"
|
||||
PYTHON_VERSION: "3.10"
|
||||
PYTHON_VERSION: "3.12"
|
||||
DOCKER_IMAGE_NAME: huggingface/lerobot-gpu:unbound
|
||||
|
||||
# Ensures that only the latest action is built, canceling older runs.
|
||||
@@ -81,6 +81,7 @@ jobs:
|
||||
- name: Install lerobot with all extras
|
||||
run: uv sync --extra all # TODO(Steven): Make flash-attn optional
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
uv run hf auth whoami
|
||||
@@ -154,6 +155,7 @@ jobs:
|
||||
working-directory: /lerobot
|
||||
steps:
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
hf auth whoami
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
default_language_version:
|
||||
python: python3.10
|
||||
python: python3.12
|
||||
|
||||
exclude: "tests/artifacts/.*\\.safetensors$"
|
||||
|
||||
@@ -55,7 +55,7 @@ repos:
|
||||
rev: v3.21.0
|
||||
hooks:
|
||||
- id: pyupgrade
|
||||
args: [--py310-plus]
|
||||
args: [--py312-plus]
|
||||
|
||||
##### Markdown Quality #####
|
||||
- repo: https://github.com/rbubley/mirrors-prettier
|
||||
|
||||
@@ -100,11 +100,11 @@ lerobot-train \
|
||||
--dataset.repo_id=lerobot/aloha_mobile_cabinet
|
||||
```
|
||||
|
||||
| Category | Models |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| **Imitation Learning** | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md) |
|
||||
| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon) |
|
||||
| **VLAs Models** | [Pi0Fast](./docs/source/pi0fast.mdx), [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx) |
|
||||
| Category | Models |
|
||||
| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| **Imitation Learning** | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md), [Multitask DiT Policy](./docs/source/policy_multi_task_dit_README.md) |
|
||||
| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon) |
|
||||
| **VLAs Models** | [Pi0Fast](./docs/source/pi0fast.mdx), [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx) |
|
||||
|
||||
Similarly to the hardware, you can easily implement your own policy & leverage LeRobot's data collection, training, and visualization tools, and share your model to the HF Hub
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ ARG OS_VERSION=22.04
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
|
||||
|
||||
# Define Python version argument
|
||||
ARG PYTHON_VERSION=3.10
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
# Configure environment variables
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
# docker run -it --rm lerobot-user
|
||||
|
||||
# Configure the base image
|
||||
ARG PYTHON_VERSION=3.10
|
||||
ARG PYTHON_VERSION=3.12
|
||||
FROM python:${PYTHON_VERSION}-slim
|
||||
|
||||
# Configure environment variables
|
||||
|
||||
@@ -47,6 +47,8 @@
|
||||
title: NVIDIA GR00T N1.5
|
||||
- local: xvla
|
||||
title: X-VLA
|
||||
- local: multi_task_dit
|
||||
title: Multitask DiT Policy
|
||||
- local: walloss
|
||||
title: WALL-OSS
|
||||
title: "Policies"
|
||||
|
||||
@@ -32,7 +32,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
# your policy-specific dependencies
|
||||
]
|
||||
requires-python = ">= 3.11"
|
||||
requires-python = ">= 3.12"
|
||||
|
||||
[build-system]
|
||||
build-backend = # your-build-backend
|
||||
@@ -82,7 +82,7 @@ Create your policy implementation by inheriting from LeRobot's base `PreTrainedP
|
||||
# modeling_my_custom_policy.py
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Dict, Any
|
||||
from typing import Any
|
||||
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from .configuration_my_custom_policy import MyCustomPolicyConfig
|
||||
@@ -91,7 +91,7 @@ class MyCustomPolicy(PreTrainedPolicy):
|
||||
config_class = MyCustomPolicyConfig
|
||||
name = "my_custom_policy"
|
||||
|
||||
def __init__(self, config: MyCustomPolicyConfig, dataset_stats: Dict[str, Any] = None):
|
||||
def __init__(self, config: MyCustomPolicyConfig, dataset_stats: dict[str, Any] = None):
|
||||
super().__init__(config, dataset_stats)
|
||||
...
|
||||
```
|
||||
@@ -102,7 +102,7 @@ Create processor functions:
|
||||
|
||||
```python
|
||||
# processor_my_custom_policy.py
|
||||
from typing import Dict, Any
|
||||
from typing import Any
|
||||
import torch
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ The EarthRover Mini Plus is a fully open source mobile robot that connects throu
|
||||
### Hardware
|
||||
|
||||
- EarthRover Mini robot
|
||||
- Computer with Python 3.10 or newer
|
||||
- Computer with Python 3.12 or newer
|
||||
- Internet connection
|
||||
|
||||
### Setting Up the Frodobots SDK
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Installation
|
||||
|
||||
This guide uses conda (via miniforge) to manage environments. If you prefer another environment manager (e.g. `uv`, `venv`), ensure you have Python >=3.10 and ffmpeg installed with the `libsvtav1` encoder, then skip ahead to [Install LeRobot](#step-3-install-lerobot-).
|
||||
This guide uses conda (via miniforge) to manage environments. If you prefer another environment manager (e.g. `uv`, `venv`), ensure you have Python >=3.12 and ffmpeg installed with the `libsvtav1` encoder, then skip ahead to [Install LeRobot](#step-3-install-lerobot-).
|
||||
|
||||
## Step 1: Install [`miniforge`](https://conda-forge.org/download/)
|
||||
|
||||
@@ -11,10 +11,10 @@ bash Miniforge3-$(uname)-$(uname -m).sh
|
||||
|
||||
## Step 2: Environment Setup
|
||||
|
||||
Create a virtual environment with Python 3.10, using conda:
|
||||
Create a virtual environment with Python 3.12, using conda:
|
||||
|
||||
```bash
|
||||
conda create -y -n lerobot python=3.10
|
||||
conda create -y -n lerobot python=3.12
|
||||
```
|
||||
|
||||
Then activate your conda environment, you have to do this each time you open a shell to use lerobot:
|
||||
@@ -90,9 +90,6 @@ _Replace `[...]` with your desired features._
|
||||
For a full list of optional dependencies, see:
|
||||
https://pypi.org/project/lerobot/
|
||||
|
||||
> [!NOTE]
|
||||
> For lerobot 0.4.0, if you want to install pi, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
If you encounter build errors, you may need to install additional dependencies: `cmake`, `build-essential`, and `ffmpeg libs`.
|
||||
|
||||
@@ -0,0 +1,340 @@
|
||||
# Multitask DiT Policy
|
||||
|
||||
Multitask Diffusion Transformer (DiT) Policy is an evolution of the original Diffusion Policy architecture, which leverages a large DiT with text and vision conditioning for multitask robot learning. This implementation supports both diffusion and flow matching objectives for action generation, enabling robots to perform diverse manipulation tasks conditioned on language instructions.
|
||||
|
||||
## Model Overview
|
||||
|
||||
The model uses:
|
||||
|
||||
- **CLIP Vision Encoder**: Processes RGB images from multiple camera views
|
||||
- **CLIP Text Encoder**: Encodes language task instructions (frozen weights with learnable projection)
|
||||
- **Diffusion Transformer**: Predicts action sequences conditioned on observations and language
|
||||
- **Two Objectives**: Supports both diffusion (DDPM/DDIM) and flow matching for action generation
|
||||
|
||||
This model is exciting because you can achieve extremely high dexterity, competitive with multi-billion parameter
|
||||
VLAs, with only ~450M parameters and significantly less training.
|
||||
|
||||
## Installation Requirements
|
||||
|
||||
Multitask DiT Policy has additional dependencies. Install it with:
|
||||
|
||||
```bash
|
||||
pip install lerobot[multi_task_dit]
|
||||
```
|
||||
|
||||
This will install all necessary dependencies including the HuggingFace Transformers library for CLIP models.
|
||||
|
||||
## Usage
|
||||
|
||||
To use Multitask DiT in your LeRobot configuration, specify the policy type as:
|
||||
|
||||
```python
|
||||
policy.type=multi_task_dit
|
||||
```
|
||||
|
||||
## Training
|
||||
|
||||
### Basic Training Command
|
||||
|
||||
Here's a complete training command for training Multitask DiT on your dataset:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=YOUR_DATASET \
|
||||
--output_dir=./outputs/multitask_dit_training \
|
||||
--batch_size=32 \
|
||||
--steps=5000 \
|
||||
--save_freq=500 \
|
||||
--log_freq=100 \
|
||||
--policy.type=multi_task_dit \
|
||||
--policy.device=cuda \
|
||||
--policy.repo_id="HF_USER/multitask-dit-your-robot" \
|
||||
--wandb.enable=true
|
||||
```
|
||||
|
||||
### Recommended Hyperparameters and Dataset Details (30Hz Control Frequency)
|
||||
|
||||
For reliable performance, start with these suggested default hyperparameters:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=YOUR_DATASET \
|
||||
--output_dir=./outputs/mutitask_dit_training \
|
||||
--batch_size=320 \
|
||||
--steps=30000 \
|
||||
--policy.type=multi_task_dit \
|
||||
--policy.device=cuda \
|
||||
--policy.horizon=32 \
|
||||
--policy.n_action_steps=24 \
|
||||
--policy.objective=diffusion \
|
||||
--policy.noise_scheduler_type=DDPM \
|
||||
--policy.num_train_timesteps=100 \
|
||||
--policy.repo_id="HF_USER/multitask-dit-your-robot" \
|
||||
--wandb.enable=true
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
|
||||
- **Batch Size**: 192-320 - If you have access to a GPU that can support this, you will get the best training dynamics
|
||||
- **Horizon**: 32 - number of action steps to predict, ~1.0 sec at 30Hz
|
||||
- **n_action_steps**: 24 - ~0.8 seconds at 30Hz
|
||||
- **Objective**: `diffusion` - start with diffusion and experiment with flow matching if generation quality is poor
|
||||
- **Training Steps**: >30k steps recommended for a single task
|
||||
|
||||
### Training Configuration Parameters
|
||||
|
||||
#### Objective Selection
|
||||
|
||||
Choose between diffusion and flow matching:
|
||||
|
||||
```bash
|
||||
# Diffusion objective (default)
|
||||
--policy.objective=diffusion \
|
||||
--policy.noise_scheduler_type=DDPM \ # or "DDIM"
|
||||
--policy.num_train_timesteps=100 \
|
||||
--policy.num_inference_steps=10 \ # For faster inference
|
||||
--policy.beta_schedule=squaredcos_cap_v2 \ # Noise schedule type
|
||||
--policy.prediction_type=epsilon \ # "epsilon" (predict noise) or "sample" (predict clean)
|
||||
--policy.clip_sample=true \ # Clip samples during denoising
|
||||
--policy.clip_sample_range=1.0 # Clipping range [-x, x]
|
||||
|
||||
# Flow matching objective
|
||||
--policy.objective=flow_matching \
|
||||
--policy.timestep_sampling_strategy=beta \ # or "uniform" | the beta sampling strategy performance appears much better in practice
|
||||
--policy.num_integration_steps=100 \
|
||||
--policy.integration_method=euler \ # or "rk4"
|
||||
--policy.sigma_min=0.0 # Minimum noise in flow interpolation path
|
||||
```
|
||||
|
||||
#### Transformer Architecture
|
||||
|
||||
Adjust model capacity based on dataset size:
|
||||
|
||||
```bash
|
||||
# Small datasets (< 100 examples)
|
||||
--policy.num_layers=4 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.num_heads=8 # should ideally be hidden_dim // 64
|
||||
|
||||
# Medium datasets (100-5k examples) - default
|
||||
--policy.num_layers=6 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.num_heads=8 # should ideally be hidden_dim // 64
|
||||
|
||||
# Large datasets (> 5k examples)
|
||||
--policy.num_layers=8 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.num_heads=8 # should ideally be hidden_dim // 64
|
||||
```
|
||||
|
||||
**Positional Encoding Options:**
|
||||
|
||||
The model supports two positional encoding methods for action sequences:
|
||||
|
||||
```bash
|
||||
# Rotary Position Embedding (RoPE) - default, recommended
|
||||
--policy.use_rope=true \
|
||||
--policy.rope_base=10000.0 # Base frequency for RoPE
|
||||
|
||||
# Absolute positional encoding
|
||||
--policy.use_positional_encoding=true # Disables RoPE when true
|
||||
```
|
||||
|
||||
**Other Transformer Parameters:**
|
||||
|
||||
```bash
|
||||
--policy.dropout=0.1 # Dropout rate for DiT blocks (0.0-1.0)
|
||||
--policy.timestep_embed_dim=256 # Timestep embedding dimension
|
||||
```
|
||||
|
||||
#### Vision Encoder Configuration
|
||||
|
||||
```bash
|
||||
# Use different CLIP model for more expressivity at the cost of inference time
|
||||
# experiment with larger or smaller models depending on the complexity of your tasks and size of dataset
|
||||
--policy.vision_encoder_name=openai/clip-vit-large-patch14
|
||||
|
||||
# Use separate vision encoder per camera
|
||||
# This may be useful when cameras have significantly different characteristics, but
|
||||
# be wary of increased VRAM footprint.
|
||||
--policy.use_separate_rgb_encoder_per_camera=true
|
||||
|
||||
# Image preprocessing
|
||||
--policy.image_resize_shape=[XXX,YYY] \ # you may need to resize your images for inference speed ups
|
||||
--policy.image_crop_shape=[224,224] \
|
||||
--policy.image_crop_is_random=true # Random during training, center at inference
|
||||
```
|
||||
|
||||
#### Text Encoder Configuration
|
||||
|
||||
```bash
|
||||
# Use different CLIP text encoder model
|
||||
# same as vision: experiment with larger or smaller models depending on the
|
||||
# complexity of your tasks and size of dataset
|
||||
--policy.text_encoder_name=openai/clip-vit-large-patch14
|
||||
```
|
||||
|
||||
#### Learning Rate Configuration
|
||||
|
||||
The vision encoder uses a separate learning rate multiplier, where 1/10th is suggested to be the ideal staritng point:
|
||||
|
||||
```bash
|
||||
--policy.optimizer_lr=2e-5 \
|
||||
--policy.vision_encoder_lr_multiplier=0.1 # Vision encoder LR = 0.1 * optimizer_lr
|
||||
```
|
||||
|
||||
### Training Tuning Guidelines
|
||||
|
||||
#### 1. Flow Matching with Beta Sampling
|
||||
|
||||
The original diffusion implementation here is based on the work described in [TRI's LBM paper](https://arxiv.org/abs/2507.05331)
|
||||
|
||||
Additionally, we have implemented a flow-matching objective, which is described at a high-level in [Boston Dynamics blog post](https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/).
|
||||
|
||||
Consider testing the flow-matching objective and evaluating performance differences for your task:
|
||||
|
||||
```bash
|
||||
--policy.objective=flow_matching \
|
||||
--policy.timestep_sampling_strategy=beta \
|
||||
--policy.timestep_sampling_alpha=1.5 \
|
||||
--policy.timestep_sampling_beta=1.0 \
|
||||
--policy.timestep_sampling_s=0.999
|
||||
```
|
||||
|
||||
This hasn't been shown to be a silver bullet across every user case, but it occasionally results in smoother and more consistent actions.
|
||||
|
||||
#### 2. Number of Transformer Layers
|
||||
|
||||
Match model capacity to your dataset size:
|
||||
|
||||
- **Small datasets** (< 100 examples): Reduce to 4 layers
|
||||
- **Large datasets** (> 5k examples): Increase to 8 layers
|
||||
|
||||
#### 3. `horizon` Tuning
|
||||
|
||||
The model can be sensitive to the horizon you choose. Start with around a 1 second horizon based on your control frequency:
|
||||
|
||||
- **30 Hz frequency**: `horizon=30`
|
||||
- **10 Hz frequency**: `horizon=10`
|
||||
|
||||
Then experiment with increasing from there. The horizon determines how far into the future the model predicts actions.
|
||||
|
||||
#### 4. `n_action_steps` Sensitivity
|
||||
|
||||
The model can also be very sensitive to `n_action_steps`. Start with it being around 0.8 seconds based on your control frequency and tune from there:
|
||||
|
||||
- **Lower values**: More reactive but potentially less stable for long-horizon tasks
|
||||
- **Higher values**: Better for long-horizon execution but open-loop failures are limited in their recovery
|
||||
|
||||
### Inference Tuning
|
||||
|
||||
For faster inference, use DDIM with fewer sampling steps:
|
||||
|
||||
```bash
|
||||
--policy.noise_scheduler_type=DDIM \
|
||||
--policy.num_inference_steps=10
|
||||
```
|
||||
|
||||
### Resuming Training
|
||||
|
||||
To resume training from a checkpoint:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--config_path=./outputs/mutitask_dit_training/checkpoints/last/pretrained_model/train_config.json \
|
||||
--resume=true
|
||||
```
|
||||
|
||||
The checkpoint directory should contain `model.safetensors` and `config.json` files (saved automatically during training). When resuming, the configuration is loaded from the checkpoint, so you don't need to specify other parameters.
|
||||
|
||||
## Common Failure Modes and Debugging
|
||||
|
||||
Training these models can be finicky. Here are common failure modes and debugging approaches:
|
||||
|
||||
### Idling / No Motion
|
||||
|
||||
The model may "collapse" during inference, resulting in static or no motion. This can occur when:
|
||||
|
||||
1. **Insufficient training data**: If you only have 20-50 examples, try to roughly double your dataset size. Once you have above 300 examples, if you're still seeing this, the task may be too complex.
|
||||
|
||||
2. **Multiple similar tasks**: When your dataset contains multiple similar tasks (e.g., picking up 2 different objects), the model may rely too heavily on language conditioning which might not be rich enough.
|
||||
|
||||
**Debugging tips:**
|
||||
|
||||
- Increase dataset size (double until you get to over 300 examples)
|
||||
- Train for longer, up to 100k steps, even when the loss flatlines
|
||||
- Check if the model is receiving proper language instructions or increase diversity of instruction
|
||||
|
||||
### Executing the Wrong Task
|
||||
|
||||
Sometimes the robot will completely ignore your instruction and perform some other task. This generally only happens if you have trained on multiple tasks.
|
||||
|
||||
**Potential causes:**
|
||||
|
||||
- Language instruction ambiguity
|
||||
- Insufficient task-specific training data
|
||||
- Model confusion between similar tasks in the multitask dataset
|
||||
|
||||
**Debugging tips:**
|
||||
|
||||
- Verify language instruction specificity, especially if descriptions are similar between multiple tasks
|
||||
- Check task distribution in your training dataset and add weighting to the failing/ignored task
|
||||
- Consider task-specific fine-tuning
|
||||
|
||||
### Training Instability
|
||||
|
||||
If training loss is unstable or diverging:
|
||||
|
||||
- Try adjusting learning rate between `1e-5` and `3e-4`
|
||||
- Increase batch size if possible
|
||||
- Check that your dataset normalization is correct
|
||||
- Verify image preprocessing is working correctly
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### GPU Requirements
|
||||
|
||||
- **Inference**: At least an RTX 5070 Ti (or equivalent GPU) is recommended for reasonable speed performance
|
||||
- **Training**: A GPU with enough VRAM to load batch sizes of >64 is ideal, which will vary depending on the number of image observations, etc
|
||||
|
||||
### Batch Size Recommendations
|
||||
|
||||
- **Minimum**: 64 (less than this may result in unstable training)
|
||||
- **Recommended**: 256-320 (best performance, requires larger GPU)
|
||||
|
||||
## Example: Training on Custom Dataset
|
||||
|
||||
Here's a complete example training on a custom dataset:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=YOUR_DATASET \
|
||||
--output_dir=./outputs/mutitask_dit_training \
|
||||
--batch_size=320 \
|
||||
--steps=30000 \
|
||||
--save_freq=1000 \
|
||||
--log_freq=100 \
|
||||
--eval_freq=1000 \
|
||||
--policy.type=multi_task_dit \
|
||||
--policy.device=cuda \
|
||||
--policy.horizon=32 \
|
||||
--policy.n_action_steps=24 \
|
||||
--policy.objective=diffusion \
|
||||
--policy.noise_scheduler_type=DDPM \
|
||||
--policy.num_layers=6 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.vision_encoder_name=openai/clip-vit-base-patch16 \
|
||||
--policy.image_resize_shape=[320,240] \
|
||||
--policy.image_crop_shape=[224,224] \
|
||||
--policy.repo_id="HF_USER/multitask-dit-your-robot" \
|
||||
--wandb.enable=true \
|
||||
--wandb.project=multitask_dit
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
For more details on the technical implementation and architecture, see:
|
||||
|
||||
- [A Careful Examination of Large Behavior Models for Multitask Dexterous Manipulation](https://arxiv.org/abs/2507.05331)
|
||||
- [Large Behavior Models and Atlas Find New Footing](https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/)
|
||||
- [Dissecting and Open-Sourcing Multitask Diffusion Transformer Policy](https://brysonkjones.substack.com/p/dissecting-and-open-sourcing-multitask-diffusion-transformer-policy)
|
||||
@@ -34,11 +34,6 @@ As described by Physical Intelligence, while AI has achieved remarkable success
|
||||
pip install -e ".[pi]"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> For lerobot 0.4.0, if you want to install pi tag, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
|
||||
>
|
||||
> This will be solved in the next patch release
|
||||
|
||||
## Training Data and Capabilities
|
||||
|
||||
π₀ is trained on the largest robot interaction dataset to date, combining three key data sources:
|
||||
|
||||
@@ -36,11 +36,6 @@ This diverse training mixture creates a "curriculum" that enables generalization
|
||||
pip install -e ".[pi]"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> For lerobot 0.4.0, if you want to install pi tag, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
|
||||
>
|
||||
> This will be solved in the next patch release
|
||||
|
||||
## Usage
|
||||
|
||||
To use π₀.₅ in your LeRobot configuration, specify the policy type as:
|
||||
|
||||
@@ -43,11 +43,6 @@ This approach can transform **any existing VLM** into a VLA by training it to pr
|
||||
pip install -e ".[pi]"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> For lerobot 0.4.0, if you want to install the pi tag, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
|
||||
>
|
||||
> This will be solved in the next patch release
|
||||
|
||||
## Training a Custom FAST Tokenizer
|
||||
|
||||
You have two options for the FAST tokenizer:
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
# Multitask DiT Policy
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this work, please cite the following works:
|
||||
|
||||
```bibtex
|
||||
@misc{jones2025multitaskditpolicy,
|
||||
author = {Bryson Jones},
|
||||
title = {Dissecting and Open-Sourcing Multitask Diffusion Transformer Policy},
|
||||
year = {2025},
|
||||
url = {https://brysonkjones.substack.com/p/dissecting-and-open-sourcing-multitask-diffusion-transformer-policy},
|
||||
note = {Blog post}
|
||||
}
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@misc{trilbmteam2025carefulexaminationlargebehaviormodels,
|
||||
author = {TRI LBM Team},
|
||||
title = {A Careful Examination of Large Behavior Models for Multitask Dexterous Manipulation},
|
||||
year = {2025},
|
||||
eprint = {arXiv:2507.05331},
|
||||
archivePrefix = {arXiv},
|
||||
primaryClass = {cs.RO},
|
||||
url = {https://arxiv.org/abs/2507.05331}
|
||||
}
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@misc{bostondynamics2025largebehaviormodelsatlas,
|
||||
author = {Boston Dynamics and TRI Research Team},
|
||||
title = {Large Behavior Models and Atlas Find New Footing},
|
||||
year = {2025},
|
||||
url = {https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/},
|
||||
note = {Blog post}
|
||||
}
|
||||
```
|
||||
@@ -123,7 +123,7 @@ SSH into the robot and install LeRobot:
|
||||
```bash
|
||||
ssh unitree@<YOUR_ROBOT_IP>
|
||||
|
||||
conda create -y -n lerobot python=3.10
|
||||
conda create -y -n lerobot python=3.12
|
||||
conda activate lerobot
|
||||
git clone https://github.com/huggingface/lerobot.git
|
||||
cd lerobot
|
||||
@@ -153,7 +153,7 @@ With the robot server running, you can now control the robot remotely. Let's lau
|
||||
### Step 1: Install LeRobot on your machine
|
||||
|
||||
```bash
|
||||
conda create -y -n lerobot python=3.10
|
||||
conda create -y -n lerobot python=3.12
|
||||
conda activate lerobot
|
||||
git clone https://github.com/huggingface/lerobot.git
|
||||
cd lerobot
|
||||
|
||||
@@ -0,0 +1,490 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
SLURM-distributed SARM RA-BC annotation pipeline.
|
||||
|
||||
Computes SARM progress values for all frames in a dataset, distributed across
|
||||
SLURM workers, then merges the shards into a single sarm_progress.parquet.
|
||||
|
||||
Two subcommands, each a separate SLURM submission:
|
||||
|
||||
compute – N workers, each computes progress for a subset of episodes
|
||||
aggregate – 1 worker, merges N shards into sarm_progress.parquet, pushes to hub
|
||||
|
||||
Usage:
|
||||
python slurm_compute_rabc.py compute \\
|
||||
--repo-id user/dataset --reward-model-path user/sarm_model \\
|
||||
--stride 10 --device cpu --workers 50 --partition cpu
|
||||
|
||||
python slurm_compute_rabc.py aggregate \\
|
||||
--repo-id user/dataset --reward-model-path user/sarm_model \\
|
||||
--partition cpu --push-to-hub
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from datatrove.executor import LocalPipelineExecutor
|
||||
from datatrove.executor.slurm import SlurmPipelineExecutor
|
||||
from datatrove.pipeline.base import PipelineStep
|
||||
|
||||
|
||||
class ComputeProgressShards(PipelineStep):
|
||||
"""Each worker computes SARM progress for its assigned episodes."""
|
||||
|
||||
def __init__(
|
||||
self, repo_id, reward_model_path, stride=1, head_mode="sparse", device="cpu", shard_dir="rabc_shards"
|
||||
):
|
||||
super().__init__()
|
||||
if stride < 1:
|
||||
raise ValueError(f"stride must be >= 1, got {stride}")
|
||||
self.repo_id = repo_id
|
||||
self.reward_model_path = reward_model_path
|
||||
self.stride = stride
|
||||
self.head_mode = head_mode
|
||||
self.device = device
|
||||
self.shard_dir = shard_dir
|
||||
|
||||
def run(self, data=None, rank: int = 0, world_size: int = 1):
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from lerobot.policies.sarm.compute_rabc_weights import (
|
||||
generate_all_frame_indices,
|
||||
interpolate_progress,
|
||||
load_sarm_resources,
|
||||
)
|
||||
from lerobot.utils.utils import init_logging
|
||||
|
||||
init_logging()
|
||||
|
||||
dataset, reward_model, preprocess = load_sarm_resources(
|
||||
self.repo_id,
|
||||
self.reward_model_path,
|
||||
self.device,
|
||||
)
|
||||
|
||||
if hasattr(preprocess, "eval"):
|
||||
preprocess.eval()
|
||||
for step in preprocess.steps:
|
||||
if hasattr(step, "eval"):
|
||||
step.eval()
|
||||
|
||||
image_key = reward_model.config.image_key
|
||||
state_key = reward_model.config.state_key
|
||||
frame_gap = reward_model.config.frame_gap
|
||||
center_idx = reward_model.config.n_obs_steps // 2
|
||||
|
||||
dual_mode = reward_model.config.uses_dual_heads
|
||||
compute_sparse = self.head_mode in ("sparse", "both") or not dual_mode
|
||||
compute_dense = self.head_mode in ("dense", "both") and dual_mode
|
||||
|
||||
my_episodes = list(range(dataset.num_episodes))[rank::world_size]
|
||||
if not my_episodes:
|
||||
logging.info(f"Rank {rank}: no episodes assigned")
|
||||
return
|
||||
logging.info(f"Rank {rank}: {len(my_episodes)} / {dataset.num_episodes} episodes")
|
||||
|
||||
all_rows = []
|
||||
|
||||
for ep_idx in tqdm(my_episodes, desc=f"Rank {rank}"):
|
||||
ep = dataset.meta.episodes[ep_idx]
|
||||
ep_start, ep_end = ep["dataset_from_index"], ep["dataset_to_index"]
|
||||
task = dataset[ep_start].get("task", "perform the task")
|
||||
|
||||
all_ep_indices = generate_all_frame_indices(ep_start, ep_end, frame_gap)
|
||||
if self.stride > 1:
|
||||
compute_indices = [i for i in all_ep_indices if (i - ep_start) % self.stride == 0]
|
||||
if (ep_end - 1) not in compute_indices:
|
||||
compute_indices.append(ep_end - 1)
|
||||
compute_indices = sorted(set(compute_indices))
|
||||
else:
|
||||
compute_indices = all_ep_indices
|
||||
|
||||
frame_results = {}
|
||||
for qi in tqdm(compute_indices, desc=f" Ep {ep_idx}", leave=False):
|
||||
try:
|
||||
sample = dataset[qi]
|
||||
batch = {
|
||||
image_key: sample[image_key],
|
||||
"task": task,
|
||||
"index": qi,
|
||||
"episode_index": ep_idx,
|
||||
}
|
||||
if state_key in sample:
|
||||
batch[state_key] = sample[state_key]
|
||||
|
||||
with torch.no_grad():
|
||||
processed = preprocess(batch)
|
||||
vf = processed["video_features"].to(self.device)
|
||||
tf = processed["text_features"].to(self.device)
|
||||
sf = processed.get("state_features")
|
||||
if sf is not None:
|
||||
sf = sf.to(self.device)
|
||||
lengths = processed.get("lengths")
|
||||
|
||||
sparse_val = dense_val = np.nan
|
||||
if compute_sparse:
|
||||
r = reward_model.calculate_rewards(
|
||||
text_embeddings=tf,
|
||||
video_embeddings=vf,
|
||||
state_features=sf,
|
||||
lengths=lengths,
|
||||
return_all_frames=True,
|
||||
head_mode="sparse",
|
||||
)
|
||||
sparse_val = float(r[0, center_idx] if r.ndim == 2 else r[center_idx])
|
||||
if compute_dense:
|
||||
r = reward_model.calculate_rewards(
|
||||
text_embeddings=tf,
|
||||
video_embeddings=vf,
|
||||
state_features=sf,
|
||||
lengths=lengths,
|
||||
return_all_frames=True,
|
||||
head_mode="dense",
|
||||
)
|
||||
dense_val = float(r[0, center_idx] if r.ndim == 2 else r[center_idx])
|
||||
|
||||
frame_results[qi] = (sparse_val, dense_val)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed frame {qi}: {e}")
|
||||
|
||||
if not frame_results:
|
||||
logging.warning(f"Episode {ep_idx}: all frames failed, skipping")
|
||||
continue
|
||||
|
||||
# Interpolate to all frames in this episode
|
||||
computed_idx = np.array(sorted(frame_results.keys()))
|
||||
all_frame_arr = np.arange(ep_start, ep_end)
|
||||
|
||||
sparse_vals = np.array([frame_results[i][0] for i in computed_idx]) if compute_sparse else None
|
||||
dense_vals = np.array([frame_results[i][1] for i in computed_idx]) if compute_dense else None
|
||||
|
||||
if self.stride > 1 and len(computed_idx) > 1:
|
||||
if compute_sparse:
|
||||
sparse_vals = interpolate_progress(computed_idx, sparse_vals, all_frame_arr)
|
||||
if compute_dense:
|
||||
dense_vals = interpolate_progress(computed_idx, dense_vals, all_frame_arr)
|
||||
output_frames = all_frame_arr
|
||||
else:
|
||||
# Use only successfully computed frames to avoid indexing mismatch on failures
|
||||
output_frames = computed_idx
|
||||
|
||||
for i, fi in enumerate(output_frames):
|
||||
row = {"index": int(fi), "episode_index": ep_idx, "frame_index": int(fi - ep_start)}
|
||||
if compute_sparse:
|
||||
row["progress_sparse"] = float(sparse_vals[i])
|
||||
if compute_dense:
|
||||
row["progress_dense"] = float(dense_vals[i])
|
||||
all_rows.append(row)
|
||||
|
||||
if all_rows:
|
||||
import pandas as pd
|
||||
|
||||
df = pd.DataFrame(all_rows).sort_values("index").reset_index(drop=True)
|
||||
table = pa.Table.from_pandas(df, preserve_index=False)
|
||||
table = table.replace_schema_metadata({b"reward_model_path": self.reward_model_path.encode()})
|
||||
shard_dir = Path(self.shard_dir)
|
||||
shard_dir.mkdir(parents=True, exist_ok=True)
|
||||
out = shard_dir / f"shard_{rank:05d}.parquet"
|
||||
pq.write_table(table, out)
|
||||
logging.info(f"Rank {rank}: saved {len(df)} rows to {out}")
|
||||
|
||||
|
||||
class AggregateProgress(PipelineStep):
|
||||
"""Merge all shard parquets into final sarm_progress.parquet."""
|
||||
|
||||
def __init__(self, repo_id, reward_model_path, shard_dir="rabc_shards", push_to_hub=False):
|
||||
super().__init__()
|
||||
self.repo_id = repo_id
|
||||
self.reward_model_path = reward_model_path
|
||||
self.shard_dir = shard_dir
|
||||
self.push_to_hub = push_to_hub
|
||||
|
||||
def run(self, data=None, rank: int = 0, world_size: int = 1):
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.utils.utils import init_logging
|
||||
|
||||
init_logging()
|
||||
if rank != 0:
|
||||
return
|
||||
|
||||
shard_dir = Path(self.shard_dir)
|
||||
shards = sorted(shard_dir.glob("shard_*.parquet"))
|
||||
if not shards:
|
||||
raise FileNotFoundError(f"No shards found in {shard_dir}")
|
||||
|
||||
# Log shard modification time range to help detect stale files
|
||||
mtimes = [os.path.getmtime(s) for s in shards]
|
||||
oldest = datetime.datetime.fromtimestamp(min(mtimes)).isoformat(timespec="seconds")
|
||||
newest = datetime.datetime.fromtimestamp(max(mtimes)).isoformat(timespec="seconds")
|
||||
logging.info(f"Aggregating {len(shards)} shards (oldest: {oldest}, newest: {newest})")
|
||||
|
||||
df = pd.concat([pd.read_parquet(s) for s in shards], ignore_index=True)
|
||||
df = df.sort_values("index").reset_index(drop=True)
|
||||
|
||||
table = pa.Table.from_pandas(df, preserve_index=False)
|
||||
table = table.replace_schema_metadata({b"reward_model_path": self.reward_model_path.encode()})
|
||||
|
||||
temp_ds = LeRobotDataset(self.repo_id, download_videos=False)
|
||||
out_path = Path(temp_ds.root) / "sarm_progress.parquet"
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pq.write_table(table, out_path)
|
||||
logging.info(f"Saved {len(df)} rows to {out_path}")
|
||||
|
||||
for col in ["progress_sparse", "progress_dense"]:
|
||||
if col in df.columns:
|
||||
v = df[col].dropna()
|
||||
logging.info(
|
||||
f"{col}: mean={v.mean():.4f} std={v.std():.4f} min={v.min():.4f} max={v.max():.4f}"
|
||||
)
|
||||
|
||||
if self.push_to_hub:
|
||||
from huggingface_hub import HfApi
|
||||
|
||||
api = HfApi()
|
||||
hub_path = "sarm_progress.parquet"
|
||||
logging.info(f"Uploading to {self.repo_id}/{hub_path}")
|
||||
api.upload_file(
|
||||
path_or_fileobj=str(out_path),
|
||||
path_in_repo=hub_path,
|
||||
repo_id=self.repo_id,
|
||||
repo_type="dataset",
|
||||
)
|
||||
logging.info(f"Uploaded: https://huggingface.co/datasets/{self.repo_id}/blob/main/{hub_path}")
|
||||
|
||||
|
||||
def make_compute_executor(
|
||||
repo_id,
|
||||
reward_model_path,
|
||||
stride,
|
||||
head_mode,
|
||||
device,
|
||||
shard_dir,
|
||||
logs_dir,
|
||||
job_name,
|
||||
slurm,
|
||||
workers,
|
||||
partition,
|
||||
cpus_per_task,
|
||||
mem_per_cpu,
|
||||
):
|
||||
kwargs = {
|
||||
"pipeline": [
|
||||
ComputeProgressShards(repo_id, reward_model_path, stride, head_mode, device, str(shard_dir)),
|
||||
],
|
||||
"logging_dir": str(logs_dir / job_name),
|
||||
}
|
||||
|
||||
if slurm:
|
||||
kwargs.update(
|
||||
{
|
||||
"job_name": job_name,
|
||||
"tasks": workers,
|
||||
"workers": workers,
|
||||
"time": "24:00:00",
|
||||
"partition": partition,
|
||||
"cpus_per_task": cpus_per_task,
|
||||
"sbatch_args": {"mem-per-cpu": mem_per_cpu},
|
||||
}
|
||||
)
|
||||
return SlurmPipelineExecutor(**kwargs)
|
||||
|
||||
kwargs.update({"tasks": workers, "workers": 1})
|
||||
return LocalPipelineExecutor(**kwargs)
|
||||
|
||||
|
||||
def make_aggregate_executor(
|
||||
repo_id,
|
||||
reward_model_path,
|
||||
shard_dir,
|
||||
logs_dir,
|
||||
job_name,
|
||||
slurm,
|
||||
partition,
|
||||
cpus_per_task,
|
||||
mem_per_cpu,
|
||||
push_to_hub,
|
||||
):
|
||||
kwargs = {
|
||||
"pipeline": [
|
||||
AggregateProgress(repo_id, reward_model_path, str(shard_dir), push_to_hub),
|
||||
],
|
||||
"logging_dir": str(logs_dir / job_name),
|
||||
}
|
||||
|
||||
if slurm:
|
||||
kwargs.update(
|
||||
{
|
||||
"job_name": job_name,
|
||||
"tasks": 1,
|
||||
"workers": 1,
|
||||
"time": "02:00:00",
|
||||
"partition": partition,
|
||||
"cpus_per_task": cpus_per_task,
|
||||
"sbatch_args": {"mem-per-cpu": mem_per_cpu},
|
||||
}
|
||||
)
|
||||
return SlurmPipelineExecutor(**kwargs)
|
||||
|
||||
kwargs.update({"tasks": 1, "workers": 1})
|
||||
return LocalPipelineExecutor(**kwargs)
|
||||
|
||||
|
||||
def _add_shared_args(p):
|
||||
p.add_argument(
|
||||
"--repo-id",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Hugging Face repository identifier, e.g. 'user/dataset'.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--shard-dir",
|
||||
type=Path,
|
||||
default=Path("rabc_shards"),
|
||||
help="Directory to read/write per-rank parquet shards.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--logs-dir",
|
||||
type=Path,
|
||||
default=Path("logs"),
|
||||
help="Directory for datatrove logs.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--job-name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="SLURM job name (defaults to rabc_<subcommand>).",
|
||||
)
|
||||
p.add_argument(
|
||||
"--slurm",
|
||||
type=int,
|
||||
default=1,
|
||||
help="1 = submit via SLURM; 0 = run locally (useful for debugging).",
|
||||
)
|
||||
p.add_argument(
|
||||
"--partition",
|
||||
type=str,
|
||||
default=None,
|
||||
help="SLURM partition to submit to.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--cpus-per-task",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Number of CPUs per SLURM task.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--mem-per-cpu",
|
||||
type=str,
|
||||
default="4G",
|
||||
help="Memory per CPU, e.g. '4G' or '1950M'.",
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="SLURM-distributed SARM RA-BC annotation pipeline",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
sub = parser.add_subparsers(dest="command", required=True)
|
||||
|
||||
# compute subcommand
|
||||
cp = sub.add_parser(
|
||||
"compute",
|
||||
help="Distribute progress computation across SLURM workers.",
|
||||
)
|
||||
_add_shared_args(cp)
|
||||
cp.add_argument(
|
||||
"--reward-model-path",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path or HF repo id of the SARM reward model.",
|
||||
)
|
||||
cp.add_argument(
|
||||
"--stride",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Compute every Nth frame; intermediate frames are interpolated (must be >= 1).",
|
||||
)
|
||||
cp.add_argument(
|
||||
"--head-mode",
|
||||
type=str,
|
||||
default="sparse",
|
||||
choices=["sparse", "dense", "both"],
|
||||
help="Which reward head(s) to compute.",
|
||||
)
|
||||
cp.add_argument(
|
||||
"--device",
|
||||
type=str,
|
||||
default="cpu",
|
||||
help="Device for reward model inference, e.g. 'cpu' or 'cuda'.",
|
||||
)
|
||||
cp.add_argument(
|
||||
"--workers",
|
||||
type=int,
|
||||
default=50,
|
||||
help="Number of parallel SLURM tasks (one shard per worker).",
|
||||
)
|
||||
|
||||
# aggregate subcommand
|
||||
ap = sub.add_parser(
|
||||
"aggregate",
|
||||
help="Merge per-rank shards into a single sarm_progress.parquet.",
|
||||
)
|
||||
_add_shared_args(ap)
|
||||
ap.add_argument(
|
||||
"--reward-model-path",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path or HF repo id of the SARM reward model (stored in parquet metadata).",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--push-to-hub",
|
||||
action="store_true",
|
||||
help="Upload sarm_progress.parquet to the Hugging Face Hub after aggregation.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
job_name = args.job_name or f"rabc_{args.command}"
|
||||
kwargs = vars(args)
|
||||
kwargs["slurm"] = kwargs.pop("slurm") == 1
|
||||
kwargs["job_name"] = job_name
|
||||
command = kwargs.pop("command")
|
||||
|
||||
executor = make_compute_executor(**kwargs) if command == "compute" else make_aggregate_executor(**kwargs)
|
||||
|
||||
executor.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+34
-23
@@ -29,7 +29,7 @@ version = "0.4.5"
|
||||
description = "🤗 LeRobot: State-of-the-art Machine Learning for Real-World Robotics in Pytorch"
|
||||
dynamic = ["readme"]
|
||||
license = { text = "Apache-2.0" }
|
||||
requires-python = ">=3.10"
|
||||
requires-python = ">=3.12"
|
||||
authors = [
|
||||
{ name = "Rémi Cadène", email = "re.cadene@gmail.com" },
|
||||
{ name = "Simon Alibert", email = "alibert.sim@gmail.com" },
|
||||
@@ -50,7 +50,8 @@ classifiers = [
|
||||
"Intended Audience :: Education",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Topic :: Software Development :: Build Tools",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
]
|
||||
@@ -61,26 +62,28 @@ dependencies = [
|
||||
# Hugging Face dependencies
|
||||
"datasets>=4.0.0,<5.0.0",
|
||||
"diffusers>=0.27.2,<0.36.0",
|
||||
"huggingface-hub[cli]>=1.0.0,<2.0.0",
|
||||
"huggingface-hub>=1.0.0,<2.0.0",
|
||||
"accelerate>=1.10.0,<2.0.0",
|
||||
|
||||
# Core dependencies
|
||||
"numpy>=2.0.0,<2.3.0", # NOTE: Explicitly listing numpy helps the resolver converge faster. Upper bound imposed by opencv-python-headless.
|
||||
"setuptools>=71.0.0,<81.0.0",
|
||||
"cmake>=3.29.0.1,<4.2.0",
|
||||
"packaging>=24.2,<26.0",
|
||||
|
||||
"torch>=2.2.1,<2.11.0",
|
||||
"torchcodec>=0.2.1,<0.11.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')",
|
||||
"torchvision>=0.21.0,<0.26.0",
|
||||
|
||||
"einops>=0.8.0,<0.9.0",
|
||||
"opencv-python-headless>=4.9.0,<4.13.0",
|
||||
"av>=15.0.0,<16.0.0",
|
||||
"jsonlines>=4.0.0,<5.0.0",
|
||||
"packaging>=24.2,<26.0",
|
||||
"pynput>=1.7.7,<1.9.0",
|
||||
"pynput>=1.7.8,<1.9.0",
|
||||
"pyserial>=3.5,<4.0",
|
||||
|
||||
"wandb>=0.24.0,<0.25.0",
|
||||
|
||||
"torch>=2.2.1,<2.11.0", # TODO: Bump dependency
|
||||
"torchcodec>=0.2.1,<0.11.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bump dependency
|
||||
"torchvision>=0.21.0,<0.26.0", # TODO: Bump dependency
|
||||
|
||||
"draccus==0.10.0", # TODO: Remove ==
|
||||
"draccus==0.10.0", # TODO: Relax version constraint
|
||||
"gymnasium>=1.1.1,<2.0.0",
|
||||
"rerun-sdk>=0.24.0,<0.27.0",
|
||||
|
||||
@@ -95,13 +98,14 @@ dependencies = [
|
||||
|
||||
# Common
|
||||
pygame-dep = ["pygame>=2.5.1,<2.7.0"]
|
||||
placo-dep = ["placo>=0.9.6,<0.10.0"]
|
||||
placo-dep = ["placo>=0.9.6,<0.9.17"]
|
||||
transformers-dep = ["transformers>=5.3.0,<6.0.0"]
|
||||
grpcio-dep = ["grpcio==1.73.1", "protobuf>=6.31.1,<6.32.0"]
|
||||
can-dep = ["python-can>=4.2.0,<5.0.0"]
|
||||
peft-dep = ["peft>=0.18.0,<1.0.0"]
|
||||
scipy-dep = ["scipy>=1.14.0,<2.0.0"]
|
||||
qwen-vl-utils-dep = ["qwen-vl-utils>=0.0.11,<0.1.0"]
|
||||
matplotlib-dep = ["matplotlib>=3.10.3,<4.0.0", "contourpy>=1.3.0,<2.0.0"] # NOTE: Explicitly listing contourpy helps the resolver converge faster.
|
||||
|
||||
# Motors
|
||||
feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0"]
|
||||
@@ -119,16 +123,16 @@ unitree_g1 = [
|
||||
"onnxruntime>=1.16.0,<2.0.0",
|
||||
"pin>=3.0.0,<4.0.0",
|
||||
"meshcat>=0.3.0,<0.4.0",
|
||||
"matplotlib>=3.9.0,<4.0.0",
|
||||
"lerobot[matplotlib-dep]",
|
||||
"casadi>=3.6.0,<4.0.0",
|
||||
]
|
||||
reachy2 = ["reachy2_sdk>=1.0.15,<1.1.0"]
|
||||
kinematics = ["lerobot[placo-dep]"]
|
||||
intelrealsense = [
|
||||
"pyrealsense2>=2.55.1.6486,<2.57.0 ; sys_platform != 'darwin'",
|
||||
"pyrealsense2-macosx>=2.54,<2.55.0 ; sys_platform == 'darwin'",
|
||||
"pyrealsense2-macosx>=2.54,<2.57.0 ; sys_platform == 'darwin'",
|
||||
]
|
||||
phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0", "fastapi<1.0"]
|
||||
phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0", "fastapi<1.0", "lerobot[scipy-dep]"]
|
||||
|
||||
# Policies
|
||||
wallx = [
|
||||
@@ -140,6 +144,7 @@ wallx = [
|
||||
]
|
||||
pi = ["lerobot[transformers-dep]", "lerobot[scipy-dep]"]
|
||||
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"]
|
||||
multi_task_dit = ["lerobot[transformers-dep]"]
|
||||
groot = [
|
||||
"lerobot[transformers-dep]",
|
||||
"lerobot[peft]",
|
||||
@@ -151,12 +156,12 @@ groot = [
|
||||
"ninja>=1.11.1,<2.0.0",
|
||||
"flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
|
||||
]
|
||||
sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "matplotlib>=3.10.3,<4.0.0", "lerobot[qwen-vl-utils-dep]"]
|
||||
sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
xvla = ["lerobot[transformers-dep]"]
|
||||
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
|
||||
|
||||
# Features
|
||||
async = ["lerobot[grpcio-dep]", "matplotlib>=3.10.3,<4.0.0"]
|
||||
async = ["lerobot[grpcio-dep]", "lerobot[matplotlib-dep]"]
|
||||
peft = ["lerobot[transformers-dep]", "lerobot[peft-dep]"]
|
||||
|
||||
# Development
|
||||
@@ -165,13 +170,19 @@ test = ["pytest>=8.1.0,<9.0.0", "pytest-timeout>=2.4.0,<3.0.0", "pytest-cov>=5.0
|
||||
video_benchmark = ["scikit-image>=0.23.2,<0.26.0", "pandas>=2.2.2,<2.4.0"]
|
||||
|
||||
# Simulation
|
||||
aloha = ["gym-aloha>=0.1.2,<0.2.0"]
|
||||
# NOTE: Explicitly listing scipy helps flatten the dependecy tree.
|
||||
aloha = ["gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
|
||||
pusht = ["gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
|
||||
libero = ["lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0"]
|
||||
metaworld = ["metaworld==3.0.0"]
|
||||
libero = ["lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
|
||||
metaworld = ["metaworld==3.0.0", "lerobot[scipy-dep]"]
|
||||
|
||||
# All
|
||||
all = [
|
||||
# NOTE(resolver hint): scipy is pulled in transitively via lerobot[scipy-dep] through
|
||||
# multiple extras (aloha, metaworld, pi, wallx, phone). Listing it explicitly
|
||||
# helps pip's resolver converge by constraining scipy early, before it encounters
|
||||
# the loose scipy requirements from transitive deps like dm-control and metaworld.
|
||||
"scipy>=1.14.0,<2.0.0",
|
||||
"lerobot[dynamixel]",
|
||||
"lerobot[gamepad]",
|
||||
"lerobot[hopejr]",
|
||||
@@ -192,7 +203,7 @@ all = [
|
||||
"lerobot[aloha]",
|
||||
"lerobot[pusht]",
|
||||
"lerobot[phone]",
|
||||
"lerobot[libero]",
|
||||
"lerobot[libero]; sys_platform == 'linux'",
|
||||
"lerobot[metaworld]",
|
||||
"lerobot[sarm]",
|
||||
"lerobot[peft]",
|
||||
@@ -224,7 +235,7 @@ lerobot = ["envs/*.json"]
|
||||
where = ["src"]
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
target-version = "py312"
|
||||
line-length = 110
|
||||
exclude = ["tests/artifacts/**/*.safetensors", "*_pb2.py", "*_pb2_grpc.py"]
|
||||
|
||||
@@ -316,7 +327,7 @@ default.extend-ignore-identifiers-re = [
|
||||
# Uncomment [tool.mypy] first, then uncomment individual module overrides as they get proper type annotations
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.10"
|
||||
python_version = "3.12"
|
||||
ignore_missing_imports = true
|
||||
follow_imports = "skip"
|
||||
# warn_return_any = true
|
||||
|
||||
@@ -49,9 +49,14 @@ import torch
|
||||
|
||||
from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig # noqa: F401
|
||||
from lerobot.cameras.realsense.configuration_realsense import RealSenseCameraConfig # noqa: F401
|
||||
from lerobot.robots import (
|
||||
RobotConfig, # noqa: F401
|
||||
from lerobot.robots import ( # noqa: F401
|
||||
Robot,
|
||||
RobotConfig,
|
||||
bi_so_follower,
|
||||
koch_follower,
|
||||
make_robot_from_config,
|
||||
omx_follower,
|
||||
so_follower,
|
||||
)
|
||||
from lerobot.transport import (
|
||||
services_pb2, # type: ignore
|
||||
|
||||
@@ -21,7 +21,7 @@ from collections import deque
|
||||
from collections.abc import Iterable, Iterator
|
||||
from pathlib import Path
|
||||
from pprint import pformat
|
||||
from typing import Any, Generic, TypeVar
|
||||
from typing import Any
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
@@ -78,8 +78,6 @@ DEFAULT_FEATURES = {
|
||||
"task_index": {"dtype": "int64", "shape": (1,), "names": None},
|
||||
}
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def get_parquet_file_size_in_mb(parquet_path: str | Path) -> float:
|
||||
metadata = pq.read_metadata(parquet_path)
|
||||
@@ -1234,7 +1232,7 @@ class LookAheadError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Backtrackable(Generic[T]):
|
||||
class Backtrackable[T]:
|
||||
"""
|
||||
Wrap any iterator/iterable so you can step back up to `history` items
|
||||
and look ahead up to `lookahead` items.
|
||||
|
||||
@@ -29,7 +29,7 @@ from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from functools import cached_property
|
||||
from pprint import pformat
|
||||
from typing import Protocol, TypeAlias
|
||||
from typing import Protocol
|
||||
|
||||
import serial
|
||||
from deepdiff import DeepDiff
|
||||
@@ -38,8 +38,8 @@ from tqdm import tqdm
|
||||
from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
|
||||
from lerobot.utils.utils import enter_pressed, move_cursor_up
|
||||
|
||||
NameOrID: TypeAlias = str | int
|
||||
Value: TypeAlias = int | float
|
||||
type NameOrID = str | int
|
||||
type Value = int | float
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -1277,4 +1277,4 @@ class SerialMotorsBus(MotorsBusBase):
|
||||
|
||||
|
||||
# Backward compatibility alias
|
||||
MotorsBus: TypeAlias = SerialMotorsBus
|
||||
MotorsBus = SerialMotorsBus
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
from .act.configuration_act import ACTConfig as ACTConfig
|
||||
from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig
|
||||
from .groot.configuration_groot import GrootConfig as GrootConfig
|
||||
from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig as MultiTaskDiTConfig
|
||||
from .pi0.configuration_pi0 import PI0Config as PI0Config
|
||||
from .pi0_fast.configuration_pi0_fast import PI0FastConfig as PI0FastConfig
|
||||
from .pi05.configuration_pi05 import PI05Config as PI05Config
|
||||
@@ -28,6 +29,7 @@ from .xvla.configuration_xvla import XVLAConfig as XVLAConfig
|
||||
__all__ = [
|
||||
"ACTConfig",
|
||||
"DiffusionConfig",
|
||||
"MultiTaskDiTConfig",
|
||||
"PI0Config",
|
||||
"PI05Config",
|
||||
"PI0FastConfig",
|
||||
|
||||
@@ -18,10 +18,9 @@ from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, TypedDict, Unpack
|
||||
|
||||
import torch
|
||||
from typing_extensions import Unpack
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import FeatureType
|
||||
@@ -32,6 +31,7 @@ from lerobot.envs.utils import env_to_policy_features
|
||||
from lerobot.policies.act.configuration_act import ACTConfig
|
||||
from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
|
||||
from lerobot.policies.groot.configuration_groot import GrootConfig
|
||||
from lerobot.policies.multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from lerobot.policies.pi0.configuration_pi0 import PI0Config
|
||||
from lerobot.policies.pi05.configuration_pi05 import PI05Config
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
@@ -67,8 +67,7 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
|
||||
Args:
|
||||
name: The name of the policy. Supported names are "tdmpc", "diffusion", "act",
|
||||
"vqbet", "pi0", "pi05", "sac", "reward_classifier", "smolvla", "wall_x".
|
||||
|
||||
"multi_task_dit", "vqbet", "pi0", "pi05", "sac", "reward_classifier", "smolvla", "wall_x".
|
||||
Returns:
|
||||
The policy class corresponding to the given name.
|
||||
|
||||
@@ -87,6 +86,10 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
from lerobot.policies.act.modeling_act import ACTPolicy
|
||||
|
||||
return ACTPolicy
|
||||
elif name == "multi_task_dit":
|
||||
from lerobot.policies.multi_task_dit.modeling_multi_task_dit import MultiTaskDiTPolicy
|
||||
|
||||
return MultiTaskDiTPolicy
|
||||
elif name == "vqbet":
|
||||
from lerobot.policies.vqbet.modeling_vqbet import VQBeTPolicy
|
||||
|
||||
@@ -147,8 +150,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
|
||||
|
||||
Args:
|
||||
policy_type: The type of the policy. Supported types include "tdmpc",
|
||||
"diffusion", "act", "vqbet", "pi0", "pi05", "sac", "smolvla",
|
||||
"reward_classifier", "wall_x".
|
||||
"multi_task_dit", "diffusion", "act", "vqbet", "pi0", "pi05", "sac",
|
||||
"smolvla", "reward_classifier", "wall_x".
|
||||
**kwargs: Keyword arguments to be passed to the configuration class constructor.
|
||||
|
||||
Returns:
|
||||
@@ -163,6 +166,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
|
||||
return DiffusionConfig(**kwargs)
|
||||
elif policy_type == "act":
|
||||
return ACTConfig(**kwargs)
|
||||
elif policy_type == "multi_task_dit":
|
||||
return MultiTaskDiTConfig(**kwargs)
|
||||
elif policy_type == "vqbet":
|
||||
return VQBeTConfig(**kwargs)
|
||||
elif policy_type == "pi0":
|
||||
@@ -309,6 +314,16 @@ def make_pre_post_processors(
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
elif isinstance(policy_cfg, MultiTaskDiTConfig):
|
||||
from lerobot.policies.multi_task_dit.processor_multi_task_dit import (
|
||||
make_multi_task_dit_pre_post_processors,
|
||||
)
|
||||
|
||||
processors = make_multi_task_dit_pre_post_processors(
|
||||
config=policy_cfg,
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
elif isinstance(policy_cfg, VQBeTConfig):
|
||||
from lerobot.policies.vqbet.processor_vqbet import make_vqbet_pre_post_processors
|
||||
|
||||
|
||||
@@ -4,10 +4,9 @@
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
|
||||
from typing import Optional
|
||||
|
||||
from transformers.image_processing_utils import (
|
||||
BatchFeature,
|
||||
get_patch_output_size,
|
||||
@@ -165,11 +164,11 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
def _resize_for_patching(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
image: torch.Tensor,
|
||||
target_resolution: tuple,
|
||||
interpolation: "F.InterpolationMode",
|
||||
interpolation: F.InterpolationMode,
|
||||
input_data_format: ChannelDimension,
|
||||
) -> "torch.Tensor":
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Resizes an image to a target resolution while maintaining aspect ratio.
|
||||
|
||||
@@ -219,8 +218,8 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
|
||||
return best_ratio
|
||||
|
||||
def _pad_for_patching(
|
||||
self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
|
||||
) -> "torch.Tensor":
|
||||
self, image: torch.Tensor, target_resolution: tuple, input_data_format: ChannelDimension
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Pad an image to a target resolution while maintaining aspect ratio.
|
||||
"""
|
||||
@@ -236,15 +235,15 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
def _get_image_patches(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
image: torch.Tensor,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
size: tuple,
|
||||
tile_size: int,
|
||||
use_thumbnail: bool,
|
||||
interpolation: "F.InterpolationMode",
|
||||
interpolation: F.InterpolationMode,
|
||||
pad_during_tiling: bool,
|
||||
) -> list["torch.Tensor"]:
|
||||
) -> list[torch.Tensor]:
|
||||
image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)
|
||||
orig_height, orig_width = image_size
|
||||
aspect_ratio = orig_width / orig_height
|
||||
@@ -305,8 +304,8 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
def _pad_for_batching(
|
||||
self,
|
||||
pixel_values: list["torch.Tensor"],
|
||||
) -> list["torch.Tensor"]:
|
||||
pixel_values: list[torch.Tensor],
|
||||
) -> list[torch.Tensor]:
|
||||
"""
|
||||
Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.
|
||||
|
||||
@@ -327,14 +326,14 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
images: list[torch.Tensor],
|
||||
do_resize: bool,
|
||||
size: SizeDict,
|
||||
max_dynamic_tiles: int,
|
||||
min_dynamic_tiles: int,
|
||||
use_thumbnail: bool,
|
||||
pad_during_tiling: bool,
|
||||
interpolation: Optional["F.InterpolationMode"],
|
||||
interpolation: F.InterpolationMode | None,
|
||||
do_center_crop: bool,
|
||||
crop_size: SizeDict,
|
||||
do_rescale: bool,
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
# Multitask DiT Policy
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this work, please cite the following works:
|
||||
|
||||
```bibtex
|
||||
@misc{jones2025multitaskditpolicy,
|
||||
author = {Bryson Jones},
|
||||
title = {Dissecting and Open-Sourcing Multitask Diffusion Transformer Policy},
|
||||
year = {2025},
|
||||
url = {https://brysonkjones.substack.com/p/dissecting-and-open-sourcing-multitask-diffusion-transformer-policy},
|
||||
note = {Blog post}
|
||||
}
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@misc{trilbmteam2025carefulexaminationlargebehaviormodels,
|
||||
author = {TRI LBM Team},
|
||||
title = {A Careful Examination of Large Behavior Models for Multitask Dexterous Manipulation},
|
||||
year = {2025},
|
||||
eprint = {arXiv:2507.05331},
|
||||
archivePrefix = {arXiv},
|
||||
primaryClass = {cs.RO},
|
||||
url = {https://arxiv.org/abs/2507.05331}
|
||||
}
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@misc{bostondynamics2025largebehaviormodelsatlas,
|
||||
author = {Boston Dynamics and TRI Research Team},
|
||||
title = {Large Behavior Models and Atlas Find New Footing},
|
||||
year = {2025},
|
||||
url = {https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/},
|
||||
note = {Blog post}
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from .modeling_multi_task_dit import MultiTaskDiTPolicy
|
||||
from .processor_multi_task_dit import make_multi_task_dit_pre_post_processors
|
||||
|
||||
__all__ = ["MultiTaskDiTConfig", "MultiTaskDiTPolicy", "make_multi_task_dit_pre_post_processors"]
|
||||
@@ -0,0 +1,256 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import NormalizationMode
|
||||
from lerobot.optim.optimizers import AdamConfig
|
||||
from lerobot.optim.schedulers import DiffuserSchedulerConfig
|
||||
|
||||
|
||||
@PreTrainedConfig.register_subclass("multi_task_dit")
|
||||
@dataclass
|
||||
class MultiTaskDiTConfig(PreTrainedConfig):
|
||||
"""Configuration for the Multi-Task Diffusion Transformer (DiT) policy.
|
||||
|
||||
A transformer-based policy that supports both diffusion and flow matching objectives
|
||||
for multi-task robot learning with text and vision conditioning.
|
||||
"""
|
||||
|
||||
n_obs_steps: int = 2 # Number of observation steps for temporal context
|
||||
horizon: int = 32 # Number of action steps to predict
|
||||
n_action_steps: int = 24 # Actions executed per policy call (~0.8s at 30Hz)
|
||||
|
||||
# Objective Selection
|
||||
objective: str = "diffusion" # "diffusion" or "flow_matching"
|
||||
|
||||
# --- Diffusion-specific (used when objective="diffusion") ---
|
||||
noise_scheduler_type: str = "DDPM" # "DDPM" or "DDIM"
|
||||
num_train_timesteps: int = 100 # Number of diffusion timesteps
|
||||
beta_schedule: str = "squaredcos_cap_v2" # Noise schedule type
|
||||
beta_start: float = 0.0001 # Starting noise level
|
||||
beta_end: float = 0.02 # Ending noise level
|
||||
prediction_type: str = "epsilon" # "epsilon" (predict noise) or "sample" (predict clean)
|
||||
clip_sample: bool = True # Clip samples during denoising
|
||||
clip_sample_range: float = 1.0 # Clipping range [-x, x]
|
||||
num_inference_steps: int | None = None # Denoising steps at inference (defaults to num_train_timesteps)
|
||||
|
||||
# --- Flow Matching-specific (used when objective="flow_matching") ---
|
||||
sigma_min: float = 0.0 # Minimum noise in flow interpolation path
|
||||
num_integration_steps: int = 100 # ODE integration steps at inference
|
||||
integration_method: str = "euler" # ODE solver: "euler" or "rk4"
|
||||
timestep_sampling_strategy: str = "beta" # "uniform" or "beta"
|
||||
|
||||
timestep_sampling_s: float = 0.999 # (beta only) Max timestep threshold
|
||||
timestep_sampling_alpha: float = 1.5 # (beta only) Beta distribution alpha
|
||||
timestep_sampling_beta: float = 1.0 # (beta only) Beta distribution beta
|
||||
|
||||
# Transformer Architecture
|
||||
hidden_dim: int = 512 # Transformer hidden dimension
|
||||
num_layers: int = 6 # Number of transformer layers
|
||||
num_heads: int = 8 # Number of attention heads
|
||||
dropout: float = 0.1 # Dropout rate
|
||||
use_positional_encoding: bool = False # Use absolute positional encoding
|
||||
timestep_embed_dim: int = 256 # Timestep embedding dimension
|
||||
use_rope: bool = True # Use Rotary Position Embedding
|
||||
rope_base: float = 10000.0 # RoPE base frequency
|
||||
|
||||
# Vision Encoder (CLIP)
|
||||
vision_encoder_name: str = "openai/clip-vit-base-patch16" # HuggingFace CLIP model
|
||||
use_separate_rgb_encoder_per_camera: bool = False # Separate encoder per camera view
|
||||
vision_encoder_lr_multiplier: float = 0.1 # LR multiplier for vision encoder
|
||||
image_resize_shape: tuple[int, int] | None = None # Resize images before crop
|
||||
image_crop_shape: tuple[int, int] | None = (224, 224) # Crop shape (CLIP default)
|
||||
image_crop_is_random: bool = True # Random crop during training, center at inference
|
||||
|
||||
# Text Encoder (CLIP)
|
||||
text_encoder_name: str = "openai/clip-vit-base-patch16" # HuggingFace CLIP model
|
||||
tokenizer_max_length: int = 77 # Max length for tokenized text (CLIP default is 77)
|
||||
tokenizer_padding: str = "max_length" # Padding strategy: "max_length" or "longest"
|
||||
tokenizer_padding_side: str = "right" # Padding side: "left" or "right"
|
||||
tokenizer_truncation: bool = True # Whether to truncate sequences longer than max_length
|
||||
|
||||
# Normalization
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
"VISUAL": NormalizationMode.MEAN_STD,
|
||||
"STATE": NormalizationMode.MIN_MAX,
|
||||
"ACTION": NormalizationMode.MIN_MAX,
|
||||
}
|
||||
)
|
||||
|
||||
# Training/Optimizer
|
||||
optimizer_lr: float = 2e-5
|
||||
optimizer_betas: tuple = (0.95, 0.999)
|
||||
optimizer_eps: float = 1e-8
|
||||
optimizer_weight_decay: float = 0.0
|
||||
scheduler_name: str = "cosine"
|
||||
scheduler_warmup_steps: int = 0
|
||||
do_mask_loss_for_padding: bool = False
|
||||
|
||||
# Auto-calculated
|
||||
drop_n_last_frames: int | None = None
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
|
||||
if self.drop_n_last_frames is None:
|
||||
self.drop_n_last_frames = self.horizon - self.n_action_steps - self.n_obs_steps + 1
|
||||
|
||||
self._validate()
|
||||
|
||||
def _validate(self):
|
||||
"""Validate configuration parameters."""
|
||||
# Objective validation
|
||||
if self.objective not in ["diffusion", "flow_matching"]:
|
||||
raise ValueError(f"objective must be 'diffusion' or 'flow_matching', got '{self.objective}'")
|
||||
|
||||
# Transformer validation
|
||||
if self.hidden_dim <= 0:
|
||||
raise ValueError("hidden_dim must be positive")
|
||||
if self.num_layers <= 0:
|
||||
raise ValueError("num_layers must be positive")
|
||||
if self.num_heads <= 0:
|
||||
raise ValueError("num_heads must be positive")
|
||||
if self.hidden_dim % self.num_heads != 0:
|
||||
raise ValueError("hidden_dim must be divisible by num_heads")
|
||||
if not (0.0 <= self.dropout <= 1.0):
|
||||
raise ValueError("dropout must be between 0.0 and 1.0")
|
||||
|
||||
# Vision encoder validation
|
||||
if "clip" not in self.vision_encoder_name.lower():
|
||||
raise ValueError(
|
||||
f"vision_encoder_name must be a CLIP model (contain 'clip'), got '{self.vision_encoder_name}'"
|
||||
)
|
||||
if (
|
||||
self.image_resize_shape
|
||||
and self.image_crop_shape
|
||||
and (
|
||||
self.image_crop_shape[0] > self.image_resize_shape[0]
|
||||
or self.image_crop_shape[1] > self.image_resize_shape[1]
|
||||
)
|
||||
):
|
||||
logging.warning(
|
||||
"image_crop_shape %s must be <= image_resize_shape %s; disabling cropping.",
|
||||
self.image_crop_shape,
|
||||
self.image_resize_shape,
|
||||
)
|
||||
self.image_crop_shape = None
|
||||
|
||||
# Text encoder validation
|
||||
if "clip" not in self.text_encoder_name.lower():
|
||||
raise ValueError(
|
||||
f"text_encoder_name must be a CLIP model (contain 'clip'), got '{self.text_encoder_name}'"
|
||||
)
|
||||
|
||||
# Objective-specific validation
|
||||
if self.objective == "diffusion":
|
||||
if self.noise_scheduler_type not in ["DDPM", "DDIM"]:
|
||||
raise ValueError(
|
||||
f"noise_scheduler_type must be 'DDPM' or 'DDIM', got {self.noise_scheduler_type}"
|
||||
)
|
||||
if self.prediction_type not in ["epsilon", "sample"]:
|
||||
raise ValueError(f"prediction_type must be 'epsilon' or 'sample', got {self.prediction_type}")
|
||||
if self.num_train_timesteps <= 0:
|
||||
raise ValueError(f"num_train_timesteps must be positive, got {self.num_train_timesteps}")
|
||||
if not (0.0 <= self.beta_start <= self.beta_end <= 1.0):
|
||||
raise ValueError(f"Invalid beta values: {self.beta_start}, {self.beta_end}")
|
||||
|
||||
elif self.objective == "flow_matching":
|
||||
if not (0.0 <= self.sigma_min <= 1.0):
|
||||
raise ValueError(f"sigma_min must be in [0, 1], got {self.sigma_min}")
|
||||
if self.num_integration_steps <= 0:
|
||||
raise ValueError(f"num_integration_steps must be positive, got {self.num_integration_steps}")
|
||||
if self.integration_method not in ["euler", "rk4"]:
|
||||
raise ValueError(
|
||||
f"integration_method must be 'euler' or 'rk4', got {self.integration_method}"
|
||||
)
|
||||
if self.timestep_sampling_strategy not in ["uniform", "beta"]:
|
||||
raise ValueError("timestep_sampling_strategy must be 'uniform' or 'beta'")
|
||||
if self.timestep_sampling_strategy == "beta":
|
||||
if not (0.0 < self.timestep_sampling_s <= 1.0):
|
||||
raise ValueError(f"timestep_sampling_s must be in (0, 1], got {self.timestep_sampling_s}")
|
||||
if self.timestep_sampling_alpha <= 0:
|
||||
raise ValueError("timestep_sampling_alpha must be positive")
|
||||
if self.timestep_sampling_beta <= 0:
|
||||
raise ValueError("timestep_sampling_beta must be positive")
|
||||
|
||||
def get_optimizer_preset(self) -> AdamConfig:
|
||||
return AdamConfig(
|
||||
lr=self.optimizer_lr,
|
||||
betas=self.optimizer_betas,
|
||||
eps=self.optimizer_eps,
|
||||
weight_decay=self.optimizer_weight_decay,
|
||||
)
|
||||
|
||||
def get_scheduler_preset(self) -> DiffuserSchedulerConfig:
|
||||
return DiffuserSchedulerConfig(
|
||||
name=self.scheduler_name,
|
||||
num_warmup_steps=self.scheduler_warmup_steps,
|
||||
)
|
||||
|
||||
def validate_features(self) -> None:
|
||||
"""Validate that required input features are present and properly configured."""
|
||||
# If the configured crop doesn't fit, disable cropping instead of erroring.
|
||||
# Note: if image_resize_shape is set, cropping is applied *after* resizing.
|
||||
if self.image_crop_shape is not None:
|
||||
for key, image_ft in self.image_features.items():
|
||||
# image_ft.shape is (C, H, W)
|
||||
effective_h, effective_w = (
|
||||
self.image_resize_shape
|
||||
if self.image_resize_shape is not None
|
||||
else (image_ft.shape[1], image_ft.shape[2])
|
||||
)
|
||||
if self.image_crop_shape[0] > effective_h or self.image_crop_shape[1] > effective_w:
|
||||
logging.warning(
|
||||
"image_crop_shape %s doesn't fit within effective image shape (%s, %s) for '%s'; disabling cropping.",
|
||||
self.image_crop_shape,
|
||||
effective_h,
|
||||
effective_w,
|
||||
key,
|
||||
)
|
||||
self.image_crop_shape = None
|
||||
break
|
||||
|
||||
if len(self.image_features) > 0:
|
||||
first_key, first_ft = next(iter(self.image_features.items()))
|
||||
for key, image_ft in self.image_features.items():
|
||||
if image_ft.shape != first_ft.shape:
|
||||
raise ValueError(
|
||||
f"Image '{key}' shape {image_ft.shape} != '{first_key}' shape {first_ft.shape}"
|
||||
)
|
||||
|
||||
@property
|
||||
def is_diffusion(self) -> bool:
|
||||
return self.objective == "diffusion"
|
||||
|
||||
@property
|
||||
def is_flow_matching(self) -> bool:
|
||||
return self.objective == "flow_matching"
|
||||
|
||||
@property
|
||||
def observation_delta_indices(self) -> list:
|
||||
return list(range(1 - self.n_obs_steps, 1))
|
||||
|
||||
@property
|
||||
def action_delta_indices(self) -> list:
|
||||
return list(range(1 - self.n_obs_steps, 1 - self.n_obs_steps + self.horizon))
|
||||
|
||||
@property
|
||||
def reward_delta_indices(self) -> None:
|
||||
return None
|
||||
@@ -0,0 +1,803 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Multi-Task Diffusion Transformer (DiT) Policy
|
||||
|
||||
Transformer-based diffusion policy for multi-task robot learning with text and vision conditioning.
|
||||
Supports both diffusion and flow matching objectives for action generation.
|
||||
|
||||
References:
|
||||
- https://arxiv.org/abs/2507.05331
|
||||
- https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/
|
||||
- https://brysonkjones.substack.com/p/dissecting-and-open-sourcing-multitask-diffusion-transformer-policy
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections import deque
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import einops
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
import torchvision
|
||||
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
|
||||
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
|
||||
from torch import Tensor
|
||||
|
||||
from lerobot.policies.multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from lerobot.utils.import_utils import _transformers_available
|
||||
|
||||
# Conditional import for type checking and lazy loading
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers import CLIPTextModel, CLIPVisionModel
|
||||
else:
|
||||
CLIPTextModel = None
|
||||
CLIPVisionModel = None
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.policies.utils import populate_queues
|
||||
from lerobot.utils.constants import (
|
||||
ACTION,
|
||||
OBS_IMAGES,
|
||||
OBS_LANGUAGE_ATTENTION_MASK,
|
||||
OBS_LANGUAGE_TOKENS,
|
||||
OBS_STATE,
|
||||
)
|
||||
|
||||
# -- Policy --
|
||||
|
||||
|
||||
class MultiTaskDiTPolicy(PreTrainedPolicy):
|
||||
config_class = MultiTaskDiTConfig
|
||||
name = "multi_task_dit"
|
||||
|
||||
def __init__(self, config: MultiTaskDiTConfig, **kwargs):
|
||||
super().__init__(config)
|
||||
config.validate_features()
|
||||
self.config = config
|
||||
|
||||
self._queues = None
|
||||
|
||||
self.observation_encoder = ObservationEncoder(config)
|
||||
conditioning_dim = self.observation_encoder.conditioning_dim
|
||||
self.noise_predictor = DiffusionTransformer(config, conditioning_dim=conditioning_dim)
|
||||
|
||||
action_dim = config.action_feature.shape[0]
|
||||
horizon = config.horizon
|
||||
|
||||
if config.is_diffusion:
|
||||
self.objective = DiffusionObjective(
|
||||
config,
|
||||
action_dim=action_dim,
|
||||
horizon=horizon,
|
||||
do_mask_loss_for_padding=config.do_mask_loss_for_padding,
|
||||
)
|
||||
elif config.is_flow_matching:
|
||||
self.objective = FlowMatchingObjective(
|
||||
config,
|
||||
action_dim=action_dim,
|
||||
horizon=horizon,
|
||||
do_mask_loss_for_padding=config.do_mask_loss_for_padding,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported objective: {config.objective}")
|
||||
|
||||
self.reset()
|
||||
|
||||
def get_optim_params(self) -> list:
|
||||
"""Returns parameter groups with different learning rates for vision vs non-vision parameters"""
|
||||
non_vision_params = []
|
||||
vision_encoder_params = []
|
||||
|
||||
for name, param in self.named_parameters():
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
|
||||
if "observation_encoder.vision_encoder" in name:
|
||||
vision_encoder_params.append(param)
|
||||
else:
|
||||
non_vision_params.append(param)
|
||||
|
||||
return [
|
||||
{"params": non_vision_params},
|
||||
{
|
||||
"params": vision_encoder_params,
|
||||
"lr": self.config.optimizer_lr * self.config.vision_encoder_lr_multiplier,
|
||||
},
|
||||
]
|
||||
|
||||
def _generate_actions(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
batch_size, n_obs_steps = batch[OBS_STATE].shape[:2]
|
||||
assert n_obs_steps == self.config.n_obs_steps
|
||||
|
||||
conditioning_vec = self.observation_encoder.encode(batch)
|
||||
actions = self.objective.conditional_sample(self.noise_predictor, batch_size, conditioning_vec)
|
||||
|
||||
start = n_obs_steps - 1
|
||||
end = start + self.config.n_action_steps
|
||||
actions = actions[:, start:end]
|
||||
return actions
|
||||
|
||||
def reset(self):
|
||||
"""Clear observation and action queues. Should be called on `env.reset()`"""
|
||||
self._queues = {
|
||||
OBS_STATE: deque(maxlen=self.config.n_obs_steps),
|
||||
ACTION: deque(maxlen=self.config.n_action_steps),
|
||||
}
|
||||
|
||||
if self.config.image_features:
|
||||
self._queues[OBS_IMAGES] = deque(maxlen=self.config.n_obs_steps)
|
||||
|
||||
@torch.no_grad()
|
||||
def predict_action_chunk(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Predict a chunk of actions given environment observations"""
|
||||
self.eval()
|
||||
|
||||
for k in batch:
|
||||
if k in self._queues:
|
||||
batch[k] = torch.stack(list(self._queues[k]), dim=1)
|
||||
|
||||
actions = self._generate_actions(batch)
|
||||
return actions
|
||||
|
||||
def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
|
||||
"""Prepare batch by stacking image features if needed."""
|
||||
if self.config.image_features:
|
||||
batch = dict(batch) # shallow copy to avoid modifying original
|
||||
batch[OBS_IMAGES] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
|
||||
|
||||
return batch
|
||||
|
||||
@torch.no_grad()
|
||||
def select_action(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Select a single action given environment observations"""
|
||||
if ACTION in batch:
|
||||
batch = dict(batch) # shallow copy to avoid modifying original
|
||||
batch.pop(ACTION)
|
||||
|
||||
batch = self._prepare_batch(batch)
|
||||
|
||||
self._queues = populate_queues(self._queues, batch)
|
||||
|
||||
if len(self._queues[ACTION]) == 0:
|
||||
actions = self.predict_action_chunk(batch)
|
||||
self._queues[ACTION].extend(actions.transpose(0, 1))
|
||||
|
||||
action = self._queues[ACTION].popleft()
|
||||
return action
|
||||
|
||||
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict | None]:
|
||||
"""Run the batch through the model and compute the loss for training"""
|
||||
batch = self._prepare_batch(batch)
|
||||
|
||||
conditioning_vec = self.observation_encoder.encode(batch)
|
||||
loss = self.objective.compute_loss(self.noise_predictor, batch, conditioning_vec)
|
||||
|
||||
return loss, None
|
||||
|
||||
|
||||
# -- Observation Encoders --
|
||||
|
||||
|
||||
class CLIPVisionEncoder(nn.Module):
|
||||
"""CLIP vision encoder using the CLS token for global image representation."""
|
||||
|
||||
def __init__(self, model_name: str):
|
||||
super().__init__()
|
||||
self.model_name = model_name
|
||||
self.model = CLIPVisionModel.from_pretrained(self.model_name)
|
||||
self.num_non_spatial_tokens = 1
|
||||
self.embed_dim = self.model.config.hidden_size
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
"""Encode RGB image to CLS token."""
|
||||
outputs = self.model(pixel_values=x, output_hidden_states=False)
|
||||
cls_token = outputs.last_hidden_state[:, 0]
|
||||
b, embed_dim = cls_token.shape
|
||||
return cls_token.reshape(b, embed_dim, 1, 1)
|
||||
|
||||
def get_output_shape(self) -> tuple:
|
||||
return (self.embed_dim, 1, 1)
|
||||
|
||||
|
||||
class CLIPTextEncoder(nn.Module):
|
||||
"""CLIP text encoder with frozen weights and a learnable projection layer.
|
||||
|
||||
Accepts pre-tokenized inputs (input_ids and attention_mask) from the processor pipeline. See the processor
|
||||
pipeline to see how the tokenization is handled.
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: str = "openai/clip-vit-base-patch16", projection_dim: int = 512):
|
||||
super().__init__()
|
||||
self.model_name = model_name
|
||||
self.projection_dim = projection_dim
|
||||
self.text_encoder = CLIPTextModel.from_pretrained(model_name)
|
||||
|
||||
for param in self.text_encoder.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
self.text_embed_dim = self.text_encoder.config.hidden_size
|
||||
self.projection = nn.Linear(self.text_embed_dim, projection_dim)
|
||||
|
||||
def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
|
||||
"""Encode pre-tokenized text to feature vectors."""
|
||||
# Ensure inputs are on the same device as the model
|
||||
device = next(self.parameters()).device
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
|
||||
clip_features = outputs.pooler_output
|
||||
|
||||
return self.projection(clip_features)
|
||||
|
||||
|
||||
class ObservationEncoder(nn.Module):
|
||||
"""Handles all observation processing for the conditioning vector."""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self._setup_preprocessing(config)
|
||||
|
||||
if config.image_features:
|
||||
self.num_cameras = len(config.image_features)
|
||||
self.camera_names = list(config.image_features.keys())
|
||||
|
||||
if config.use_separate_rgb_encoder_per_camera:
|
||||
self.vision_encoders = nn.ModuleList(
|
||||
[CLIPVisionEncoder(model_name=config.vision_encoder_name) for _ in self.camera_names]
|
||||
)
|
||||
self.vision_encoder = None
|
||||
else:
|
||||
self.vision_encoder = CLIPVisionEncoder(model_name=config.vision_encoder_name)
|
||||
self.vision_encoders = None
|
||||
else:
|
||||
self.vision_encoder = None
|
||||
self.vision_encoders = None
|
||||
self.camera_names = []
|
||||
self.num_cameras = 0
|
||||
|
||||
if hasattr(config, "robot_state_feature") and config.robot_state_feature:
|
||||
self.robot_state_dim = config.robot_state_feature.shape[0]
|
||||
else:
|
||||
self.robot_state_dim = 0
|
||||
|
||||
self.text_dim = config.hidden_dim
|
||||
self.text_encoder = CLIPTextEncoder(model_name=config.text_encoder_name, projection_dim=self.text_dim)
|
||||
|
||||
self._setup_vector_output()
|
||||
|
||||
def _apply_preprocessing(self, images: Tensor) -> Tensor:
|
||||
if self.do_resize:
|
||||
images = self.resize(images)
|
||||
if self.do_crop:
|
||||
images = self.maybe_random_crop(images) if self.training else self.center_crop(images)
|
||||
return images
|
||||
|
||||
def _setup_preprocessing(self, config):
|
||||
if config.image_resize_shape is not None:
|
||||
self.do_resize = True
|
||||
self.resize = torchvision.transforms.Resize(
|
||||
size=config.image_resize_shape,
|
||||
interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
|
||||
antialias=True,
|
||||
)
|
||||
else:
|
||||
self.do_resize = False
|
||||
|
||||
if config.image_crop_shape is not None:
|
||||
self.do_crop = True
|
||||
self.center_crop = torchvision.transforms.CenterCrop(config.image_crop_shape)
|
||||
if config.image_crop_is_random:
|
||||
self.maybe_random_crop = torchvision.transforms.RandomCrop(config.image_crop_shape)
|
||||
else:
|
||||
self.maybe_random_crop = self.center_crop
|
||||
else:
|
||||
self.do_crop = False
|
||||
|
||||
def _setup_vector_output(self):
|
||||
total_dim = 0
|
||||
|
||||
if self.vision_encoder is not None or self.vision_encoders is not None:
|
||||
encoder_to_check = self.vision_encoder or next(iter(self.vision_encoders))
|
||||
feature_map_shape = encoder_to_check.get_output_shape()
|
||||
c, h, w = feature_map_shape
|
||||
spatial_feature_dim = c * h * w
|
||||
total_dim += spatial_feature_dim * self.num_cameras
|
||||
|
||||
total_dim += self.robot_state_dim
|
||||
total_dim += self.text_dim
|
||||
|
||||
self.conditioning_dim = total_dim * self.config.n_obs_steps
|
||||
|
||||
def encode(self, batch: dict) -> Tensor:
|
||||
"""Encode observations to vector format."""
|
||||
batch_size, n_obs_steps = batch[OBS_STATE].shape[:2]
|
||||
conditioning_feats = []
|
||||
|
||||
conditioning_feats.append(batch[OBS_STATE])
|
||||
|
||||
if self.vision_encoder is not None or self.vision_encoders is not None:
|
||||
images = batch[OBS_IMAGES]
|
||||
|
||||
if len(images.shape) == 5:
|
||||
images = images.unsqueeze(1)
|
||||
|
||||
if self.config.use_separate_rgb_encoder_per_camera:
|
||||
camera_features = []
|
||||
for cam_idx in range(self.num_cameras):
|
||||
cam_images = images[:, :, cam_idx]
|
||||
cam_images_flat = einops.rearrange(cam_images, "b s c h w -> (b s) c h w")
|
||||
cam_images_flat = self._apply_preprocessing(cam_images_flat)
|
||||
cam_features = self.vision_encoders[cam_idx](cam_images_flat)
|
||||
cam_visual_features = cam_features.flatten(start_dim=1)
|
||||
cam_features_reshaped = einops.rearrange(
|
||||
cam_visual_features, "(b s) f -> b s f", b=batch_size, s=n_obs_steps
|
||||
)
|
||||
camera_features.append(cam_features_reshaped)
|
||||
img_features = torch.cat(camera_features, dim=-1)
|
||||
conditioning_feats.append(img_features)
|
||||
else:
|
||||
images_flat = einops.rearrange(images, "b s n c h w -> (b s n) c h w")
|
||||
images_flat = self._apply_preprocessing(images_flat)
|
||||
visual_features = self.vision_encoder(images_flat).flatten(start_dim=1)
|
||||
img_features = einops.rearrange(
|
||||
visual_features, "(b s n) f -> b s (n f)", b=batch_size, s=n_obs_steps, n=self.num_cameras
|
||||
)
|
||||
conditioning_feats.append(img_features)
|
||||
|
||||
if self.text_encoder is not None and OBS_LANGUAGE_TOKENS in batch:
|
||||
input_ids = batch[OBS_LANGUAGE_TOKENS] # [batch_size, seq_length]
|
||||
attention_mask = batch[OBS_LANGUAGE_ATTENTION_MASK] # [batch_size, seq_length]
|
||||
|
||||
text_features = self.text_encoder(input_ids, attention_mask)
|
||||
|
||||
text_features = text_features.unsqueeze(1).expand(-1, n_obs_steps, -1)
|
||||
conditioning_feats.append(text_features)
|
||||
|
||||
combined_features = torch.cat(conditioning_feats, dim=-1)
|
||||
return combined_features.flatten(start_dim=1)
|
||||
|
||||
|
||||
# -- Transformer Components --
|
||||
|
||||
|
||||
def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
|
||||
"""Modulate input with shift and scale for AdaLN-Zero."""
|
||||
return x * (1 + scale) + shift
|
||||
|
||||
|
||||
class SinusoidalPosEmb(nn.Module):
|
||||
"""Sinusoidal positional embeddings for timesteps."""
|
||||
|
||||
def __init__(self, dim: int):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
device = x.device
|
||||
half_dim = self.dim // 2
|
||||
emb = math.log(10000) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
|
||||
emb = x[:, None] * emb[None, :]
|
||||
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
|
||||
return emb
|
||||
|
||||
|
||||
class RotaryPositionalEmbedding(nn.Module):
|
||||
"""Rotary Position Embedding (RoPE) for transformers."""
|
||||
|
||||
def __init__(self, head_dim: int, max_seq_len: int = 512, base: float = 10000.0):
|
||||
super().__init__()
|
||||
assert head_dim % 2 == 0, "head_dim must be even for RoPE"
|
||||
|
||||
self.head_dim = head_dim
|
||||
self.max_seq_len = max_seq_len
|
||||
self.base = base
|
||||
|
||||
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
|
||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||
self._precompute_cache(max_seq_len)
|
||||
|
||||
def _precompute_cache(self, seq_len: int):
|
||||
t = torch.arange(seq_len, dtype=self.inv_freq.dtype)
|
||||
freqs = torch.outer(t, self.inv_freq)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
self.register_buffer("_cos_cached", emb.cos()[None, None, :, :], persistent=False)
|
||||
self.register_buffer("_sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
||||
|
||||
def _rotate_half(self, x: Tensor) -> Tensor:
|
||||
x1 = x[..., : x.shape[-1] // 2]
|
||||
x2 = x[..., x.shape[-1] // 2 :]
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
def forward(self, q: Tensor, k: Tensor) -> tuple[Tensor, Tensor]:
|
||||
seq_len = q.shape[2]
|
||||
if seq_len > self.max_seq_len:
|
||||
raise ValueError(f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}.")
|
||||
|
||||
cos = self._cos_cached[:, :, :seq_len, :].to(q.dtype)
|
||||
sin = self._sin_cached[:, :, :seq_len, :].to(q.dtype)
|
||||
|
||||
q_rotated = (q * cos) + (self._rotate_half(q) * sin)
|
||||
k_rotated = (k * cos) + (self._rotate_half(k) * sin)
|
||||
return q_rotated, k_rotated
|
||||
|
||||
|
||||
class RoPEAttention(nn.Module):
|
||||
"""Multi-head self-attention with Rotary Position Embedding (RoPE)."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
dropout: float = 0.0,
|
||||
max_seq_len: int = 512,
|
||||
rope_base: float = 10000.0,
|
||||
):
|
||||
super().__init__()
|
||||
assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = hidden_size // num_heads
|
||||
self.scale = self.head_dim**-0.5
|
||||
|
||||
self.qkv_proj = nn.Linear(hidden_size, 3 * hidden_size, bias=True)
|
||||
self.out_proj = nn.Linear(hidden_size, hidden_size, bias=True)
|
||||
self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
|
||||
self.rope = RotaryPositionalEmbedding(head_dim=self.head_dim, max_seq_len=max_seq_len, base=rope_base)
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
B, T, _ = x.shape # noqa: N806
|
||||
|
||||
qkv = self.qkv_proj(x)
|
||||
qkv = qkv.reshape(B, T, 3, self.num_heads, self.head_dim)
|
||||
qkv = qkv.permute(2, 0, 3, 1, 4)
|
||||
q, k, v = qkv[0], qkv[1], qkv[2]
|
||||
|
||||
q, k = self.rope(q, k)
|
||||
|
||||
attn_out = torch.nn.functional.scaled_dot_product_attention(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
dropout_p=self.dropout.p if isinstance(self.dropout, nn.Dropout) and self.training else 0.0,
|
||||
)
|
||||
|
||||
attn_out = attn_out.transpose(1, 2).reshape(B, T, self.hidden_size)
|
||||
return self.out_proj(attn_out)
|
||||
|
||||
|
||||
class TransformerBlock(nn.Module):
|
||||
"""DiT-style transformer block with AdaLN-Zero."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 128,
|
||||
num_heads: int = 4,
|
||||
num_features: int = 128,
|
||||
dropout: float = 0.0,
|
||||
use_rope: bool = False,
|
||||
max_seq_len: int = 512,
|
||||
rope_base: float = 10000.0,
|
||||
):
|
||||
super().__init__()
|
||||
self.use_rope = use_rope
|
||||
|
||||
if use_rope:
|
||||
self.attn = RoPEAttention(
|
||||
hidden_size=hidden_size,
|
||||
num_heads=num_heads,
|
||||
dropout=dropout,
|
||||
max_seq_len=max_seq_len,
|
||||
rope_base=rope_base,
|
||||
)
|
||||
else:
|
||||
self.multihead_attn = nn.MultiheadAttention(
|
||||
hidden_size, num_heads=num_heads, batch_first=True, dropout=dropout
|
||||
)
|
||||
|
||||
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
|
||||
self.mlp = nn.Sequential(
|
||||
nn.Linear(hidden_size, hidden_size * 4),
|
||||
nn.GELU(approximate="tanh"),
|
||||
nn.Linear(hidden_size * 4, hidden_size),
|
||||
)
|
||||
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(num_features, 6 * hidden_size, bias=True))
|
||||
|
||||
def forward(self, x: Tensor, features: Tensor) -> Tensor:
|
||||
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(
|
||||
features
|
||||
).chunk(6, dim=1)
|
||||
|
||||
attn_input = modulate(self.norm1(x), shift_msa.unsqueeze(1), scale_msa.unsqueeze(1))
|
||||
|
||||
if self.use_rope:
|
||||
attn_out = self.attn(attn_input)
|
||||
else:
|
||||
attn_out, _ = self.multihead_attn(attn_input, attn_input, attn_input)
|
||||
|
||||
x = x + gate_msa.unsqueeze(1) * attn_out
|
||||
|
||||
mlp_input = modulate(self.norm2(x), shift_mlp.unsqueeze(1), scale_mlp.unsqueeze(1))
|
||||
mlp_out = self.mlp(mlp_input)
|
||||
x = x + gate_mlp.unsqueeze(1) * mlp_out
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class DiffusionTransformer(nn.Module):
|
||||
"""Transformer-based diffusion noise prediction model."""
|
||||
|
||||
def __init__(self, config, conditioning_dim: int):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.conditioning_dim = conditioning_dim
|
||||
|
||||
self.action_dim = config.action_feature.shape[0]
|
||||
self.horizon = config.horizon
|
||||
self.hidden_size = config.hidden_dim
|
||||
self.num_layers = config.num_layers
|
||||
self.num_heads = config.num_heads
|
||||
self.dropout = config.dropout
|
||||
self.use_rope = config.use_rope
|
||||
|
||||
self.timestep_embed_dim = config.timestep_embed_dim
|
||||
self.time_mlp = nn.Sequential(
|
||||
SinusoidalPosEmb(self.timestep_embed_dim),
|
||||
nn.Linear(self.timestep_embed_dim, 2 * self.timestep_embed_dim),
|
||||
nn.GELU(),
|
||||
nn.Linear(2 * self.timestep_embed_dim, self.timestep_embed_dim),
|
||||
nn.GELU(),
|
||||
)
|
||||
|
||||
self.cond_dim = self.timestep_embed_dim + conditioning_dim
|
||||
self.input_proj = nn.Linear(self.action_dim, self.hidden_size)
|
||||
|
||||
if config.use_positional_encoding:
|
||||
self.pos_embedding = nn.Parameter(
|
||||
torch.empty(1, self.horizon, self.hidden_size).normal_(std=0.02)
|
||||
)
|
||||
else:
|
||||
self.pos_embedding = None
|
||||
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
TransformerBlock(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
num_features=self.cond_dim,
|
||||
dropout=self.dropout,
|
||||
use_rope=self.use_rope,
|
||||
max_seq_len=self.horizon,
|
||||
rope_base=config.rope_base,
|
||||
)
|
||||
for _ in range(self.num_layers)
|
||||
]
|
||||
)
|
||||
|
||||
self.output_proj = nn.Linear(self.hidden_size, self.action_dim)
|
||||
self._initialize_weights()
|
||||
|
||||
def _initialize_weights(self):
|
||||
for block in self.transformer_blocks:
|
||||
nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
|
||||
nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
|
||||
|
||||
def forward(self, x: Tensor, timestep: Tensor, conditioning_vec: Tensor) -> Tensor:
|
||||
_, seq_len, _ = x.shape
|
||||
|
||||
timestep_features = self.time_mlp(timestep)
|
||||
cond_features = torch.cat([timestep_features, conditioning_vec], dim=-1)
|
||||
|
||||
hidden_seq = self.input_proj(x)
|
||||
|
||||
if self.pos_embedding is not None:
|
||||
hidden_seq = hidden_seq + self.pos_embedding[:, :seq_len, :]
|
||||
|
||||
for block in self.transformer_blocks:
|
||||
hidden_seq = block(hidden_seq, cond_features)
|
||||
|
||||
return self.output_proj(hidden_seq)
|
||||
|
||||
|
||||
# -- Objectives --
|
||||
|
||||
|
||||
class DiffusionObjective(nn.Module):
|
||||
"""Standard diffusion (DDPM/DDIM) objective implementation."""
|
||||
|
||||
def __init__(self, config, action_dim: int, horizon: int, do_mask_loss_for_padding: bool = False):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.action_dim = action_dim
|
||||
self.horizon = horizon
|
||||
self.do_mask_loss_for_padding = do_mask_loss_for_padding
|
||||
|
||||
scheduler_kwargs = {
|
||||
"num_train_timesteps": config.num_train_timesteps,
|
||||
"beta_start": config.beta_start,
|
||||
"beta_end": config.beta_end,
|
||||
"beta_schedule": config.beta_schedule,
|
||||
"clip_sample": config.clip_sample,
|
||||
"clip_sample_range": config.clip_sample_range,
|
||||
"prediction_type": config.prediction_type,
|
||||
}
|
||||
|
||||
if config.noise_scheduler_type == "DDPM":
|
||||
self.noise_scheduler: DDPMScheduler | DDIMScheduler = DDPMScheduler(**scheduler_kwargs)
|
||||
elif config.noise_scheduler_type == "DDIM":
|
||||
self.noise_scheduler = DDIMScheduler(**scheduler_kwargs)
|
||||
else:
|
||||
raise ValueError(f"Unsupported noise scheduler type {config.noise_scheduler_type}")
|
||||
|
||||
self.num_inference_steps = (
|
||||
config.num_inference_steps
|
||||
if config.num_inference_steps is not None
|
||||
else self.noise_scheduler.config.num_train_timesteps
|
||||
)
|
||||
|
||||
def compute_loss(self, model: nn.Module, batch: dict[str, Tensor], conditioning_vec: Tensor) -> Tensor:
|
||||
clean_actions = batch[ACTION]
|
||||
noise = torch.randn_like(clean_actions)
|
||||
timesteps = torch.randint(
|
||||
low=0,
|
||||
high=self.noise_scheduler.config.num_train_timesteps,
|
||||
size=(clean_actions.shape[0],),
|
||||
device=clean_actions.device,
|
||||
).long()
|
||||
noisy_actions = self.noise_scheduler.add_noise(clean_actions, noise, timesteps)
|
||||
|
||||
prediction_type = self.noise_scheduler.config.prediction_type
|
||||
if prediction_type == "epsilon":
|
||||
target = noise
|
||||
elif prediction_type == "sample":
|
||||
target = clean_actions
|
||||
else:
|
||||
raise ValueError(f"Unsupported prediction type: {prediction_type}")
|
||||
|
||||
predicted = model(noisy_actions, timesteps, conditioning_vec=conditioning_vec)
|
||||
loss = F.mse_loss(predicted, target, reduction="none")
|
||||
|
||||
if self.do_mask_loss_for_padding and "action_is_pad" in batch:
|
||||
valid_actions = ~batch["action_is_pad"]
|
||||
loss = loss * valid_actions.unsqueeze(-1)
|
||||
|
||||
return loss.mean()
|
||||
|
||||
def conditional_sample(self, model: nn.Module, batch_size: int, conditioning_vec: Tensor) -> Tensor:
|
||||
device = next(model.parameters()).device
|
||||
dtype = next(model.parameters()).dtype
|
||||
|
||||
sample = torch.randn(
|
||||
size=(batch_size, self.horizon, self.action_dim),
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
self.noise_scheduler.set_timesteps(self.num_inference_steps)
|
||||
for t in self.noise_scheduler.timesteps:
|
||||
model_output = model(
|
||||
sample,
|
||||
torch.full(sample.shape[:1], t, dtype=torch.long, device=sample.device),
|
||||
conditioning_vec=conditioning_vec,
|
||||
)
|
||||
sample = self.noise_scheduler.step(model_output, t, sample).prev_sample
|
||||
|
||||
return sample
|
||||
|
||||
|
||||
class FlowMatchingObjective(nn.Module):
|
||||
"""Flow matching objective: trains a model to predict velocity fields."""
|
||||
|
||||
def __init__(self, config, action_dim: int, horizon: int, do_mask_loss_for_padding: bool = False):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.action_dim = action_dim
|
||||
self.horizon = horizon
|
||||
self.do_mask_loss_for_padding = do_mask_loss_for_padding
|
||||
|
||||
def _sample_timesteps(self, batch_size: int, device: torch.device) -> Tensor:
|
||||
if self.config.timestep_sampling_strategy == "uniform":
|
||||
return torch.rand(batch_size, device=device)
|
||||
elif self.config.timestep_sampling_strategy == "beta":
|
||||
beta_dist = torch.distributions.Beta(
|
||||
self.config.timestep_sampling_alpha, self.config.timestep_sampling_beta
|
||||
)
|
||||
u = beta_dist.sample((batch_size,)).to(device)
|
||||
return self.config.timestep_sampling_s * (1.0 - u)
|
||||
else:
|
||||
raise ValueError(f"Unknown timestep strategy: {self.config.timestep_sampling_strategy}")
|
||||
|
||||
def compute_loss(self, model: nn.Module, batch: dict[str, Tensor], conditioning_vec: Tensor) -> Tensor:
|
||||
data = batch[ACTION]
|
||||
batch_size = data.shape[0]
|
||||
device = data.device
|
||||
|
||||
noise = torch.randn_like(data)
|
||||
t = self._sample_timesteps(batch_size, device)
|
||||
t_expanded = t.view(-1, 1, 1)
|
||||
x_t = t_expanded * data + (1 - (1 - self.config.sigma_min) * t_expanded) * noise
|
||||
|
||||
target_velocity = data - (1 - self.config.sigma_min) * noise
|
||||
predicted_velocity = model(x_t, t, conditioning_vec=conditioning_vec)
|
||||
loss = F.mse_loss(predicted_velocity, target_velocity, reduction="none")
|
||||
|
||||
if self.do_mask_loss_for_padding and "action_is_pad" in batch:
|
||||
valid_mask = ~batch["action_is_pad"]
|
||||
loss = loss * valid_mask.unsqueeze(-1)
|
||||
|
||||
return loss.mean()
|
||||
|
||||
def conditional_sample(self, model: nn.Module, batch_size: int, conditioning_vec: Tensor) -> Tensor:
|
||||
device = next(model.parameters()).device
|
||||
dtype = next(model.parameters()).dtype
|
||||
|
||||
x = torch.randn((batch_size, self.horizon, self.action_dim), dtype=dtype, device=device)
|
||||
|
||||
num_steps = self.config.num_integration_steps
|
||||
time_grid = torch.linspace(0, 1, num_steps + 1, device=device)
|
||||
|
||||
if self.config.integration_method == "euler":
|
||||
x = self._euler_integrate(model, x, time_grid, conditioning_vec)
|
||||
elif self.config.integration_method == "rk4":
|
||||
x = self._rk4_integrate(model, x, time_grid, conditioning_vec)
|
||||
else:
|
||||
raise ValueError(f"Unknown integration method: {self.config.integration_method}")
|
||||
|
||||
return x
|
||||
|
||||
def _euler_integrate(
|
||||
self, model: nn.Module, x_init: Tensor, time_grid: Tensor, conditioning_vec: Tensor
|
||||
) -> Tensor:
|
||||
x = x_init
|
||||
for i in range(len(time_grid) - 1):
|
||||
t_scalar = time_grid[i].item()
|
||||
dt = (time_grid[i + 1] - time_grid[i]).item()
|
||||
t_batch = torch.full((x.shape[0],), t_scalar, dtype=x.dtype, device=x.device)
|
||||
with torch.no_grad():
|
||||
velocity = model(x, t_batch, conditioning_vec=conditioning_vec)
|
||||
x = x + dt * velocity
|
||||
return x
|
||||
|
||||
def _rk4_integrate(
|
||||
self, model: nn.Module, x_init: Tensor, time_grid: Tensor, conditioning_vec: Tensor
|
||||
) -> Tensor:
|
||||
x = x_init
|
||||
|
||||
def dynamics(x_val: Tensor, t_scalar: float) -> Tensor:
|
||||
t_batch = torch.full((x_val.shape[0],), t_scalar, dtype=x_val.dtype, device=x_val.device)
|
||||
with torch.no_grad():
|
||||
return model(x_val, t_batch, conditioning_vec=conditioning_vec)
|
||||
|
||||
for i in range(len(time_grid) - 1):
|
||||
t = time_grid[i].item()
|
||||
dt = (time_grid[i + 1] - time_grid[i]).item()
|
||||
|
||||
k1 = dynamics(x, t)
|
||||
k2 = dynamics(x + dt * k1 / 2, t + dt / 2)
|
||||
k3 = dynamics(x + dt * k2 / 2, t + dt / 2)
|
||||
k4 = dynamics(x + dt * k3, t + dt)
|
||||
|
||||
x = x + dt / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
|
||||
|
||||
return x
|
||||
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
from lerobot.policies.multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from lerobot.processor import (
|
||||
AddBatchDimensionProcessorStep,
|
||||
DeviceProcessorStep,
|
||||
NormalizerProcessorStep,
|
||||
PolicyAction,
|
||||
PolicyProcessorPipeline,
|
||||
RenameObservationsProcessorStep,
|
||||
TokenizerProcessorStep,
|
||||
UnnormalizerProcessorStep,
|
||||
)
|
||||
from lerobot.processor.converters import policy_action_to_transition, transition_to_policy_action
|
||||
from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
|
||||
|
||||
|
||||
def make_multi_task_dit_pre_post_processors(
|
||||
config: MultiTaskDiTConfig,
|
||||
dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
|
||||
) -> tuple[
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction],
|
||||
]:
|
||||
"""
|
||||
Constructs pre-processor and post-processor pipelines for a Multi-Task DiT policy.
|
||||
|
||||
The pre-processing pipeline prepares the input data for the model by:
|
||||
1. Renaming features.
|
||||
2. Adding a batch dimension.
|
||||
3. Tokenizing the language task description (if present).
|
||||
4. Moving the data to the specified device.
|
||||
5. Normalizing the input and output features based on dataset statistics.
|
||||
|
||||
The post-processing pipeline handles the model's output by:
|
||||
1. Unnormalizing the output features to their original scale.
|
||||
2. Moving the data to the CPU.
|
||||
|
||||
Args:
|
||||
config: The configuration object for the Multi-Task DiT policy,
|
||||
containing feature definitions, normalization mappings, and device information.
|
||||
dataset_stats: A dictionary of statistics used for normalization.
|
||||
Defaults to None.
|
||||
|
||||
Returns:
|
||||
A tuple containing the configured pre-processor and post-processor pipelines.
|
||||
"""
|
||||
|
||||
input_steps = [
|
||||
RenameObservationsProcessorStep(rename_map={}),
|
||||
AddBatchDimensionProcessorStep(),
|
||||
TokenizerProcessorStep(
|
||||
tokenizer_name=config.text_encoder_name,
|
||||
padding=config.tokenizer_padding,
|
||||
padding_side=config.tokenizer_padding_side,
|
||||
max_length=config.tokenizer_max_length,
|
||||
truncation=config.tokenizer_truncation,
|
||||
),
|
||||
DeviceProcessorStep(device=config.device),
|
||||
NormalizerProcessorStep(
|
||||
features={**config.input_features, **config.output_features},
|
||||
norm_map=config.normalization_mapping,
|
||||
stats=dataset_stats,
|
||||
device=config.device,
|
||||
),
|
||||
]
|
||||
output_steps = [
|
||||
UnnormalizerProcessorStep(
|
||||
features=config.output_features,
|
||||
norm_map=config.normalization_mapping,
|
||||
stats=dataset_stats,
|
||||
),
|
||||
DeviceProcessorStep(device="cpu"),
|
||||
]
|
||||
|
||||
return (
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
|
||||
steps=input_steps,
|
||||
name=POLICY_PREPROCESSOR_DEFAULT_NAME,
|
||||
),
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction](
|
||||
steps=output_steps,
|
||||
name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
|
||||
to_transition=policy_action_to_transition,
|
||||
to_output=transition_to_policy_action,
|
||||
),
|
||||
)
|
||||
@@ -20,12 +20,11 @@ import logging
|
||||
import math
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Literal, TypedDict
|
||||
from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
from torch import Tensor, nn
|
||||
from typing_extensions import Unpack
|
||||
|
||||
from lerobot.utils.import_utils import _transformers_available
|
||||
|
||||
|
||||
@@ -20,12 +20,11 @@ import logging
|
||||
import math
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Literal, TypedDict
|
||||
from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
from torch import Tensor, nn
|
||||
from typing_extensions import Unpack
|
||||
|
||||
from lerobot.utils.import_utils import _transformers_available
|
||||
|
||||
|
||||
@@ -19,13 +19,12 @@ import logging
|
||||
import math
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Literal, TypedDict
|
||||
from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
from torch import Tensor, nn
|
||||
from typing_extensions import Unpack
|
||||
|
||||
from lerobot.utils.import_utils import _scipy_available, _transformers_available
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ import os
|
||||
from importlib.resources import files
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import TypedDict, TypeVar
|
||||
from typing import TypedDict, TypeVar, Unpack
|
||||
|
||||
import packaging
|
||||
import safetensors
|
||||
@@ -28,7 +28,6 @@ from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE
|
||||
from huggingface_hub.errors import HfHubHTTPError
|
||||
from safetensors.torch import load_model as load_model_as_safetensor, save_model as save_model_as_safetensor
|
||||
from torch import Tensor, nn
|
||||
from typing_extensions import Unpack
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.train import TrainPipelineConfig
|
||||
|
||||
@@ -54,12 +54,11 @@ policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base")
|
||||
|
||||
import math
|
||||
from collections import deque
|
||||
from typing import TypedDict
|
||||
from typing import TypedDict, Unpack
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
from torch import Tensor, nn
|
||||
from typing_extensions import Unpack
|
||||
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.policies.rtc.modeling_rtc import RTCProcessor
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, TypeAlias, TypedDict
|
||||
from typing import Any, TypedDict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -36,10 +36,10 @@ class TransitionKey(str, Enum):
|
||||
COMPLEMENTARY_DATA = "complementary_data"
|
||||
|
||||
|
||||
PolicyAction: TypeAlias = torch.Tensor
|
||||
RobotAction: TypeAlias = dict[str, Any]
|
||||
EnvAction: TypeAlias = np.ndarray
|
||||
RobotObservation: TypeAlias = dict[str, Any]
|
||||
PolicyAction = torch.Tensor
|
||||
RobotAction = dict[str, Any]
|
||||
EnvAction = np.ndarray
|
||||
RobotObservation = dict[str, Any]
|
||||
|
||||
|
||||
EnvTransition = TypedDict(
|
||||
|
||||
@@ -39,7 +39,7 @@ from collections.abc import Callable, Iterable, Sequence
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Generic, TypeAlias, TypedDict, TypeVar, cast
|
||||
from typing import Any, TypedDict, TypeVar, cast
|
||||
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
@@ -251,7 +251,7 @@ class ProcessorMigrationError(Exception):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataProcessorPipeline(HubMixin, Generic[TInput, TOutput]):
|
||||
class DataProcessorPipeline[TInput, TOutput](HubMixin):
|
||||
"""A sequential pipeline for processing data, integrated with the Hugging Face Hub.
|
||||
|
||||
This class chains together multiple `ProcessorStep` instances to form a complete
|
||||
@@ -1432,8 +1432,8 @@ class DataProcessorPipeline(HubMixin, Generic[TInput, TOutput]):
|
||||
|
||||
|
||||
# Type aliases for semantic clarity.
|
||||
RobotProcessorPipeline: TypeAlias = DataProcessorPipeline[TInput, TOutput]
|
||||
PolicyProcessorPipeline: TypeAlias = DataProcessorPipeline[TInput, TOutput]
|
||||
RobotProcessorPipeline = DataProcessorPipeline[TInput, TOutput]
|
||||
PolicyProcessorPipeline = DataProcessorPipeline[TInput, TOutput]
|
||||
|
||||
|
||||
class ObservationProcessorStep(ProcessorStep, ABC):
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TypeAlias
|
||||
|
||||
from lerobot.cameras import CameraConfig
|
||||
|
||||
@@ -50,5 +49,5 @@ class SOFollowerRobotConfig(RobotConfig, SOFollowerConfig):
|
||||
pass
|
||||
|
||||
|
||||
SO100FollowerConfig: TypeAlias = SOFollowerRobotConfig
|
||||
SO101FollowerConfig: TypeAlias = SOFollowerRobotConfig
|
||||
SO100FollowerConfig = SOFollowerRobotConfig
|
||||
SO101FollowerConfig = SOFollowerRobotConfig
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
import logging
|
||||
import time
|
||||
from functools import cached_property
|
||||
from typing import TypeAlias
|
||||
|
||||
from lerobot.cameras.utils import make_cameras_from_configs
|
||||
from lerobot.motors import Motor, MotorCalibration, MotorNormMode
|
||||
@@ -230,5 +229,5 @@ class SOFollower(Robot):
|
||||
logger.info(f"{self} disconnected.")
|
||||
|
||||
|
||||
SO100Follower: TypeAlias = SOFollower
|
||||
SO101Follower: TypeAlias = SOFollower
|
||||
SO100Follower = SOFollower
|
||||
SO101Follower = SOFollower
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TypeAlias
|
||||
|
||||
from ..config import TeleoperatorConfig
|
||||
|
||||
@@ -38,5 +37,5 @@ class SOLeaderTeleopConfig(TeleoperatorConfig, SOLeaderConfig):
|
||||
pass
|
||||
|
||||
|
||||
SO100LeaderConfig: TypeAlias = SOLeaderTeleopConfig
|
||||
SO101LeaderConfig: TypeAlias = SOLeaderTeleopConfig
|
||||
SO100LeaderConfig = SOLeaderTeleopConfig
|
||||
SO101LeaderConfig = SOLeaderTeleopConfig
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import TypeAlias
|
||||
|
||||
from lerobot.motors import Motor, MotorCalibration, MotorNormMode
|
||||
from lerobot.motors.feetech import (
|
||||
@@ -156,5 +155,5 @@ class SOLeader(Teleoperator):
|
||||
logger.info(f"{self} disconnected.")
|
||||
|
||||
|
||||
SO100Leader: TypeAlias = SOLeader
|
||||
SO101Leader: TypeAlias = SOLeader
|
||||
SO100Leader = SOLeader
|
||||
SO101Leader = SOLeader
|
||||
|
||||
@@ -16,12 +16,10 @@
|
||||
import json
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import TypeVar
|
||||
|
||||
import imageio
|
||||
|
||||
JsonLike = str | int | float | bool | None | list["JsonLike"] | dict[str, "JsonLike"] | tuple["JsonLike", ...]
|
||||
T = TypeVar("T", bound=JsonLike)
|
||||
|
||||
|
||||
def write_video(video_path, stacked_frames, fps):
|
||||
@@ -33,7 +31,7 @@ def write_video(video_path, stacked_frames, fps):
|
||||
imageio.mimsave(video_path, stacked_frames, fps=fps)
|
||||
|
||||
|
||||
def deserialize_json_into_object(fpath: Path, obj: T) -> T:
|
||||
def deserialize_json_into_object[T: JsonLike](fpath: Path, obj: T) -> T:
|
||||
"""
|
||||
Loads the JSON data from `fpath` and recursively fills `obj` with the
|
||||
corresponding values (strictly matching structure and types).
|
||||
|
||||
@@ -231,3 +231,39 @@ def test_ready_to_send_observation_with_varying_threshold(robot_client, g_thresh
|
||||
robot_client.action_queue.put(act)
|
||||
|
||||
assert robot_client._ready_to_send_observation() is expected
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Regression test: robot type registry populated by robot_client imports
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_robot_client_registers_builtin_robot_types():
|
||||
"""Importing robot_client must populate RobotConfig's ChoiceRegistry.
|
||||
|
||||
This is a regression test for a bug introduced in #2425, where removing
|
||||
robot module imports from robot_client.py caused RobotConfig's registry to
|
||||
be empty, breaking CLI argument parsing with:
|
||||
error: argument --robot.type: invalid choice: 'so101_follower' (choose from )
|
||||
|
||||
Robot types are registered via @RobotConfig.register_subclass() decorators
|
||||
at import time, so all supported modules must be explicitly imported.
|
||||
"""
|
||||
import lerobot.async_inference.robot_client # noqa: F401
|
||||
from lerobot.robots.config import RobotConfig
|
||||
|
||||
known_choices = RobotConfig.get_known_choices()
|
||||
|
||||
expected_robot_types = [
|
||||
"so100_follower",
|
||||
"so101_follower",
|
||||
"koch_follower",
|
||||
"omx_follower",
|
||||
"bi_so_follower",
|
||||
]
|
||||
for robot_type in expected_robot_types:
|
||||
assert robot_type in known_choices, (
|
||||
f"Robot type '{robot_type}' is not registered in RobotConfig's ChoiceRegistry. "
|
||||
f"Ensure the corresponding module is imported in robot_client.py. "
|
||||
f"Known choices: {sorted(known_choices)}"
|
||||
)
|
||||
|
||||
@@ -0,0 +1,624 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# ruff: noqa: E402
|
||||
|
||||
"""Test script for Multi-Task DiT policy.
|
||||
|
||||
To run tests locally:
|
||||
python -m pytest tests/policies/multi_task_dit/test_multi_task_dit.py -v
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
pytest.importorskip("transformers")
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
|
||||
reason="This test requires local transformers installation and is not meant for CI",
|
||||
)
|
||||
|
||||
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
|
||||
from lerobot.policies.multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from lerobot.policies.multi_task_dit.modeling_multi_task_dit import MultiTaskDiTPolicy
|
||||
from lerobot.policies.multi_task_dit.processor_multi_task_dit import (
|
||||
make_multi_task_dit_pre_post_processors,
|
||||
)
|
||||
from lerobot.utils.constants import (
|
||||
ACTION,
|
||||
OBS_IMAGES,
|
||||
OBS_LANGUAGE_ATTENTION_MASK,
|
||||
OBS_LANGUAGE_TOKENS,
|
||||
OBS_STATE,
|
||||
)
|
||||
from lerobot.utils.random_utils import seeded_context, set_seed
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def set_random_seed():
|
||||
seed = 17
|
||||
set_seed(seed)
|
||||
|
||||
|
||||
def create_train_batch(
|
||||
batch_size: int = 2,
|
||||
n_obs_steps: int = 2,
|
||||
horizon: int = 16,
|
||||
state_dim: int = 10,
|
||||
action_dim: int = 10,
|
||||
height: int = 224,
|
||||
width: int = 224,
|
||||
) -> dict[str, Tensor]:
|
||||
"""Create a training batch with visual input and text."""
|
||||
return {
|
||||
"observation.state": torch.randn(batch_size, n_obs_steps, state_dim),
|
||||
f"{OBS_IMAGES}.laptop": torch.rand(batch_size, n_obs_steps, 3, height, width),
|
||||
ACTION: torch.randn(batch_size, horizon, action_dim),
|
||||
"task": ["pick up the cube"] * batch_size,
|
||||
}
|
||||
|
||||
|
||||
def create_observation_batch(
|
||||
batch_size: int = 2, state_dim: int = 10, height: int = 224, width: int = 224
|
||||
) -> dict:
|
||||
"""Create observation batch for inference for a single timestep."""
|
||||
return {
|
||||
"observation.state": torch.randn(batch_size, state_dim),
|
||||
f"{OBS_IMAGES}.laptop": torch.rand(batch_size, 3, height, width),
|
||||
"task": ["pick up the red cube"] * batch_size,
|
||||
}
|
||||
|
||||
|
||||
def create_config(
|
||||
state_dim: int = 10,
|
||||
action_dim: int = 10,
|
||||
n_obs_steps: int = 2,
|
||||
horizon: int = 16,
|
||||
n_action_steps: int = 8,
|
||||
with_visual: bool = True,
|
||||
height: int = 224,
|
||||
width: int = 224,
|
||||
) -> MultiTaskDiTConfig:
|
||||
"""Create a MultiTaskDiT config for testing.
|
||||
|
||||
Args:
|
||||
state_dim: Dimension of state observations
|
||||
action_dim: Dimension of actions
|
||||
n_obs_steps: Number of observation steps
|
||||
horizon: Action prediction horizon
|
||||
n_action_steps: Number of action steps to execute
|
||||
with_visual: Whether to include visual input (default: True)
|
||||
height: Image height (only used if with_visual=True)
|
||||
width: Image width (only used if with_visual=True)
|
||||
"""
|
||||
input_features = {OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(state_dim,))}
|
||||
|
||||
if with_visual:
|
||||
input_features[f"{OBS_IMAGES}.laptop"] = PolicyFeature(
|
||||
type=FeatureType.VISUAL, shape=(3, height, width)
|
||||
)
|
||||
|
||||
config = MultiTaskDiTConfig(
|
||||
input_features=input_features,
|
||||
output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,))},
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
n_action_steps=n_action_steps,
|
||||
# Use smaller model for faster tests
|
||||
hidden_dim=128,
|
||||
num_layers=2,
|
||||
num_heads=4,
|
||||
)
|
||||
|
||||
config.validate_features()
|
||||
return config
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size,state_dim,action_dim", [(2, 10, 10), (1, 6, 6)])
|
||||
def test_multi_task_dit_policy_forward(batch_size: int, state_dim: int, action_dim: int):
|
||||
"""Test forward pass (training mode)."""
|
||||
n_obs_steps = 2
|
||||
horizon = 16
|
||||
n_action_steps = 8
|
||||
|
||||
config = create_config(
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
n_action_steps=n_action_steps,
|
||||
)
|
||||
|
||||
policy = MultiTaskDiTPolicy(config=config)
|
||||
policy.train()
|
||||
|
||||
# Use preprocessor to handle tokenization
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.IDENTITY,
|
||||
}
|
||||
preprocessor, _ = make_multi_task_dit_pre_post_processors(config=config, dataset_stats=None)
|
||||
|
||||
batch = create_train_batch(
|
||||
batch_size=batch_size,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
)
|
||||
|
||||
# Process batch through preprocessor to tokenize task text
|
||||
processed_batch = preprocessor(batch)
|
||||
|
||||
# Test forward pass
|
||||
loss, _ = policy.forward(processed_batch)
|
||||
assert loss is not None
|
||||
assert loss.item() is not None
|
||||
assert loss.shape == ()
|
||||
|
||||
# Test backward pass
|
||||
loss.backward()
|
||||
|
||||
|
||||
def test_multi_task_dit_pre_post_processors():
|
||||
"""Test pre and post processors for Multi-Task DiT policy."""
|
||||
state_dim = 10
|
||||
action_dim = 8
|
||||
n_obs_steps = 2
|
||||
horizon = 16
|
||||
|
||||
config = create_config(
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
n_action_steps=8,
|
||||
)
|
||||
config.device = "cpu"
|
||||
|
||||
# Set normalization mode to match the stats we're providing
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.MEAN_STD, # Use MEAN_STD since we provide mean/std stats
|
||||
"ACTION": NormalizationMode.MIN_MAX,
|
||||
}
|
||||
|
||||
# Create dataset stats for normalization
|
||||
dataset_stats = {
|
||||
"observation.state": {
|
||||
"mean": torch.zeros(state_dim),
|
||||
"std": torch.ones(state_dim),
|
||||
},
|
||||
"action": {
|
||||
"min": torch.full((action_dim,), -1.0),
|
||||
"max": torch.ones(action_dim),
|
||||
},
|
||||
}
|
||||
|
||||
# Create processors
|
||||
preprocessor, postprocessor = make_multi_task_dit_pre_post_processors(
|
||||
config=config, dataset_stats=dataset_stats
|
||||
)
|
||||
|
||||
# Test preprocessor with sample data
|
||||
batch = {
|
||||
"observation.state": torch.randn(state_dim),
|
||||
f"{OBS_IMAGES}.laptop": torch.rand(3, 224, 224),
|
||||
ACTION: torch.randn(action_dim),
|
||||
"task": "pick up the cube",
|
||||
}
|
||||
|
||||
processed_batch = preprocessor(batch)
|
||||
|
||||
# Check that data is batched
|
||||
assert processed_batch["observation.state"].shape == (1, state_dim)
|
||||
assert processed_batch[f"{OBS_IMAGES}.laptop"].shape == (1, 3, 224, 224)
|
||||
assert processed_batch[ACTION].shape == (1, action_dim)
|
||||
# Check that task text was tokenized
|
||||
assert OBS_LANGUAGE_TOKENS in processed_batch
|
||||
assert OBS_LANGUAGE_ATTENTION_MASK in processed_batch
|
||||
assert processed_batch[OBS_LANGUAGE_TOKENS].shape[0] == 1 # batch dimension
|
||||
assert processed_batch[OBS_LANGUAGE_ATTENTION_MASK].shape[0] == 1 # batch dimension
|
||||
|
||||
# Check that data is on correct device
|
||||
assert processed_batch["observation.state"].device.type == "cpu"
|
||||
assert processed_batch[f"{OBS_IMAGES}.laptop"].device.type == "cpu"
|
||||
assert processed_batch[ACTION].device.type == "cpu"
|
||||
|
||||
# Test postprocessor with sample action (PolicyAction is just a torch.Tensor)
|
||||
action = torch.randn(1, action_dim)
|
||||
processed_action = postprocessor(action)
|
||||
|
||||
# Check that action is unnormalized and on CPU
|
||||
assert processed_action.shape == (1, action_dim)
|
||||
assert processed_action.device.type == "cpu"
|
||||
|
||||
|
||||
def test_multi_task_dit_pre_post_processors_normalization():
|
||||
"""Test that normalization and unnormalization work correctly with simple sanity check numbers."""
|
||||
state_dim = 3
|
||||
action_dim = 2
|
||||
|
||||
config = create_config(
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
n_obs_steps=2,
|
||||
horizon=16,
|
||||
n_action_steps=8,
|
||||
)
|
||||
config.device = "cpu"
|
||||
|
||||
# Set normalization mode to match the stats we're providing
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.MEAN_STD, # Use MEAN_STD since we provide mean/std stats
|
||||
"ACTION": NormalizationMode.MIN_MAX,
|
||||
}
|
||||
|
||||
# Use simple stats that will actually transform the values
|
||||
dataset_stats = {
|
||||
"observation.state": {
|
||||
"mean": torch.full((state_dim,), 5.0),
|
||||
"std": torch.full((state_dim,), 2.0),
|
||||
},
|
||||
"action": {
|
||||
"min": torch.zeros(action_dim),
|
||||
"max": torch.full((action_dim,), 2.0),
|
||||
},
|
||||
}
|
||||
|
||||
# Create processors
|
||||
preprocessor, postprocessor = make_multi_task_dit_pre_post_processors(
|
||||
config=config, dataset_stats=dataset_stats
|
||||
)
|
||||
|
||||
# Use simple input values
|
||||
input_state = torch.tensor([7.0, 5.0, 3.0]) # Will normalize to [1.0, 0.0, -1.0]
|
||||
input_action = torch.tensor([1.0, 2.0]) # Will normalize to [0.0, 1.0]
|
||||
|
||||
batch = {
|
||||
"observation.state": input_state,
|
||||
f"{OBS_IMAGES}.laptop": torch.rand(3, 224, 224),
|
||||
ACTION: input_action,
|
||||
"task": "test task",
|
||||
}
|
||||
|
||||
# Process through preprocessor
|
||||
processed_batch = preprocessor(batch)
|
||||
|
||||
# State normalization: (x - mean) / std
|
||||
expected_normalized_state = torch.tensor([1.0, 0.0, -1.0])
|
||||
assert torch.allclose(processed_batch["observation.state"][0], expected_normalized_state, atol=1e-5)
|
||||
|
||||
# Action normalization: (x - min) / (max - min) * 2 - 1
|
||||
expected_normalized_action = torch.tensor([0.0, 1.0])
|
||||
assert torch.allclose(processed_batch[ACTION][0], expected_normalized_action, atol=1e-5)
|
||||
|
||||
# Test unnormalization: should recover original values
|
||||
normalized_action_tensor = processed_batch[ACTION][0:1] # Keep batch dimension
|
||||
unnormalized_action = postprocessor(normalized_action_tensor)
|
||||
|
||||
# Should recover original action values
|
||||
assert torch.allclose(unnormalized_action[0], input_action, atol=1e-4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size,state_dim,action_dim", [(2, 10, 10), (1, 6, 6)])
|
||||
def test_multi_task_dit_policy_select_action(batch_size: int, state_dim: int, action_dim: int):
|
||||
"""Test select_action (inference mode)."""
|
||||
n_obs_steps = 2
|
||||
horizon = 16
|
||||
n_action_steps = 8
|
||||
|
||||
config = create_config(
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
n_action_steps=n_action_steps,
|
||||
)
|
||||
|
||||
policy = MultiTaskDiTPolicy(config=config)
|
||||
policy.eval()
|
||||
policy.reset() # Reset queues before inference
|
||||
|
||||
# Create processors - use IDENTITY normalization when no stats provided
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.IDENTITY,
|
||||
}
|
||||
preprocessor, postprocessor = make_multi_task_dit_pre_post_processors(config=config, dataset_stats=None)
|
||||
|
||||
with torch.no_grad():
|
||||
observation_batch = create_observation_batch(batch_size=batch_size, state_dim=state_dim)
|
||||
# Process observation through preprocessor
|
||||
processed_obs = preprocessor(observation_batch)
|
||||
selected_action = policy.select_action(processed_obs)
|
||||
# Process action through postprocessor (PolicyAction is just a torch.Tensor)
|
||||
processed_action = postprocessor(selected_action)
|
||||
assert processed_action.shape == (batch_size, action_dim)
|
||||
|
||||
|
||||
def test_multi_task_dit_policy_diffusion_objective():
|
||||
"""Test policy with diffusion objective."""
|
||||
batch_size = 2
|
||||
state_dim = 10
|
||||
action_dim = 10
|
||||
n_obs_steps = 2
|
||||
horizon = 16
|
||||
n_action_steps = 8
|
||||
|
||||
input_features = {
|
||||
OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(state_dim,)),
|
||||
f"{OBS_IMAGES}.laptop": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
|
||||
}
|
||||
|
||||
config = MultiTaskDiTConfig(
|
||||
input_features=input_features,
|
||||
output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,))},
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
n_action_steps=n_action_steps,
|
||||
# Use diffusion objective
|
||||
objective="diffusion",
|
||||
noise_scheduler_type="DDPM",
|
||||
num_train_timesteps=100,
|
||||
num_inference_steps=10,
|
||||
# Smaller model for tests
|
||||
hidden_dim=128,
|
||||
num_layers=2,
|
||||
num_heads=4,
|
||||
)
|
||||
config.validate_features()
|
||||
|
||||
policy = MultiTaskDiTPolicy(config=config)
|
||||
policy.train()
|
||||
|
||||
# Use preprocessor to handle tokenization
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.IDENTITY,
|
||||
}
|
||||
preprocessor, _ = make_multi_task_dit_pre_post_processors(config=config, dataset_stats=None)
|
||||
|
||||
batch = create_train_batch(
|
||||
batch_size=batch_size,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
)
|
||||
|
||||
# Process batch through preprocessor to tokenize task text
|
||||
processed_batch = preprocessor(batch)
|
||||
|
||||
# Test forward pass
|
||||
loss, _ = policy.forward(processed_batch)
|
||||
assert loss is not None
|
||||
assert loss.item() is not None
|
||||
|
||||
# Test inference
|
||||
policy.eval()
|
||||
# Use IDENTITY normalization when no stats provided
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.IDENTITY,
|
||||
}
|
||||
preprocessor, postprocessor = make_multi_task_dit_pre_post_processors(config=config, dataset_stats=None)
|
||||
with torch.no_grad():
|
||||
observation_batch = create_observation_batch(batch_size=batch_size, state_dim=state_dim)
|
||||
# Process observation through preprocessor
|
||||
processed_obs = preprocessor(observation_batch)
|
||||
selected_action = policy.select_action(processed_obs)
|
||||
# Process action through postprocessor (PolicyAction is just a torch.Tensor)
|
||||
processed_action = postprocessor(selected_action)
|
||||
assert processed_action.shape == (batch_size, action_dim)
|
||||
|
||||
|
||||
def test_multi_task_dit_policy_flow_matching_objective():
|
||||
"""Test policy with flow matching objective."""
|
||||
batch_size = 2
|
||||
state_dim = 10
|
||||
action_dim = 10
|
||||
n_obs_steps = 2
|
||||
horizon = 16
|
||||
n_action_steps = 8
|
||||
|
||||
input_features = {
|
||||
OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(state_dim,)),
|
||||
f"{OBS_IMAGES}.laptop": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
|
||||
}
|
||||
|
||||
config = MultiTaskDiTConfig(
|
||||
input_features=input_features,
|
||||
output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,))},
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
n_action_steps=n_action_steps,
|
||||
# Use flow matching objective
|
||||
objective="flow_matching",
|
||||
sigma_min=0.0,
|
||||
num_integration_steps=10, # Fewer steps for faster tests
|
||||
integration_method="euler",
|
||||
# Smaller model for tests
|
||||
hidden_dim=128,
|
||||
num_layers=2,
|
||||
num_heads=4,
|
||||
)
|
||||
config.validate_features()
|
||||
|
||||
policy = MultiTaskDiTPolicy(config=config)
|
||||
policy.train()
|
||||
|
||||
# Use preprocessor to handle tokenization
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.IDENTITY,
|
||||
}
|
||||
preprocessor, _ = make_multi_task_dit_pre_post_processors(config=config, dataset_stats=None)
|
||||
|
||||
batch = create_train_batch(
|
||||
batch_size=batch_size,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
)
|
||||
|
||||
# Process batch through preprocessor to tokenize task text
|
||||
processed_batch = preprocessor(batch)
|
||||
|
||||
# Test forward pass
|
||||
loss, _ = policy.forward(processed_batch)
|
||||
assert loss is not None
|
||||
assert loss.item() is not None
|
||||
|
||||
# Test inference
|
||||
policy.eval()
|
||||
# Use IDENTITY normalization when no stats provided
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.IDENTITY,
|
||||
}
|
||||
preprocessor, postprocessor = make_multi_task_dit_pre_post_processors(config=config, dataset_stats=None)
|
||||
with torch.no_grad():
|
||||
observation_batch = create_observation_batch(batch_size=batch_size, state_dim=state_dim)
|
||||
# Process observation through preprocessor
|
||||
processed_obs = preprocessor(observation_batch)
|
||||
selected_action = policy.select_action(processed_obs)
|
||||
# Process action through postprocessor (PolicyAction is just a torch.Tensor)
|
||||
processed_action = postprocessor(selected_action)
|
||||
assert processed_action.shape == (batch_size, action_dim)
|
||||
|
||||
|
||||
def test_multi_task_dit_policy_save_and_load(tmp_path):
|
||||
"""Test that the policy can be saved and loaded correctly."""
|
||||
root = tmp_path / "test_multi_task_dit_save_and_load"
|
||||
|
||||
state_dim = 10
|
||||
action_dim = 10
|
||||
batch_size = 2
|
||||
n_obs_steps = 2
|
||||
horizon = 16
|
||||
n_action_steps = 8
|
||||
|
||||
config = create_config(
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
n_action_steps=n_action_steps,
|
||||
)
|
||||
|
||||
policy = MultiTaskDiTPolicy(config=config)
|
||||
policy.eval()
|
||||
|
||||
# Get device before saving
|
||||
device = next(policy.parameters()).device
|
||||
|
||||
policy.save_pretrained(root)
|
||||
loaded_policy = MultiTaskDiTPolicy.from_pretrained(root, config=config)
|
||||
|
||||
# Explicitly move loaded_policy to the same device
|
||||
loaded_policy.to(device)
|
||||
loaded_policy.eval()
|
||||
|
||||
batch = create_train_batch(
|
||||
batch_size=batch_size,
|
||||
n_obs_steps=n_obs_steps,
|
||||
horizon=horizon,
|
||||
state_dim=state_dim,
|
||||
action_dim=action_dim,
|
||||
)
|
||||
|
||||
# Use preprocessor to handle tokenization
|
||||
config.normalization_mapping = {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.IDENTITY,
|
||||
}
|
||||
preprocessor, postprocessor = make_multi_task_dit_pre_post_processors(config=config, dataset_stats=None)
|
||||
|
||||
with torch.no_grad():
|
||||
with seeded_context(12):
|
||||
# Process batch through preprocessor
|
||||
processed_batch = preprocessor(batch)
|
||||
# Move batch to the same device as the policy
|
||||
for key in processed_batch:
|
||||
if isinstance(processed_batch[key], torch.Tensor):
|
||||
processed_batch[key] = processed_batch[key].to(device)
|
||||
# Collect policy values before saving
|
||||
loss, _ = policy.forward(processed_batch)
|
||||
|
||||
observation_batch = create_observation_batch(batch_size=batch_size, state_dim=state_dim)
|
||||
# Process observation through preprocessor
|
||||
processed_obs = preprocessor(observation_batch)
|
||||
actions = policy.select_action(processed_obs)
|
||||
|
||||
with seeded_context(12):
|
||||
# Process batch through preprocessor
|
||||
processed_batch = preprocessor(batch)
|
||||
# Collect policy values after loading
|
||||
loaded_loss, _ = loaded_policy.forward(processed_batch)
|
||||
|
||||
loaded_observation_batch = create_observation_batch(batch_size=batch_size, state_dim=state_dim)
|
||||
processed_obs = preprocessor(loaded_observation_batch)
|
||||
loaded_actions = loaded_policy.select_action(processed_obs)
|
||||
|
||||
# Compare state dicts
|
||||
assert policy.state_dict().keys() == loaded_policy.state_dict().keys()
|
||||
for k in policy.state_dict():
|
||||
assert torch.allclose(policy.state_dict()[k], loaded_policy.state_dict()[k], atol=1e-6)
|
||||
|
||||
# Compare values before and after saving and loading
|
||||
assert torch.allclose(loss, loaded_loss)
|
||||
assert torch.allclose(actions, loaded_actions)
|
||||
|
||||
|
||||
def test_multi_task_dit_policy_get_optim_params():
|
||||
"""Test that the policy returns correct optimizer parameter groups."""
|
||||
config = create_config(
|
||||
state_dim=10,
|
||||
action_dim=10,
|
||||
n_obs_steps=2,
|
||||
horizon=16,
|
||||
n_action_steps=8,
|
||||
)
|
||||
|
||||
policy = MultiTaskDiTPolicy(config=config)
|
||||
param_groups = policy.get_optim_params()
|
||||
|
||||
# Should have 2 parameter groups: non-vision and vision encoder
|
||||
assert len(param_groups) == 2
|
||||
|
||||
# First group is non-vision params (no lr specified, will use default)
|
||||
assert "params" in param_groups[0]
|
||||
assert len(param_groups[0]["params"]) > 0
|
||||
|
||||
# Second group is vision encoder params with different lr
|
||||
assert "params" in param_groups[1]
|
||||
assert "lr" in param_groups[1]
|
||||
expected_lr = config.optimizer_lr * config.vision_encoder_lr_multiplier
|
||||
assert param_groups[1]["lr"] == expected_lr
|
||||
@@ -40,7 +40,7 @@ from lerobot.utils.constants import (
|
||||
OBS_LANGUAGE_TOKENS,
|
||||
OBS_STATE,
|
||||
) # noqa: E402
|
||||
from tests.utils import require_cuda # noqa: E402
|
||||
from tests.utils import require_cuda, require_hf_token # noqa: E402
|
||||
|
||||
# Constants
|
||||
DUMMY_ACTION_DIM = 7
|
||||
@@ -65,6 +65,7 @@ EXPECTED_ACTIONS_FIRST_5 = torch.tensor([0.0000, 0.3536, 0.0707, 0.0000, 0.0000]
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def set_seed_all(seed: int):
|
||||
"""Set random seed for all RNG sources to ensure reproducibility."""
|
||||
random.seed(seed)
|
||||
@@ -82,6 +83,7 @@ def set_seed_all(seed: int):
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def instantiate_lerobot_pi0_fast(
|
||||
from_pretrained: bool = False,
|
||||
model_path: str = MODEL_PATH_LEROBOT,
|
||||
@@ -125,6 +127,7 @@ def instantiate_lerobot_pi0_fast(
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def create_dummy_data(device=DEVICE):
|
||||
"""Create dummy data for testing both implementations."""
|
||||
batch_size = 1
|
||||
@@ -157,6 +160,7 @@ def create_dummy_data(device=DEVICE):
|
||||
# Pytest fixtures
|
||||
@pytest.fixture(scope="module")
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def pi0_fast_components():
|
||||
"""Fixture to instantiate and provide all PI0Fast components for tests."""
|
||||
print(f"\nTesting with DEVICE='{DEVICE}'")
|
||||
@@ -168,6 +172,7 @@ def pi0_fast_components():
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def policy(pi0_fast_components):
|
||||
"""Fixture to provide the PI0Fast policy for tests."""
|
||||
return pi0_fast_components[0]
|
||||
@@ -175,12 +180,14 @@ def policy(pi0_fast_components):
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def preprocessor(pi0_fast_components):
|
||||
"""Fixture to provide the PI0Fast preprocessor for tests."""
|
||||
return pi0_fast_components[1]
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_pi0_fast_preprocessor_alignment(policy, preprocessor):
|
||||
"""Test that LeRobot PI0Fast preprocessor produces expected outputs."""
|
||||
print("\n" + "=" * 80)
|
||||
@@ -228,6 +235,7 @@ def test_pi0_fast_preprocessor_alignment(policy, preprocessor):
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_pi0_fast_action_generation(policy, preprocessor):
|
||||
"""Test PI0Fast LeRobot implementation generates expected actions."""
|
||||
print("\n" + "=" * 80)
|
||||
@@ -306,6 +314,7 @@ def test_pi0_fast_action_generation(policy, preprocessor):
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_pi0_fast_inference_reproducibility(policy, preprocessor):
|
||||
"""Test that PI0Fast inference is reproducible with the same seed."""
|
||||
print("\n" + "=" * 80)
|
||||
@@ -347,6 +356,7 @@ def test_pi0_fast_inference_reproducibility(policy, preprocessor):
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_pi0_fast_forward_pass_logits(policy, preprocessor):
|
||||
"""Test PI0Fast forward pass and compare logits against expected values."""
|
||||
print("\n" + "=" * 80)
|
||||
@@ -396,6 +406,7 @@ def test_pi0_fast_forward_pass_logits(policy, preprocessor):
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_pi0_fast_action_token_sampling(policy, preprocessor):
|
||||
"""Test PI0Fast action token sampling (autoregressive decoding)."""
|
||||
print("\n" + "=" * 80)
|
||||
@@ -452,6 +463,7 @@ def test_pi0_fast_action_token_sampling(policy, preprocessor):
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_pi0_fast_detokenization(policy, preprocessor):
|
||||
"""Test PI0Fast action detokenization (FAST decoding)."""
|
||||
print("\n" + "=" * 80)
|
||||
|
||||
@@ -14,10 +14,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Test script to verify PI0 policy integration with LeRobot, only meant to be run locally!"""
|
||||
"""Test script to verify PI0 policy integration with LeRobot"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
pytest.importorskip("transformers")
|
||||
|
||||
from lerobot.policies.factory import make_policy_config # noqa: E402
|
||||
from lerobot.policies.pi0 import ( # noqa: E402
|
||||
PI0Config,
|
||||
@@ -25,10 +28,11 @@ from lerobot.policies.pi0 import ( # noqa: E402
|
||||
make_pi0_pre_post_processors, # noqa: E402
|
||||
)
|
||||
from lerobot.utils.random_utils import set_seed # noqa: E402
|
||||
from tests.utils import require_cuda # noqa: E402
|
||||
from tests.utils import require_cuda, require_hf_token # noqa: E402
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_policy_instantiation():
|
||||
# Create config
|
||||
set_seed(42)
|
||||
@@ -105,6 +109,7 @@ def test_policy_instantiation():
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_config_creation():
|
||||
"""Test policy config creation through factory."""
|
||||
try:
|
||||
|
||||
@@ -14,10 +14,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Test script to verify PI0.5 (pi05) support in PI0 policy, only meant to be run locally!"""
|
||||
"""Test script to verify PI0.5 (pi05) support in PI0 policy"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
pytest.importorskip("transformers")
|
||||
|
||||
from lerobot.policies.factory import make_policy_config # noqa: E402
|
||||
from lerobot.policies.pi05 import ( # noqa: E402
|
||||
PI05Config,
|
||||
@@ -25,10 +28,11 @@ from lerobot.policies.pi05 import ( # noqa: E402
|
||||
make_pi05_pre_post_processors, # noqa: E402
|
||||
)
|
||||
from lerobot.utils.random_utils import set_seed
|
||||
from tests.utils import require_cuda # noqa: E402
|
||||
from tests.utils import require_cuda, require_hf_token # noqa: E402
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_policy_instantiation():
|
||||
# Create config
|
||||
set_seed(42)
|
||||
@@ -141,6 +145,7 @@ def test_policy_instantiation():
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_config_creation():
|
||||
"""Test policy config creation through factory."""
|
||||
try:
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Test script to verify PI0OpenPI policy integration with LeRobot vs the original implementation, only meant to be run locally!"""
|
||||
"""Test script to verify PI0OpenPI policy integration with LeRobot vs the original implementation"""
|
||||
|
||||
import os
|
||||
from copy import deepcopy
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Test script to verify PI0 policy integration with LeRobot vs the original implementation, only meant to be run locally!"""
|
||||
"""Test script to verify PI0 policy integration with LeRobot vs the original implementation"""
|
||||
|
||||
import os
|
||||
from copy import deepcopy
|
||||
|
||||
@@ -143,12 +143,18 @@ def test_policy(ds_repo_id, env_name, env_kwargs, policy_name, policy_kwargs):
|
||||
Note: We test various combinations of policy and dataset. The combinations are by no means exhaustive,
|
||||
and for now we add tests as we see fit.
|
||||
"""
|
||||
if policy_name == "vqbet" and DEVICE == "mps":
|
||||
pytest.skip("VQBet does not support MPS backend")
|
||||
if policy_name == "act" and "aloha" in ds_repo_id and DEVICE == "mps":
|
||||
pytest.skip("ACT with aloha has batch mutation issues on MPS")
|
||||
|
||||
train_cfg = TrainPipelineConfig(
|
||||
# TODO(rcadene, aliberts): remove dataset download
|
||||
dataset=DatasetConfig(repo_id=ds_repo_id, episodes=[0]),
|
||||
policy=make_policy_config(policy_name, push_to_hub=False, **policy_kwargs),
|
||||
env=make_env_config(env_name, **env_kwargs),
|
||||
)
|
||||
train_cfg.policy.device = DEVICE
|
||||
train_cfg.validate()
|
||||
|
||||
# Check that we can make the policy object.
|
||||
@@ -227,6 +233,7 @@ def test_act_backbone_lr():
|
||||
dataset=DatasetConfig(repo_id="lerobot/aloha_sim_insertion_scripted", episodes=[0]),
|
||||
policy=make_policy_config("act", optimizer_lr=0.01, optimizer_lr_backbone=0.001, push_to_hub=False),
|
||||
)
|
||||
cfg.policy.device = DEVICE
|
||||
cfg.validate() # Needed for auto-setting some parameters
|
||||
|
||||
assert cfg.policy.optimizer_lr == 0.01
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Test script to verify Wall-X policy integration with LeRobot, only meant to be run locally!"""
|
||||
"""Test script to verify Wall-X policy integration with LeRobot"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@@ -29,10 +29,11 @@ from lerobot.policies.wall_x import WallXConfig # noqa: E402
|
||||
from lerobot.policies.wall_x.modeling_wall_x import WallXPolicy # noqa: E402
|
||||
from lerobot.policies.wall_x.processor_wall_x import make_wall_x_pre_post_processors # noqa: E402
|
||||
from lerobot.utils.random_utils import set_seed # noqa: E402
|
||||
from tests.utils import require_cuda # noqa: E402
|
||||
from tests.utils import require_cuda, require_hf_token # noqa: E402
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_policy_instantiation():
|
||||
# Create config
|
||||
set_seed(42)
|
||||
@@ -118,6 +119,7 @@ def test_policy_instantiation():
|
||||
|
||||
|
||||
@require_cuda
|
||||
@require_hf_token
|
||||
def test_config_creation():
|
||||
"""Test policy config creation through factory."""
|
||||
try:
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Test script to verify XVLA policy integration with LeRobot vs the original implementation, only meant to be run locally!"""
|
||||
"""Test script to verify XVLA policy integration with LeRobot vs the original implementation"""
|
||||
# ruff: noqa: E402
|
||||
|
||||
import random
|
||||
|
||||
@@ -1870,9 +1870,7 @@ class NonCallableStep(ProcessorStep):
|
||||
|
||||
def test_construction_rejects_step_without_call():
|
||||
"""Test that DataProcessorPipeline rejects steps that don't inherit from ProcessorStep."""
|
||||
with pytest.raises(
|
||||
TypeError, match=r"Can't instantiate abstract class NonCallableStep with abstract method __call_"
|
||||
):
|
||||
with pytest.raises(TypeError, match=r"Can't instantiate abstract class NonCallableStep"):
|
||||
DataProcessorPipeline([NonCallableStep()])
|
||||
|
||||
with pytest.raises(TypeError, match=r"must inherit from ProcessorStep"):
|
||||
|
||||
+18
-1
@@ -22,8 +22,9 @@ import torch
|
||||
|
||||
from lerobot import available_cameras, available_motors, available_robots
|
||||
from lerobot.utils.import_utils import is_package_available
|
||||
from lerobot.utils.utils import auto_select_torch_device
|
||||
|
||||
DEVICE = os.environ.get("LEROBOT_TEST_DEVICE", "cuda") if torch.cuda.is_available() else "cpu"
|
||||
DEVICE = os.environ.get("LEROBOT_TEST_DEVICE", str(auto_select_torch_device()))
|
||||
|
||||
TEST_ROBOT_TYPES = []
|
||||
for robot_type in available_robots:
|
||||
@@ -107,6 +108,22 @@ def require_cuda(func):
|
||||
return wrapper
|
||||
|
||||
|
||||
def require_hf_token(func):
|
||||
"""
|
||||
Decorator that skips the test if no Hugging Face Hub token is available.
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
from huggingface_hub import get_token
|
||||
|
||||
if get_token() is None:
|
||||
pytest.skip("requires HF token for gated model access")
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def require_env(func):
|
||||
"""
|
||||
Decorator that skips the test if the required environment package is not installed.
|
||||
|
||||
Reference in New Issue
Block a user