From 5ac3e568f1885e09b176330b958bf9fb91afb0b2 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 1 Apr 2026 13:48:06 +0200 Subject: [PATCH] add umi example --- docs/source/_toctree.yml | 2 + docs/source/umi_pi0_relative_ee.mdx | 138 +++++++++++ .../convert_umi_dataset.py | 161 +++++++++++++ examples/umi_pi0_relative_ee/evaluate.py | 227 ++++++++++++++++++ 4 files changed, 528 insertions(+) create mode 100644 docs/source/umi_pi0_relative_ee.mdx create mode 100644 examples/umi_pi0_relative_ee/convert_umi_dataset.py create mode 100644 examples/umi_pi0_relative_ee/evaluate.py diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index cc41ddb51..02797ce03 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -21,6 +21,8 @@ title: Training with PEFT (e.g., LoRA) - local: rename_map title: Using Rename Map and Empty Cameras + - local: umi_pi0_relative_ee + title: UMI Data with pi0 Relative EE Actions title: "Tutorials" - sections: - local: lerobot-dataset-v3 diff --git a/docs/source/umi_pi0_relative_ee.mdx b/docs/source/umi_pi0_relative_ee.mdx new file mode 100644 index 000000000..26a24d11c --- /dev/null +++ b/docs/source/umi_pi0_relative_ee.mdx @@ -0,0 +1,138 @@ +# UMI Data with pi0 Relative EE Actions + +This guide explains how to prepare a UMI-collected dataset for training a pi0 policy with relative end-effector (EE) actions, and how to run inference with the trained model. + +**What we will do:** + +1. How to add `observation.state` to an existing UMI LeRobot dataset. +2. How to train pi0 with `use_relative_actions=True`. +3. How to evaluate the trained policy on a real robot. + +## Background + +[UMI (Universal Manipulation Interface)](https://umi-gripper.github.io) collects manipulation data with hand-held grippers, recovering 6-DoF EE poses via SLAM. UMI datasets stored in LeRobot format already contain `action` (absolute EE pose) and wrist-camera images. To train pi0 with relative actions, we need two additions: + +1. **`observation.state`** — the current EE pose the policy conditions on. +2. **Relative action statistics** — so the normalizer sees `(action − state)` distributions. + +### Why relative actions? + +With relative actions, each action in a chunk is an **offset from the current state** rather than an absolute target: + +``` +relative_action[i] = absolute_action[t + i] − state[t] +``` + +This is the representation advocated by UMI (Chi et al., 2024). Compared to absolute actions it removes the need for a consistent global coordinate frame, and compared to delta actions (each step relative to the previous) it avoids error accumulation across the chunk. See the [Action Representations](action_representations) guide for a full comparison. + +## State-Action Offset + +UMI SLAM produces a single trajectory of EE poses stored as `action`. We derive `observation.state` from the same trajectory with a configurable offset: + +``` +state[t] = action[t - offset] +``` + +| Offset | `state[t]` | Meaning | +| ------ | ------------- | ---------------------------------------------------------------- | +| 0 | `action[t]` | State and action are the same pose at time t | +| 1 | `action[t-1]` | State is the previous action — where the gripper already arrived | + +An offset of 1 is the typical UMI convention: at decision time the "current state" is where the gripper _already is_ (the result of the previous command), and the action is where it should go next. At episode boundaries where `t < offset`, we clamp to `action[0]`. + +## Step 1: Add State and Recompute Stats + +The conversion script in `examples/umi_pi0_relative_ee/convert_umi_dataset.py` handles both steps. Edit the constants at the top of the file: + +```python +HF_DATASET_ID = "/" +STATE_ACTION_OFFSET = 1 +RELATIVE_EXCLUDE_JOINTS = ["gripper"] +CHUNK_SIZE = 50 +``` + +Then run: + +```bash +python examples/umi_pi0_relative_ee/convert_umi_dataset.py +``` + +This: + +- Loads your existing UMI LeRobot dataset. +- Adds `observation.state` derived from the `action` column with the configured offset. +- Calls `recompute_stats(relative_action=True)` to compute chunk-level relative action statistics. + +The `RELATIVE_EXCLUDE_JOINTS` parameter specifies joints that stay absolute. Gripper commands are typically binary or continuous open/close and don't benefit from relative encoding. + + + +If your dataset already has `observation.state`, the script skips the feature-adding step and only recomputes relative action statistics. + + + +## Step 2: Train + +No custom training script is needed — standard `lerobot-train` handles everything: + +```bash +lerobot-train \ + --dataset.repo_id=/ \ + --policy.type=pi0 \ + --policy.pretrained_path=lerobot/pi0_base \ + --policy.use_relative_actions=true \ + --policy.relative_exclude_joints='["gripper"]' +``` + +Under the hood, the training pipeline: + +- Loads relative action stats from the dataset's `meta/stats.json`. +- Configures `RelativeActionsProcessorStep` in the preprocessor (absolute → relative before normalization). +- The model trains on normalized relative action values. + +See the [pi0 documentation](pi0) for all available training options. + +## Step 3: Evaluate + +The evaluation script in `examples/umi_pi0_relative_ee/evaluate.py` runs inference on a real robot (SO-100 with EE space): + +```bash +python examples/umi_pi0_relative_ee/evaluate.py +``` + +Edit `HF_MODEL_ID`, `HF_DATASET_ID`, and robot configuration at the top of the file. + +The inference flow uses pi0's built-in processor pipeline — no custom wrappers needed: + +1. **Robot → FK** — Joint positions are converted to EE pose via `ForwardKinematicsJointsToEE`, producing `observation.state`. +2. **Preprocessor** — `RelativeActionsProcessorStep` caches the raw `observation.state`, then `NormalizerProcessorStep` normalizes everything. +3. **pi0 inference** — The model predicts a normalized relative action chunk. +4. **Postprocessor** — `UnnormalizerProcessorStep` unnormalizes, then `AbsoluteActionsProcessorStep` adds the cached state back to get absolute EE targets. +5. **IK → Robot** — `InverseKinematicsEEToJoints` converts absolute EE targets to joint commands. + +## How the Pieces Fit Together + +``` +Training: + dataset (absolute EE) → RelativeActionsProcessorStep → NormalizerProcessorStep → pi0 model + +Inference: + robot joints → FK → observation.state (absolute EE) + ↓ + RelativeActionsProcessorStep (caches state) + ↓ + NormalizerProcessorStep → pi0 model → relative action chunk + ↓ + UnnormalizerProcessorStep + ↓ + AbsoluteActionsProcessorStep (+ cached state → absolute EE) + ↓ + IK → joint targets → robot +``` + +## References + +- [UMI: Universal Manipulation Interface](https://umi-gripper.github.io) — Chi et al., 2024. Defines relative trajectory actions. +- [Action Representations](action_representations) — LeRobot guide comparing absolute, relative, and delta actions. +- [pi0 documentation](pi0) — Full pi0 configuration including `use_relative_actions`. +- [`examples/so100_to_so100_EE/`](https://github.com/huggingface/lerobot/tree/main/examples/so100_to_so100_EE) — EE-space evaluation example this builds on. diff --git a/examples/umi_pi0_relative_ee/convert_umi_dataset.py b/examples/umi_pi0_relative_ee/convert_umi_dataset.py new file mode 100644 index 000000000..17bb5ffd9 --- /dev/null +++ b/examples/umi_pi0_relative_ee/convert_umi_dataset.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Add ``observation.state`` to an existing UMI LeRobot dataset and recompute +stats for pi0 training with relative EE actions. + +UMI datasets already contain ``action`` (absolute EE pose from SLAM) and +images. This script derives ``observation.state`` from the action column +and recomputes statistics with relative action stats. + +State-Action Offset: +UMI SLAM produces a single trajectory of EE poses stored as ``action``. +We derive ``observation.state`` from the same trajectory with a +configurable offset: + + state[t] = action[t - STATE_ACTION_OFFSET] + +With offset=0, state equals action at the same timestep. With offset=1, +state is the previous timestep's action — representing where the gripper +*arrived* (the result of the previous command), which is what the robot +knows at decision time. Offset=1 is the typical UMI convention. + +For the first frame(s) of each episode where t < offset, we use the +earliest available action (action[0]) as state. + +After adding state, train with standard lerobot-train: + lerobot-train \\ + --dataset.repo_id= \\ + --policy.type=pi0 \\ + --policy.use_relative_actions=true \\ + --policy.relative_exclude_joints='["gripper"]' \\ + --policy.pretrained_path=lerobot/pi0_base + +Usage: + python convert_umi_dataset.py +""" + +from __future__ import annotations + +import logging + +import numpy as np + +from lerobot.datasets.dataset_tools import add_features, recompute_stats +from lerobot.datasets.lerobot_dataset import LeRobotDataset + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# ── Configuration ───────────────────────────────────────────────────────── + +HF_DATASET_ID = "/" + +# Offset between state and action indices within each episode. +# 0 → state[t] = action[t] (same instant) +# 1 → state[t] = action[t-1] (state lags by 1 step — typical for UMI) +STATE_ACTION_OFFSET = 1 + +# Joint names to keep absolute (not converted to relative). +RELATIVE_EXCLUDE_JOINTS: list[str] = ["gripper"] + +# pi0 chunk size (for relative stats computation). +CHUNK_SIZE = 50 + + +# ── Build state from action with offset ────────────────────────────────── + + +def build_state_array(dataset: LeRobotDataset, offset: int) -> np.ndarray: + """Derive observation.state from the action column with a per-episode offset. + + For each frame t in an episode: + state[t] = action[max(0, t - offset)] (clamped to episode start) + """ + hf = dataset.hf_dataset + actions = np.array(hf["action"], dtype=np.float32) + episode_indices = np.array(hf["episode_index"]) + frame_indices = np.array(hf["frame_index"]) + + states = np.empty_like(actions) + + for ep_idx in np.unique(episode_indices): + ep_mask = episode_indices == ep_idx + ep_global_indices = np.where(ep_mask)[0] + ep_actions = actions[ep_global_indices] + ep_frames = frame_indices[ep_global_indices] + + sort_order = np.argsort(ep_frames) + ep_global_indices = ep_global_indices[sort_order] + ep_actions = ep_actions[sort_order] + + n = len(ep_actions) + for local_t in range(n): + source_t = max(0, local_t - offset) + states[ep_global_indices[local_t]] = ep_actions[source_t] + + return states + + +def main(): + logger.info(f"Loading dataset {HF_DATASET_ID}") + dataset = LeRobotDataset(HF_DATASET_ID) + + if "observation.state" in dataset.features: + logger.warning("observation.state already exists — skipping add_features") + augmented = dataset + else: + logger.info(f"Building observation.state from action with offset={STATE_ACTION_OFFSET}") + state_array = build_state_array(dataset, offset=STATE_ACTION_OFFSET) + + action_meta = dataset.features["action"] + state_feature_info = { + "dtype": "float32", + "shape": list(action_meta["shape"]), + "names": action_meta.get("names"), + } + + augmented = add_features( + dataset, + features={ + "observation.state": (state_array, state_feature_info), + }, + ) + logger.info("observation.state added") + + logger.info("Recomputing stats with relative action statistics...") + recompute_stats( + augmented, + relative_action=True, + relative_exclude_joints=RELATIVE_EXCLUDE_JOINTS, + chunk_size=CHUNK_SIZE, + ) + + logger.info(f"Dataset ready at {augmented.root}") + logger.info( + "Train with:\n" + " lerobot-train \\\n" + f" --dataset.repo_id={augmented.repo_id} \\\n" + " --policy.type=pi0 \\\n" + " --policy.use_relative_actions=true \\\n" + f" --policy.relative_exclude_joints='{RELATIVE_EXCLUDE_JOINTS}' \\\n" + " --policy.pretrained_path=lerobot/pi0_base" + ) + + +if __name__ == "__main__": + main() diff --git a/examples/umi_pi0_relative_ee/evaluate.py b/examples/umi_pi0_relative_ee/evaluate.py new file mode 100644 index 000000000..a2906fd48 --- /dev/null +++ b/examples/umi_pi0_relative_ee/evaluate.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Inference script for a pi0 model trained with **relative EE actions**. + +This uses the built-in ``RelativeActionsProcessorStep`` and +``AbsoluteActionsProcessorStep`` that are already wired into pi0's +processor pipeline when ``use_relative_actions=True``. + +The inference loop: + 1. Reads joint positions from the robot. + 2. Converts to EE pose via forward kinematics (FK). + This produces ``observation.state`` with the current EE pose. + 3. The pi0 preprocessor: + a) ``RelativeActionsProcessorStep`` caches the raw state. + b) ``NormalizerProcessorStep`` normalizes state and actions. + 4. pi0 predicts relative action chunk. + 5. The pi0 postprocessor: + a) ``UnnormalizerProcessorStep`` unnormalizes. + b) ``AbsoluteActionsProcessorStep`` adds cached state → absolute EE. + 6. IK converts absolute EE → joint targets → robot. + +Based on the so100_to_so100_EE/evaluate.py example. + +Usage: + python evaluate.py +""" + +from __future__ import annotations + +from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig +from lerobot.configs.types import FeatureType, PolicyFeature +from lerobot.datasets.feature_utils import combine_feature_dicts +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.pipeline_features import aggregate_pipeline_dataset_features, create_initial_features +from lerobot.model.kinematics import RobotKinematics +from lerobot.policies.factory import make_pre_post_processors +from lerobot.policies.pi0.modeling_pi0 import PI0Policy +from lerobot.processor import ( + RobotProcessorPipeline, + make_default_teleop_action_processor, +) +from lerobot.processor.converters import ( + observation_to_transition, + robot_action_observation_to_transition, + transition_to_observation, + transition_to_robot_action, +) +from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig +from lerobot.robots.so_follower.robot_kinematic_processor import ( + ForwardKinematicsJointsToEE, + InverseKinematicsEEToJoints, +) +from lerobot.scripts.lerobot_record import record_loop +from lerobot.types import RobotAction, RobotObservation +from lerobot.utils.control_utils import init_keyboard_listener +from lerobot.utils.utils import log_say +from lerobot.utils.visualization_utils import init_rerun + +NUM_EPISODES = 5 +FPS = 10 +EPISODE_TIME_SEC = 60 +TASK_DESCRIPTION = "manipulation task" + +HF_MODEL_ID = "/" +HF_DATASET_ID = "/" + +# EE feature keys produced by ForwardKinematicsJointsToEE +EE_KEYS = ["x", "y", "z", "wx", "wy", "wz", "gripper_pos"] + + +def main(): + camera_config = {"wrist": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=FPS)} + robot_config = SO100FollowerConfig( + port="/dev/tty.usbmodem5A460814411", + id="my_awesome_follower_arm", + cameras=camera_config, + use_degrees=True, + ) + robot = SO100Follower(robot_config) + + policy = PI0Policy.from_pretrained(HF_MODEL_ID) + + kinematics_solver = RobotKinematics( + urdf_path="./SO101/so101_new_calib.urdf", + target_frame_name="gripper_frame_link", + joint_names=list(robot.bus.motors.keys()), + ) + + # FK: joint observation → EE observation (produces observation.state) + robot_joints_to_ee_processor = RobotProcessorPipeline[RobotObservation, RobotObservation]( + steps=[ + ForwardKinematicsJointsToEE( + kinematics=kinematics_solver, + motor_names=list(robot.bus.motors.keys()), + ) + ], + to_transition=observation_to_transition, + to_output=transition_to_observation, + ) + + # IK: EE action → joint targets + robot_ee_to_joints_processor = RobotProcessorPipeline[tuple[RobotAction, RobotObservation], RobotAction]( + steps=[ + InverseKinematicsEEToJoints( + kinematics=kinematics_solver, + motor_names=list(robot.bus.motors.keys()), + initial_guess_current_joints=True, + ), + ], + to_transition=robot_action_observation_to_transition, + to_output=transition_to_robot_action, + ) + + # Dataset handle for stats (used by preprocessor/postprocessor) + dataset = LeRobotDataset.create( + repo_id=HF_DATASET_ID, + fps=FPS, + features=combine_feature_dicts( + aggregate_pipeline_dataset_features( + pipeline=robot_joints_to_ee_processor, + initial_features=create_initial_features(observation=robot.observation_features), + use_videos=True, + ), + aggregate_pipeline_dataset_features( + pipeline=make_default_teleop_action_processor(), + initial_features=create_initial_features( + action={f"ee.{k}": PolicyFeature(type=FeatureType.ACTION, shape=(1,)) for k in EE_KEYS} + ), + use_videos=True, + ), + ), + robot_type=robot.name, + use_videos=True, + image_writer_threads=4, + ) + + # Build pre/post processors from the trained model. + # The pi0 processor pipeline already includes: + # pre: ... → RelativeActionsProcessorStep → NormalizerProcessorStep + # post: UnnormalizerProcessorStep → AbsoluteActionsProcessorStep → ... + # These handle the relative ↔ absolute conversion automatically. + preprocessor, postprocessor = make_pre_post_processors( + policy_cfg=policy, + pretrained_path=HF_MODEL_ID, + dataset_stats=dataset.meta.stats, + preprocessor_overrides={"device_processor": {"device": str(policy.config.device)}}, + ) + + robot.connect() + + listener, events = init_keyboard_listener() + init_rerun(session_name="umi_pi0_relative_ee_evaluate") + + try: + if not robot.is_connected: + raise ValueError("Robot is not connected!") + + print("Starting evaluate loop...") + for episode_idx in range(NUM_EPISODES): + log_say(f"Running inference, recording eval episode {episode_idx + 1} of {NUM_EPISODES}") + + record_loop( + robot=robot, + events=events, + fps=FPS, + policy=policy, + preprocessor=preprocessor, + postprocessor=postprocessor, + dataset=dataset, + control_time_s=EPISODE_TIME_SEC, + single_task=TASK_DESCRIPTION, + display_data=True, + teleop_action_processor=make_default_teleop_action_processor(), + robot_action_processor=robot_ee_to_joints_processor, + robot_observation_processor=robot_joints_to_ee_processor, + ) + + if not events["stop_recording"] and ( + (episode_idx < NUM_EPISODES - 1) or events["rerecord_episode"] + ): + log_say("Reset the environment") + record_loop( + robot=robot, + events=events, + fps=FPS, + control_time_s=EPISODE_TIME_SEC, + single_task=TASK_DESCRIPTION, + display_data=True, + teleop_action_processor=make_default_teleop_action_processor(), + robot_action_processor=robot_ee_to_joints_processor, + robot_observation_processor=robot_joints_to_ee_processor, + ) + + if events["rerecord_episode"]: + log_say("Re-record episode") + events["rerecord_episode"] = False + events["exit_early"] = False + dataset.clear_episode_buffer() + continue + + dataset.save_episode() + finally: + log_say("Stop recording") + robot.disconnect() + listener.stop() + + dataset.finalize() + dataset.push_to_hub() + + +if __name__ == "__main__": + main()