From 5ac3e568f1885e09b176330b958bf9fb91afb0b2 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 1 Apr 2026 13:48:06 +0200
Subject: [PATCH] add umi example

---
 docs/source/_toctree.yml                      |   2 +
 docs/source/umi_pi0_relative_ee.mdx           | 138 +++++++++++
 .../convert_umi_dataset.py                    | 161 +++++++++++++
 examples/umi_pi0_relative_ee/evaluate.py      | 227 ++++++++++++++++++
 4 files changed, 528 insertions(+)
 create mode 100644 docs/source/umi_pi0_relative_ee.mdx
 create mode 100644 examples/umi_pi0_relative_ee/convert_umi_dataset.py
 create mode 100644 examples/umi_pi0_relative_ee/evaluate.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index cc41ddb51..02797ce03 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -21,6 +21,8 @@
     title: Training with PEFT (e.g., LoRA)
   - local: rename_map
     title: Using Rename Map and Empty Cameras
+  - local: umi_pi0_relative_ee
+    title: UMI Data with pi0 Relative EE Actions
   title: "Tutorials"
 - sections:
   - local: lerobot-dataset-v3
diff --git a/docs/source/umi_pi0_relative_ee.mdx b/docs/source/umi_pi0_relative_ee.mdx
new file mode 100644
index 000000000..26a24d11c
--- /dev/null
+++ b/docs/source/umi_pi0_relative_ee.mdx
@@ -0,0 +1,138 @@
+# UMI Data with pi0 Relative EE Actions
+
+This guide explains how to prepare a UMI-collected dataset for training a pi0 policy with relative end-effector (EE) actions, and how to run inference with the trained model.
+
+**What we will do:**
+
+1. How to add `observation.state` to an existing UMI LeRobot dataset.
+2. How to train pi0 with `use_relative_actions=True`.
+3. How to evaluate the trained policy on a real robot.
+
+## Background
+
+[UMI (Universal Manipulation Interface)](https://umi-gripper.github.io) collects manipulation data with hand-held grippers, recovering 6-DoF EE poses via SLAM. UMI datasets stored in LeRobot format already contain `action` (absolute EE pose) and wrist-camera images. To train pi0 with relative actions, we need two additions:
+
+1. **`observation.state`** — the current EE pose the policy conditions on.
+2. **Relative action statistics** — so the normalizer sees `(action − state)` distributions.
+
+### Why relative actions?
+
+With relative actions, each action in a chunk is an **offset from the current state** rather than an absolute target:
+
+```
+relative_action[i] = absolute_action[t + i] − state[t]
+```
+
+This is the representation advocated by UMI (Chi et al., 2024). Compared to absolute actions it removes the need for a consistent global coordinate frame, and compared to delta actions (each step relative to the previous) it avoids error accumulation across the chunk. See the [Action Representations](action_representations) guide for a full comparison.
+
+## State-Action Offset
+
+UMI SLAM produces a single trajectory of EE poses stored as `action`. We derive `observation.state` from the same trajectory with a configurable offset:
+
+```
+state[t] = action[t - offset]
+```
+
+| Offset | `state[t]`    | Meaning                                                          |
+| ------ | ------------- | ---------------------------------------------------------------- |
+| 0      | `action[t]`   | State and action are the same pose at time t                     |
+| 1      | `action[t-1]` | State is the previous action — where the gripper already arrived |
+
+An offset of 1 is the typical UMI convention: at decision time the "current state" is where the gripper _already is_ (the result of the previous command), and the action is where it should go next. At episode boundaries where `t < offset`, we clamp to `action[0]`.
+
+## Step 1: Add State and Recompute Stats
+
+The conversion script in `examples/umi_pi0_relative_ee/convert_umi_dataset.py` handles both steps. Edit the constants at the top of the file:
+
+```python
+HF_DATASET_ID = "<hf_username>/<dataset_repo_id>"
+STATE_ACTION_OFFSET = 1
+RELATIVE_EXCLUDE_JOINTS = ["gripper"]
+CHUNK_SIZE = 50
+```
+
+Then run:
+
+```bash
+python examples/umi_pi0_relative_ee/convert_umi_dataset.py
+```
+
+This:
+
+- Loads your existing UMI LeRobot dataset.
+- Adds `observation.state` derived from the `action` column with the configured offset.
+- Calls `recompute_stats(relative_action=True)` to compute chunk-level relative action statistics.
+
+The `RELATIVE_EXCLUDE_JOINTS` parameter specifies joints that stay absolute. Gripper commands are typically binary or continuous open/close and don't benefit from relative encoding.
+
+<Tip>
+
+If your dataset already has `observation.state`, the script skips the feature-adding step and only recomputes relative action statistics.
+
+</Tip>
+
+## Step 2: Train
+
+No custom training script is needed — standard `lerobot-train` handles everything:
+
+```bash
+lerobot-train \
+    --dataset.repo_id=<hf_username>/<dataset_repo_id> \
+    --policy.type=pi0 \
+    --policy.pretrained_path=lerobot/pi0_base \
+    --policy.use_relative_actions=true \
+    --policy.relative_exclude_joints='["gripper"]'
+```
+
+Under the hood, the training pipeline:
+
+- Loads relative action stats from the dataset's `meta/stats.json`.
+- Configures `RelativeActionsProcessorStep` in the preprocessor (absolute → relative before normalization).
+- The model trains on normalized relative action values.
+
+See the [pi0 documentation](pi0) for all available training options.
+
+## Step 3: Evaluate
+
+The evaluation script in `examples/umi_pi0_relative_ee/evaluate.py` runs inference on a real robot (SO-100 with EE space):
+
+```bash
+python examples/umi_pi0_relative_ee/evaluate.py
+```
+
+Edit `HF_MODEL_ID`, `HF_DATASET_ID`, and robot configuration at the top of the file.
+
+The inference flow uses pi0's built-in processor pipeline — no custom wrappers needed:
+
+1. **Robot → FK** — Joint positions are converted to EE pose via `ForwardKinematicsJointsToEE`, producing `observation.state`.
+2. **Preprocessor** — `RelativeActionsProcessorStep` caches the raw `observation.state`, then `NormalizerProcessorStep` normalizes everything.
+3. **pi0 inference** — The model predicts a normalized relative action chunk.
+4. **Postprocessor** — `UnnormalizerProcessorStep` unnormalizes, then `AbsoluteActionsProcessorStep` adds the cached state back to get absolute EE targets.
+5. **IK → Robot** — `InverseKinematicsEEToJoints` converts absolute EE targets to joint commands.
+
+## How the Pieces Fit Together
+
+```
+Training:
+  dataset (absolute EE) → RelativeActionsProcessorStep → NormalizerProcessorStep → pi0 model
+
+Inference:
+  robot joints → FK → observation.state (absolute EE)
+                           ↓
+                   RelativeActionsProcessorStep (caches state)
+                           ↓
+                   NormalizerProcessorStep → pi0 model → relative action chunk
+                           ↓
+                   UnnormalizerProcessorStep
+                           ↓
+                   AbsoluteActionsProcessorStep (+ cached state → absolute EE)
+                           ↓
+                   IK → joint targets → robot
+```
+
+## References
+
+- [UMI: Universal Manipulation Interface](https://umi-gripper.github.io) — Chi et al., 2024. Defines relative trajectory actions.
+- [Action Representations](action_representations) — LeRobot guide comparing absolute, relative, and delta actions.
+- [pi0 documentation](pi0) — Full pi0 configuration including `use_relative_actions`.
+- [`examples/so100_to_so100_EE/`](https://github.com/huggingface/lerobot/tree/main/examples/so100_to_so100_EE) — EE-space evaluation example this builds on.
diff --git a/examples/umi_pi0_relative_ee/convert_umi_dataset.py b/examples/umi_pi0_relative_ee/convert_umi_dataset.py
new file mode 100644
index 000000000..17bb5ffd9
--- /dev/null
+++ b/examples/umi_pi0_relative_ee/convert_umi_dataset.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Add ``observation.state`` to an existing UMI LeRobot dataset and recompute
+stats for pi0 training with relative EE actions.
+
+UMI datasets already contain ``action`` (absolute EE pose from SLAM) and
+images. This script derives ``observation.state`` from the action column
+and recomputes statistics with relative action stats.
+
+State-Action Offset:
+UMI SLAM produces a single trajectory of EE poses stored as ``action``.
+We derive ``observation.state`` from the same trajectory with a
+configurable offset:
+
+  state[t] = action[t - STATE_ACTION_OFFSET]
+
+With offset=0, state equals action at the same timestep. With offset=1,
+state is the previous timestep's action — representing where the gripper
+*arrived* (the result of the previous command), which is what the robot
+knows at decision time. Offset=1 is the typical UMI convention.
+
+For the first frame(s) of each episode where t < offset, we use the
+earliest available action (action[0]) as state.
+
+After adding state, train with standard lerobot-train:
+    lerobot-train \\
+        --dataset.repo_id=<your_dataset> \\
+        --policy.type=pi0 \\
+        --policy.use_relative_actions=true \\
+        --policy.relative_exclude_joints='["gripper"]' \\
+        --policy.pretrained_path=lerobot/pi0_base
+
+Usage:
+    python convert_umi_dataset.py
+"""
+
+from __future__ import annotations
+
+import logging
+
+import numpy as np
+
+from lerobot.datasets.dataset_tools import add_features, recompute_stats
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# ── Configuration ─────────────────────────────────────────────────────────
+
+HF_DATASET_ID = "<hf_username>/<dataset_repo_id>"
+
+# Offset between state and action indices within each episode.
+#   0 → state[t] = action[t]       (same instant)
+#   1 → state[t] = action[t-1]     (state lags by 1 step — typical for UMI)
+STATE_ACTION_OFFSET = 1
+
+# Joint names to keep absolute (not converted to relative).
+RELATIVE_EXCLUDE_JOINTS: list[str] = ["gripper"]
+
+# pi0 chunk size (for relative stats computation).
+CHUNK_SIZE = 50
+
+
+# ── Build state from action with offset ──────────────────────────────────
+
+
+def build_state_array(dataset: LeRobotDataset, offset: int) -> np.ndarray:
+    """Derive observation.state from the action column with a per-episode offset.
+
+    For each frame t in an episode:
+        state[t] = action[max(0, t - offset)]  (clamped to episode start)
+    """
+    hf = dataset.hf_dataset
+    actions = np.array(hf["action"], dtype=np.float32)
+    episode_indices = np.array(hf["episode_index"])
+    frame_indices = np.array(hf["frame_index"])
+
+    states = np.empty_like(actions)
+
+    for ep_idx in np.unique(episode_indices):
+        ep_mask = episode_indices == ep_idx
+        ep_global_indices = np.where(ep_mask)[0]
+        ep_actions = actions[ep_global_indices]
+        ep_frames = frame_indices[ep_global_indices]
+
+        sort_order = np.argsort(ep_frames)
+        ep_global_indices = ep_global_indices[sort_order]
+        ep_actions = ep_actions[sort_order]
+
+        n = len(ep_actions)
+        for local_t in range(n):
+            source_t = max(0, local_t - offset)
+            states[ep_global_indices[local_t]] = ep_actions[source_t]
+
+    return states
+
+
+def main():
+    logger.info(f"Loading dataset {HF_DATASET_ID}")
+    dataset = LeRobotDataset(HF_DATASET_ID)
+
+    if "observation.state" in dataset.features:
+        logger.warning("observation.state already exists — skipping add_features")
+        augmented = dataset
+    else:
+        logger.info(f"Building observation.state from action with offset={STATE_ACTION_OFFSET}")
+        state_array = build_state_array(dataset, offset=STATE_ACTION_OFFSET)
+
+        action_meta = dataset.features["action"]
+        state_feature_info = {
+            "dtype": "float32",
+            "shape": list(action_meta["shape"]),
+            "names": action_meta.get("names"),
+        }
+
+        augmented = add_features(
+            dataset,
+            features={
+                "observation.state": (state_array, state_feature_info),
+            },
+        )
+        logger.info("observation.state added")
+
+    logger.info("Recomputing stats with relative action statistics...")
+    recompute_stats(
+        augmented,
+        relative_action=True,
+        relative_exclude_joints=RELATIVE_EXCLUDE_JOINTS,
+        chunk_size=CHUNK_SIZE,
+    )
+
+    logger.info(f"Dataset ready at {augmented.root}")
+    logger.info(
+        "Train with:\n"
+        "  lerobot-train \\\n"
+        f"    --dataset.repo_id={augmented.repo_id} \\\n"
+        "    --policy.type=pi0 \\\n"
+        "    --policy.use_relative_actions=true \\\n"
+        f"    --policy.relative_exclude_joints='{RELATIVE_EXCLUDE_JOINTS}' \\\n"
+        "    --policy.pretrained_path=lerobot/pi0_base"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/umi_pi0_relative_ee/evaluate.py b/examples/umi_pi0_relative_ee/evaluate.py
new file mode 100644
index 000000000..a2906fd48
--- /dev/null
+++ b/examples/umi_pi0_relative_ee/evaluate.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Inference script for a pi0 model trained with **relative EE actions**.
+
+This uses the built-in ``RelativeActionsProcessorStep`` and
+``AbsoluteActionsProcessorStep`` that are already wired into pi0's
+processor pipeline when ``use_relative_actions=True``.
+
+The inference loop:
+  1. Reads joint positions from the robot.
+  2. Converts to EE pose via forward kinematics (FK).
+     This produces ``observation.state`` with the current EE pose.
+  3. The pi0 preprocessor:
+     a) ``RelativeActionsProcessorStep`` caches the raw state.
+     b) ``NormalizerProcessorStep`` normalizes state and actions.
+  4. pi0 predicts relative action chunk.
+  5. The pi0 postprocessor:
+     a) ``UnnormalizerProcessorStep`` unnormalizes.
+     b) ``AbsoluteActionsProcessorStep`` adds cached state → absolute EE.
+  6. IK converts absolute EE → joint targets → robot.
+
+Based on the so100_to_so100_EE/evaluate.py example.
+
+Usage:
+    python evaluate.py
+"""
+
+from __future__ import annotations
+
+from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig
+from lerobot.configs.types import FeatureType, PolicyFeature
+from lerobot.datasets.feature_utils import combine_feature_dicts
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.datasets.pipeline_features import aggregate_pipeline_dataset_features, create_initial_features
+from lerobot.model.kinematics import RobotKinematics
+from lerobot.policies.factory import make_pre_post_processors
+from lerobot.policies.pi0.modeling_pi0 import PI0Policy
+from lerobot.processor import (
+    RobotProcessorPipeline,
+    make_default_teleop_action_processor,
+)
+from lerobot.processor.converters import (
+    observation_to_transition,
+    robot_action_observation_to_transition,
+    transition_to_observation,
+    transition_to_robot_action,
+)
+from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig
+from lerobot.robots.so_follower.robot_kinematic_processor import (
+    ForwardKinematicsJointsToEE,
+    InverseKinematicsEEToJoints,
+)
+from lerobot.scripts.lerobot_record import record_loop
+from lerobot.types import RobotAction, RobotObservation
+from lerobot.utils.control_utils import init_keyboard_listener
+from lerobot.utils.utils import log_say
+from lerobot.utils.visualization_utils import init_rerun
+
+NUM_EPISODES = 5
+FPS = 10
+EPISODE_TIME_SEC = 60
+TASK_DESCRIPTION = "manipulation task"
+
+HF_MODEL_ID = "<hf_username>/<model_repo_id>"
+HF_DATASET_ID = "<hf_username>/<dataset_repo_id>"
+
+# EE feature keys produced by ForwardKinematicsJointsToEE
+EE_KEYS = ["x", "y", "z", "wx", "wy", "wz", "gripper_pos"]
+
+
+def main():
+    camera_config = {"wrist": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=FPS)}
+    robot_config = SO100FollowerConfig(
+        port="/dev/tty.usbmodem5A460814411",
+        id="my_awesome_follower_arm",
+        cameras=camera_config,
+        use_degrees=True,
+    )
+    robot = SO100Follower(robot_config)
+
+    policy = PI0Policy.from_pretrained(HF_MODEL_ID)
+
+    kinematics_solver = RobotKinematics(
+        urdf_path="./SO101/so101_new_calib.urdf",
+        target_frame_name="gripper_frame_link",
+        joint_names=list(robot.bus.motors.keys()),
+    )
+
+    # FK: joint observation → EE observation (produces observation.state)
+    robot_joints_to_ee_processor = RobotProcessorPipeline[RobotObservation, RobotObservation](
+        steps=[
+            ForwardKinematicsJointsToEE(
+                kinematics=kinematics_solver,
+                motor_names=list(robot.bus.motors.keys()),
+            )
+        ],
+        to_transition=observation_to_transition,
+        to_output=transition_to_observation,
+    )
+
+    # IK: EE action → joint targets
+    robot_ee_to_joints_processor = RobotProcessorPipeline[tuple[RobotAction, RobotObservation], RobotAction](
+        steps=[
+            InverseKinematicsEEToJoints(
+                kinematics=kinematics_solver,
+                motor_names=list(robot.bus.motors.keys()),
+                initial_guess_current_joints=True,
+            ),
+        ],
+        to_transition=robot_action_observation_to_transition,
+        to_output=transition_to_robot_action,
+    )
+
+    # Dataset handle for stats (used by preprocessor/postprocessor)
+    dataset = LeRobotDataset.create(
+        repo_id=HF_DATASET_ID,
+        fps=FPS,
+        features=combine_feature_dicts(
+            aggregate_pipeline_dataset_features(
+                pipeline=robot_joints_to_ee_processor,
+                initial_features=create_initial_features(observation=robot.observation_features),
+                use_videos=True,
+            ),
+            aggregate_pipeline_dataset_features(
+                pipeline=make_default_teleop_action_processor(),
+                initial_features=create_initial_features(
+                    action={f"ee.{k}": PolicyFeature(type=FeatureType.ACTION, shape=(1,)) for k in EE_KEYS}
+                ),
+                use_videos=True,
+            ),
+        ),
+        robot_type=robot.name,
+        use_videos=True,
+        image_writer_threads=4,
+    )
+
+    # Build pre/post processors from the trained model.
+    # The pi0 processor pipeline already includes:
+    #   pre:  ... → RelativeActionsProcessorStep → NormalizerProcessorStep
+    #   post: UnnormalizerProcessorStep → AbsoluteActionsProcessorStep → ...
+    # These handle the relative ↔ absolute conversion automatically.
+    preprocessor, postprocessor = make_pre_post_processors(
+        policy_cfg=policy,
+        pretrained_path=HF_MODEL_ID,
+        dataset_stats=dataset.meta.stats,
+        preprocessor_overrides={"device_processor": {"device": str(policy.config.device)}},
+    )
+
+    robot.connect()
+
+    listener, events = init_keyboard_listener()
+    init_rerun(session_name="umi_pi0_relative_ee_evaluate")
+
+    try:
+        if not robot.is_connected:
+            raise ValueError("Robot is not connected!")
+
+        print("Starting evaluate loop...")
+        for episode_idx in range(NUM_EPISODES):
+            log_say(f"Running inference, recording eval episode {episode_idx + 1} of {NUM_EPISODES}")
+
+            record_loop(
+                robot=robot,
+                events=events,
+                fps=FPS,
+                policy=policy,
+                preprocessor=preprocessor,
+                postprocessor=postprocessor,
+                dataset=dataset,
+                control_time_s=EPISODE_TIME_SEC,
+                single_task=TASK_DESCRIPTION,
+                display_data=True,
+                teleop_action_processor=make_default_teleop_action_processor(),
+                robot_action_processor=robot_ee_to_joints_processor,
+                robot_observation_processor=robot_joints_to_ee_processor,
+            )
+
+            if not events["stop_recording"] and (
+                (episode_idx < NUM_EPISODES - 1) or events["rerecord_episode"]
+            ):
+                log_say("Reset the environment")
+                record_loop(
+                    robot=robot,
+                    events=events,
+                    fps=FPS,
+                    control_time_s=EPISODE_TIME_SEC,
+                    single_task=TASK_DESCRIPTION,
+                    display_data=True,
+                    teleop_action_processor=make_default_teleop_action_processor(),
+                    robot_action_processor=robot_ee_to_joints_processor,
+                    robot_observation_processor=robot_joints_to_ee_processor,
+                )
+
+            if events["rerecord_episode"]:
+                log_say("Re-record episode")
+                events["rerecord_episode"] = False
+                events["exit_early"] = False
+                dataset.clear_episode_buffer()
+                continue
+
+            dataset.save_episode()
+    finally:
+        log_say("Stop recording")
+        robot.disconnect()
+        listener.stop()
+
+        dataset.finalize()
+        dataset.push_to_hub()
+
+
+if __name__ == "__main__":
+    main()