Compare commits

..

16 Commits

Author SHA1 Message Date
Pepijn 727ca1a92c chore(rollout): remove speculative action order fix 2026-04-24 16:24:44 +02:00
Pepijn ee737b72d0 fix(rollout): avoid oversampling observations during interpolation 2026-04-24 15:42:36 +02:00
Pepijn cf2e42f557 fix(rollout): preserve relative action chunks 2026-04-24 15:33:16 +02:00
Steven Palma eb9519eb91 fix(tests): default value RTCConfig 2026-04-22 23:06:01 +02:00
Steven Palma f29ce39f39 fix(rollout): enable default 2026-04-22 18:47:15 +02:00
Steven Palma 682d5f2f95 Merge branch 'main' into feat/lerobot-rollout 2026-04-22 16:39:18 +02:00
Steven Palma 6ee093db9a Merge branch 'main' into feat/lerobot-rollout 2026-04-22 16:38:13 +02:00
Steven Palma 4499519dbf chore(rollout): address coments + minor improvements 2026-04-22 16:37:19 +02:00
Steven Palma d70c3baf7c chore(rollout): multiple minor improvements 2026-04-22 15:18:50 +02:00
Steven Palma bff2d50dc1 docs(rollout): edit rename_map instructions 2026-04-22 13:59:55 +02:00
Steven Palma 8d1401abe3 fix(rollout): device and task propagation + feature pos + warn fps + move rename_map config 2026-04-22 12:55:22 +02:00
Steven Palma 195b777367 fix(rollout): features check 2026-04-20 18:13:37 +02:00
Steven Palma b49e4016f2 test(rollout): fix expectations 2026-04-20 16:17:23 +02:00
Steven Palma 02d8a34829 fix(docs): dagger num_episodes 2026-04-20 15:40:43 +02:00
Steven Palma 14c7a25ce4 fix(rollout) require dataset in dagger + use duration too 2026-04-20 15:34:41 +02:00
Steven Palma bc06cb44ca feat(scripts): lerobot-rollout 2026-04-20 00:45:10 +02:00
390 changed files with 9983 additions and 51203 deletions
-6
View File
@@ -382,7 +382,6 @@ jobs:
--policy.path=\"\$ROBOTWIN_POLICY\" \
--env.type=robotwin \
--env.task=\"\$ROBOTWIN_TASKS\" \
--env.max_parallel_tasks=5 \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
@@ -483,7 +482,6 @@ jobs:
--policy.path=lerobot/smolvla_robocasa \
--env.type=robocasa \
--env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
--env.max_parallel_tasks=5 \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
@@ -695,7 +693,6 @@ jobs:
--env.task=\"\$ROBOMME_TASKS\" \
--env.dataset_split=test \
--env.task_ids=[0] \
--env.max_parallel_tasks=5 \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
@@ -803,7 +800,6 @@ jobs:
--env.type=libero_plus \
--env.task=\"\$LIBERO_PLUS_SUITE\" \
--env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \
--env.max_parallel_tasks=5 \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
@@ -904,8 +900,6 @@ jobs:
--policy.path=lerobot/smolvla_vlabench \
--env.type=vlabench \
--env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
--env.episode_length=50 \
--env.max_parallel_tasks=5 \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
@@ -33,7 +33,7 @@ jobs:
github.event.workflow_run.event == 'pull_request' &&
github.event.workflow_run.conclusion == 'success' &&
github.repository == 'huggingface/lerobot'
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@9ad2de8582b56c017cb530c1165116d40433f1c6 # main
with:
package_name: lerobot
secrets:
+2 -2
View File
@@ -55,7 +55,7 @@ jobs:
github.repository == 'huggingface/lerobot'
permissions:
contents: read
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
with:
commit_sha: ${{ github.sha }}
package: lerobot
@@ -78,7 +78,7 @@ jobs:
permissions:
contents: read
pull-requests: write
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
+1 -2
View File
@@ -152,14 +152,13 @@ jobs:
BASE_VERSION="${VERSION%%-*}"
echo "Installing pre-release version $BASE_VERSION from TestPyPI..."
uv pip install \
--torch-backend cpu \
--index-url https://test.pypi.org/simple/ \
--extra-index-url https://pypi.org/simple \
--index-strategy unsafe-best-match \
"lerobot[all]==$BASE_VERSION"
else
echo "Installing release version $VERSION from PyPI..."
uv pip install --torch-backend cpu "lerobot[all]==$VERSION"
uv pip install "lerobot[all]==$VERSION"
fi
- name: Check lerobot version
run: uv run python -c "import lerobot; print(lerobot.__version__)"
+8 -8
View File
@@ -19,19 +19,19 @@ on:
workflow_dispatch:
# Runs at 02:00
# schedule:
# - cron: "0 2 * * *"
schedule:
- cron: "0 2 * * *"
env:
CLOSE_ISSUE_MESSAGE: >
This issue was closed because it has been stalled for 30 days with no activity.
This issue was closed because it has been stalled for 14 days with no activity.
Feel free to reopen if is still relevant, or to ping a collaborator if you have any questions.
CLOSE_PR_MESSAGE: >
This PR was closed because it has been stalled for 30 days with no activity.
This PR was closed because it has been stalled for 21 days with no activity.
Feel free to reopen if is still relevant, or to ping a collaborator if you have any questions.
WARN_ISSUE_MESSAGE: >
This issue has been automatically marked as stale because it has not had
recent activity (1 year). It will be closed if no further activity occurs.
recent activity (6 months). It will be closed if no further activity occurs.
Any change, comment or update to this issue will reset this count.
Thank you for your contributions.
WARN_PR_MESSAGE: >
@@ -59,10 +59,10 @@ jobs:
stale-pr-label: stale
exempt-issue-labels: never-stale
exempt-pr-labels: never-stale
days-before-issue-stale: 365
days-before-issue-close: 30
days-before-issue-stale: 180
days-before-issue-close: 14
days-before-pr-stale: 365
days-before-pr-close: 30
days-before-pr-close: 21
delete-branch: true
close-issue-message: ${{ env.CLOSE_ISSUE_MESSAGE }}
close-pr-message: ${{ env.CLOSE_PR_MESSAGE }}
-2
View File
@@ -232,8 +232,6 @@ Match the policy to the user's **GPU memory** and **time budget**. Numbers below
All policies typically train for **510 epochs** (see §7).
> **Human-facing version:** the [Compute Hardware Guide](./docs/source/hardware_guide.mdx) reuses the table below and adds a cloud-GPU tier guide and a Hugging Face Jobs pointer.
| Policy | Batch | Update (ms) | Peak GPU mem (GB) | Best for |
| ----------- | ----: | ----------: | ----------------: | ------------------------------------------------------------------------------------------------ |
| `act` | 4 | **83.9** | **0.94** | First-time users, laptops, single-task. Fast and reliable. |
-1
View File
@@ -1,4 +1,3 @@
include src/lerobot/templates/lerobot_modelcard_template.md
include src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
include src/lerobot/datasets/card_template.md
include src/lerobot/envs/metaworld_config.json
Binary file not shown.

Before

Width:  |  Height:  |  Size: 445 KiB

+8 -11
View File
@@ -58,7 +58,7 @@ action = model.select_action(obs)
robot.send_action(action)
```
**Supported Hardware:** SO100, LeKiwi, Koch, HopeJR, OMX, EarthRover, Reachy2, Gamepads, Keyboards, Phones, OpenARM, Unitree G1, reBot B601.
**Supported Hardware:** SO100, LeKiwi, Koch, HopeJR, OMX, EarthRover, Reachy2, Gamepads, Keyboards, Phones, OpenARM, Unitree G1.
While these devices are natively integrated into the LeRobot codebase, the library is designed to be extensible. You can easily implement the Robot interface to utilize LeRobot's data collection, training, and visualization tools for your own custom robot.
@@ -101,17 +101,15 @@ lerobot-train \
--dataset.repo_id=lerobot/aloha_mobile_cabinet
```
| Category | Models |
| -------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Imitation Learning** | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md), [Multitask DiT Policy](./docs/source/policy_multi_task_dit_README.md) |
| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon) |
| **VLAs Models** | [Pi0](./docs/source/pi0.mdx), [Pi0Fast](./docs/source/pi0fast.mdx), [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx), [EO-1](./docs/source/eo1.mdx), [MolmoAct2](./docs/source/molmoact2.mdx), [WALL-OSS](./docs/source/walloss.mdx) |
| **World Models** | [VLA-JEPA](./docs/source/vla_jepa.mdx) (more coming soon) |
| **Reward Models** | [SARM](./docs/source/sarm.mdx), [TOPReward](./docs/source/topreward.mdx), [Robometer](./docs/source/robometer.mdx) |
| Category | Models |
| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Imitation Learning** | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md), [Multitask DiT Policy](./docs/source/policy_multi_task_dit_README.md) |
| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon) |
| **VLAs Models** | [Pi0Fast](./docs/source/pi0fast.mdx), [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx) |
Similarly to the hardware, you can easily implement your own policy & leverage LeRobot's data collection, training, and visualization tools, and share your model to the HF Hub
For detailed policy setup guides, see the [Policy Documentation](https://huggingface.co/docs/lerobot/bring_your_own_policies). For GPU/RAM requirements and expected training time per policy, see the [Compute Hardware Guide](https://huggingface.co/docs/lerobot/hardware_guide).
For detailed policy setup guides, see the [Policy Documentation](https://huggingface.co/docs/lerobot/bring_your_own_policies).
## Inference & Evaluation
@@ -135,7 +133,6 @@ Learn how to implement your own simulation environment or benchmark and distribu
- **[Discord](https://discord.gg/q8Dzzpym3f):** Join the `LeRobot` server to discuss with the community.
- **[X](https://x.com/LeRobotHF):** Follow us on X to stay up-to-date with the latest developments.
- **[Robot Learning Tutorial](https://huggingface.co/spaces/lerobot/robot-learning-tutorial):** A free, hands-on course to learn robot learning using LeRobot.
- **[T-Shirt Folding Experiment](https://huggingface.co/spaces/lerobot/robot-folding):** An end-to-end demonstration of folding t-shirts with LeRobot.
## Citation
@@ -143,7 +140,7 @@ If you use LeRobot in your project, please cite the GitHub repository to acknowl
```bibtex
@misc{cadene2024lerobot,
author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Gallouedec, Quentin and Zouitine, Adil and Palma, Steven and Kooijmans, Pepijn and Aractingi, Michel and Shukor, Mustafa and Aubakirova, Dana and Russi, Martino and Capuano, Francesco and Pascal, Caroline and Choghari, Jade and Meftah, Khalil and Ellerbach, Maxime and Moss, Jess and Wolf, Thomas},
author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Gallouedec, Quentin and Zouitine, Adil and Palma, Steven and Kooijmans, Pepijn and Aractingi, Michel and Shukor, Mustafa and Aubakirova, Dana and Russi, Martino and Capuano, Francesco and Pascal, Caroline and Choghari, Jade and Moss, Jess and Wolf, Thomas},
title = {LeRobot: State-of-the-art Machine Learning for Real-World Robotics in Pytorch},
howpublished = "\url{https://github.com/huggingface/lerobot}",
year = {2024}
-417
View File
@@ -1,417 +0,0 @@
# Decoupled VLA Inference & Edge Control: System Design Proposal
## 1. Executive Summary
This document proposes a production-grade system for decoupling GPU-bound VLA (Vision-Language-Action) policy inference from high-frequency, CPU-bound robot control in LeRobot. The system adopts a **Model-as-a-Service (MaaS)** paradigm using **Zenoh** as the sole transport protocol, enabling multiple edge devices to be served by centralized GPU servers with minimal latency and high reliability.
An initial prototype exists in `src/lerobot/async_inference/` (gRPC-based, single-client). This proposal defines the target architecture, identifies gaps between the prototype and production requirements, documents known bugs, and establishes the design for the new system.
---
## 2. Motivation
LeRobot's standard control loop runs policy inference and robot I/O in the same process. This works for lightweight policies on local GPUs, but breaks down when:
- **The policy is too large for edge hardware** (e.g., Pi0 at ~3B parameters requires a dedicated GPU).
- **Multiple robots need the same policy** (redundant GPU allocation per robot).
- **Inference latency exceeds the control deadline** (e.g., 200ms inference on a 33ms control loop at 30 FPS).
Decoupling inference from control solves all three: the edge device runs a tight I/O loop on a CPU, while a GPU server handles inference for one or more clients.
---
## 3. Core Architectural Principles
### 3.1 Model-as-a-Service (MaaS)
Servers initialize models **once at startup** from a configuration manifest. Edge devices do **not** trigger dynamic model loading — they route to pre-warmed servers and validate compatibility via a status endpoint.
### 3.2 Multi-Tenant & Stateless Inference
A single GPU server handles multiple edge devices executing the same task. The server is stateless per inference call — `predict_action_chunk()` is a pure function with no side effects on the model. Client isolation is achieved through per-client observation slots and Zenoh key-expression routing.
> **Invariant**: `predict_action_chunk()` must remain a pure function (no mutation of `self`) for all supported policies. This is what enables safe multi-tenant sharing of a single model instance. This invariant must be documented and tested.
### 3.3 Zenoh as primary Transport
The system uses Zenoh's pub/sub model, replacing the current gRPC implementation. Zenoh provides:
- **Hierarchical key expressions** for routing (natural fit for the cluster/experiment/model/task topology).
- **Built-in discovery** (no external service discovery needed).
- **Non-blocking publish** for observations (fire-and-forget with best-effort QoS).
- **Reliable delivery** configurable per-topic (required for action chunks).
- **Shared-memory transport** for same-machine deployments (zero-copy) (if available).
### 3.4 Local Edge CPU
Edge devices rely on standard CPUs for sensor polling, image compression, payload serialization, motor control, and data logging. No edge-GPU dependency.
---
## 4. System Topology
![alt text](MaaS_async_inference_diagram.png)
- **Cluster**: A set of GPU machines. Identified by `cluster_uuid`.
- **Experiment**: A logical grouping of servers and clients. Identified by `experiment_tag`.
- **Server**: One model + one task, pre-warmed. Serves N clients for that model/task combination.
- **Client**: One robot, one task. Publishes observations, subscribes to actions.
The number of clients a single server can handle is a **user decision** based on model inference time and acceptable latency.
---
## 5. Component Specifications
### 5.1 The Edge Device (Client)
**Responsibilities:**
1. **Observation capture**: Read sensors (cameras, motors) at the control loop frequency.
2. **Image compression**: JPEG-encode RGB images before transmission.
3. **Observation publishing**: Non-blocking Zenoh put to the observation topic.
4. **Action subscription**: Zenoh callback receives action chunks, deposits into local buffer.
5. **Action execution**: Pop actions from buffer, send to robot at control frequency.
6. **Action blending**: When a new action chunk overlaps with the current buffer, blend via configurable aggregation function (weighted average, latest-only, etc.).
7. **Latency compensation**: Calculate one-way latency from RTT, discard expired initial steps of incoming action chunks.
8. **Fail-safe**: If action buffer empties, logs a warning.
9. **Data logging**: Record raw observations and executed actions to local `LeRobotDataset` storage for deferred upload.
**Threading model:**
- **Control loop thread** (main): Capture observation → deposit in outbox → pop action from buffer → send to robot → sleep to maintain frequency.
- **Zenoh action callback** (Zenoh-managed): Receives action chunks, processes RTT, trims stale steps, deposits into action buffer.
- **Observation publisher thread**: Drains the outbox, compresses images, serializes, publishes via Zenoh.
> **Design note**: The current prototype blocks on `send_observation` inside the control loop (BUG-1, see Section 9). The new design decouples observation publishing from the control loop entirely, using a separate thread and Zenoh's non-blocking put.
### 5.2 The Inference Server (GPU Pod)
**Responsibilities:**
1. **Model pre-warming**: Load model and processor pipelines at startup from config manifest (including expected clients & policy parameters).
2. **Status publishing**: Expose model capabilities (policy type, expected camera names, resolutions, action dimensions) via Zenoh queryable.
3. **Observation subscription**: Subscribe to observation topics for all clients of this model/task. Maintain per-client observation slots (newest-only semantics).
4. **Inference**: Single inference thread processes observations sequentially (round-robin across clients). Calls `policy.predict_action_chunk()`.
5. **Action publishing**: Publish action chunks to per-client action topics with reliable QoS.
> **Thread safety**: PyTorch's `model.forward()` is not guaranteed thread-safe. Inference will be sequential, latency is mostly about the capabilities of the server to serve multiple requests.
---
## 6. Zenoh Routing & Key Expressions
### 6.1 Key Expression Schema
```
[cluster_uuid] / [experiment_tag] / [model_id] / [model_version] / [application_tag] / [client_uuid] / [topic]
```
**Example key expressions:**
| Key Expression | Direction | Purpose |
| ------------------------------------------------ | ----------------- | ---------------------------------- |
| `jupiter/fabio2/pi0/v1/cookie/robot_a4b9/obs` | Client → Server | Observation payload |
| `jupiter/fabio2/pi0/v1/cookie/robot_a4b9/action` | Server → Client | Action chunk |
| `jupiter/fabio2/pi0/v1/cookie/*/obs` | Server subscribes | All observations for pi0/v1/cookie |
| `jupiter/fabio2/pi0/v1/cookie/status` | Server publishes | Model capabilities (queryable) |
### 6.2 QoS Configuration
| Topic | Reliability | Rationale |
| -------- | ----------- | -------------------------------------------------------------------- |
| `obs` | Best-effort | Dropping stale observations is expected behavior. |
| `action` | Reliable | Every action chunk must be delivered; loss causes action starvation. |
| `status` | Reliable | Client needs accurate capability info before starting. |
### 6.3 Discovery Flow
0. Server goes up with the static configuration.
1. Client constructs its target key prefix: `cluster/experiment/model/version/task/`.
2. Client queries `cluster/experiment/model/version/task/status` (Zenoh queryable).
3. Server responds with its capabilities (expected camera names, image resolutions, action dimensions, model metadata).
4. Client validates its own configuration against server capabilities.
5. On match: client starts publishing observations and subscribing to actions.
6. On mismatch: client logs an error and refuses to start.
No dynamic client discovery for now.
---
## 7. Message Schema
### 7.1 Observation Payload (Client → Server)
| Field | Type | Purpose |
| ------------- | ------------------ | ----------------------------------------------------------- |
| `seq_id` | `uint64` | Incrementing ID for causality tracking and RTT computation. |
| `client_uuid` | `string` | Identifies the sending client. |
| `state` | `bytes` | Proprioceptive state vector (`numpy.tobytes()`). |
| `images` | `dict[str, bytes]` | JPEG-compressed camera images, keyed by camera name. |
| `task` | `string` | Natural-language task instruction (for VLA conditioning). |
### 7.2 Action Payload (Server → Client)
| Field | Type | Purpose |
| -------------------- | --------- | --------------------------------------------------------------- |
| `response_to_seq_id` | `uint64` | Echoes the observation `seq_id` this action corresponds to. |
| `inference_time_ms` | `float32` | Server-side compute duration (for edge RTT math). |
| `actions` | `bytes` | Action chunk as numpy array bytes (`(chunk_size, action_dim)`). |
### 7.3 Status Payload (Server, Queryable)
| Field | Type | Purpose |
| ----------------------- | ------------------- | ------------------------------------------ |
| `model_id` | `string` | Policy identifier (e.g., `pi0`). |
| `model_version` | `string` | Model version or checkpoint path. |
| `expected_cameras` | `dict[str, (H, W)]` | Expected camera names and shapes. |
| `action_dim` | `int` | Dimensionality of the action space. |
| `max_actions_per_chunk` | `int` | Maximum chunk size the model supports. |
| `observation_features` | `dict` | Full feature specification for validation. |
### 7.4 Serialization Format
**MessagePack** for all structured metadata (compact, fast, cross-language). Image payloads are raw JPEG bytes embedded in the MessagePack structure. State vectors use `numpy.tobytes()` with shape/dtype metadata for zero-copy reconstruction.
**No pickle.** The current prototype uses `pickle.dumps`/`pickle.loads` throughout, which allows arbitrary code execution. This is replaced entirely.
---
## 8. Latency Compensation
### 8.1 RTT Calculation
The edge device tracks in-flight observations:
```python
in_flight: dict[int, float] = {} # seq_id -> time.perf_counter() at send
# On send:
in_flight[seq_id] = time.perf_counter()
# On receive action chunk:
rtt = time.perf_counter() - in_flight[response_to_seq_id]
# delete older keys than the one received
```
> **Important**: Delete only the exact `response_to_seq_id` key from `in_flight`, not all keys `<= response_to_seq_id`. With Zenoh's best-effort transport, messages can arrive out of order. Clearing earlier keys would make their RTT unmeasurable.
### 8.2 Stale Action Trimming
When an action chunk arrives, the edge calculates how many initial steps have already expired:
```python
expired_steps = int(rtt / environment_dt)
valid_actions = action_chunk[expired_steps:]
```
The valid actions are then blended into the action buffer using the configured aggregation function.
### 8.3 Edge Cases
| Scenario | Behavior |
| -------------------------------------- | -------------------------------------------------------------------------------------- |
| **First observation** (no RTT history) | Apply all action steps without trimming. |
| **Dropped observations** | Server infers on next received observation. No special handling needed. |
| **Dropped action chunks** | Edge continues executing current buffer. If buffer empties, warn & hold last position. |
| **Server crash** | Edge exhausts buffer, holds position, warns & re-validates via status query. |
> **Assumption**: All currently supported robots are position-controlled (SO100, SO101, OMX). For velocity-controlled robots, the fail-safe must send zero-velocity instead of holding position. This should be configurable per-robot.
---
## 9. Known Bugs in Current Prototype
These issues exist in `src/lerobot/async_inference/` and must be addressed in the new implementation.
### BUG-1: `send_observation` Blocks the Control Loop (Critical)
**Location**: `robot_client.py:207`
`self.stub.SendObservations(observation_iterator)` is a synchronous gRPC call inside the 33ms control loop. For multi-camera observations (several MB after pickle), this consumes 10-20ms on the network, leaving no headroom for sensor capture and motor commands. The robot stutters.
**Resolution in new design**: Observation publishing is moved to a dedicated thread. Zenoh's `session.put()` is non-blocking by default. The control loop only deposits observations into a local outbox.
### BUG-2: Race Condition in Action Queue Aggregation (Correctness)
**Location**: `robot_client.py:236-267`
The lock on `self.action_queue` is acquired to read `internal_queue = self.action_queue.queue` (a reference to the internal deque), then **released** at line 238. The aggregation logic iterates over this reference outside the lock. Meanwhile, the control loop thread can `get_nowait()` from the same queue, mutating the deque during iteration. At line 267, the entire queue is replaced, but actions popped between 238-267 are silently lost.
**Fix**: Either hold the lock for the entire aggregation, or `list(self.action_queue.queue)` to copy contents before releasing.
### BUG-3: No RPC Deadlines (Reliability)
**Location**: `robot_client.py:278`
`GetActions` blocks indefinitely if the server hangs (GPU OOM, deadlock). The retry policy handles `UNAVAILABLE` but not a hung connection.
**Resolution in new design**: The polling `GetActions` pattern is replaced by Zenoh subscription callbacks. The client needs a watchdog timer or check when action queue is empty: if no actions are received for `T` seconds, trigger re-validation via the status service.
### BUG-4: Similarity Check Ignores Images (Correctness for VLAs)
**Location**: `helpers.py:280-297`
`observations_similar()` + `must_go` is a workaround for current architecure limitations to avoid filling up the server queue the first seconds of the task & the robot remaining idle.
**Resolution in new design**: the server always processes the latest observation per client in its inference loop, and doesn't need similarity gating at all. The client can always push.
---
## 10. Gaps Between Prototype and Target Architecture
### 10.1 Critical (Must Address)
| # | Gap | Current State | Target State |
| --- | ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| G1 | **Single-client server** | One `observation_queue(maxsize=1)`, one `last_processed_obs`, one `_predicted_timesteps`. `_reset_server()` flushes all state on any new connection. | Per-client state (`ClientState` dataclass) keyed by `client_uuid`. Zenoh key-expression routing provides client isolation. |
| G2 | **Dynamic model loading** | Client sends `RemotePolicyConfig` → server calls `from_pretrained()` on demand. | Server loads models at startup from config manifest. `SendPolicyInstructions` RPC eliminated. Client validates via status query. |
| G3 | **gRPC transport** | Entire `transport/` directory: proto definitions, generated stubs, chunking utils. 4 RPCs: `Ready`, `SendPolicyInstructions`, `SendObservations`, `GetActions`. | Zenoh pub/sub. Client publishes obs, subscribes to actions. Server subscribes to obs, publishes actions. Dispatching via key expressions. |
| G4 | **Pickle serialization** | `pickle.dumps`/`pickle.loads` throughout (arbitrary code execution risk, `# nosec` suppression). | MessagePack for structured metadata + raw JPEG bytes for images + `numpy.tobytes()` for state vectors. |
### 10.2 Important
| # | Gap | Current State | Target State |
| --- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------- |
| G5 | **No RTT/latency compensation** | No `seq_id`, no `response_to_seq_id`, no `inference_time_ms`. Timestamps use `time.time()` (unreliable across machines). | Edge-local `perf_counter` + echoed `seq_id` + server inference duration. Stale action step trimming. |
| G6 | **No hierarchical routing** | Direct gRPC channel to `host:port`. | Zenoh key expressions: `cluster/experiment/model/version/task/client/topic`. |
| G7 | **No data logging** | `control_loop` has access to obs and actions but doesn't persist them. | Edge records via `LeRobotDataset` (`build_dataset_frame` + `dataset.add_frame`). |
| G8 | **No authentication** | `grpc.insecure_channel`. | Zenoh TLS + access control lists on key expressions. |
| G9 | **ProcessorPipeline divergence** | Server reimplements observation prep in `helpers.py` (custom `resize_robot_observation_image` with `F.interpolate` bilinear). Diverges from standard `RobotProcessorPipeline`. | Use the standard `RobotProcessorPipeline` + `build_dataset_frame` to ensure behavioral equivalence between record and async inference. |
### 10.3 Nice-to-Have
| # | Gap | Current State | Target State |
| --- | ------------------------------------- | --------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| G11 | **No status/discovery service** | Bare `Ready()` ping. | Zenoh queryable at `cluster/exp/model/version/task/status`. |
| G12 | **No monitoring** | `FPSTracker` + `logging.debug`. | Structured metrics via Zenoh telemetry topics. Wildcard subscriptions for centralized monitoring. |
| G13 | **No entry points** | Module-level `__main__`. | `lerobot-policy-server` and `lerobot-robot-client` console scripts in `pyproject.toml`. |
| G14 | **Ratio-based observation threshold** | `chunk_size_threshold` (0-1 ratio of queue fill). Scales oddly with different `actions_per_chunk` values. | Absolute time threshold: `buffer_time_s` calibrated to observed RTT. Send observation when `queue_size * environment_dt < buffer_time_s`. |
---
## 11. Design Decisions & Rationale
### 11.1 Why Zenoh Over gRPC
| Aspect | Zenoh | gRPC |
| ------------------------- | -------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- |
| Communication model | Pub/sub — natural fit for "client publishes obs, server publishes actions" | Request/response — requires polling (`GetActions` loop) or bidirectional streaming |
| Multi-tenant routing | Hierarchical key expressions provide built-in per-client topic isolation | Requires manual per-client channel/stream management |
| Discovery | Built-in discovery | Requires external service (mDNS, Consul, etc.) |
| Observation publishing | Non-blocking put (fire-and-forget) — resolves BUG-1 automatically | Synchronous stream-unary call — blocks the control loop |
| Same-machine optimization | Shared-memory transport (zero-copy) | Loopback TCP |
| Telemetry | Wildcard subscriptions (`+/+/+/+/+/metrics`) | Requires separate monitoring infrastructure |
**Tradeoffs of going Zenoh-only:**
- Smaller community, less tooling for monitoring/tracing vs. gRPC's mature ecosystem.
- No built-in schema enforcement (Zenoh sends raw bytes) — serialization correctness is entirely on us.
- Default QoS is best-effort (like UDP). Must explicitly configure reliable delivery for action chunks.
- `zenoh-python` bindings are less battle-tested than `grpcio`. Needs integration testing under network stress.
### 11.2 Why Single Inference Thread (Not Batching)
True GPU batching across clients requires collecting observations from multiple clients and running a single forward pass. This is difficult because:
- Clients send observations at different times — waiting to batch adds latency.
- Different clients may have slightly different image resolutions.
- Error in one client's observation shouldn't affect others.
**Decision**: Start with sequential processing (single inference thread, round-robin across clients). Profile GPU utilization.
### 11.4 Why MessagePack (Not Protobuf, Not FlatBuffers)
- **Protobuf**: Strong schema enforcement but heavier toolchain (proto compilation, generated code). Since we're dropping gRPC, the protobuf dependency becomes unnecessary overhead.
- **MessagePack**: Fast, compact, schema-less (enforced by application), excellent Python support (`msgpack` package), good for nested dicts with mixed types. Natural fit for observation/action payloads.
Images are embedded as raw JPEG bytes within the MessagePack structure. State vectors use `numpy.tobytes()` with shape/dtype metadata for zero-copy reconstruction.
### 11.5 Action Aggregation Strategy
When a new action chunk overlaps with the existing buffer, the overlapping timesteps must be blended. The current prototype supports configurable aggregation functions:
| Function | Formula | Character |
| ------------------ | ----------------------- | ------------------------------------------ |
| `weighted_average` | `0.3 * old + 0.7 * new` | Smooth transitions, favors new predictions |
| `latest_only` | `new` | Most responsive, can cause discontinuities |
| `average` | `0.5 * old + 0.5 * new` | Equal weight |
| `conservative` | `0.7 * old + 0.3 * new` | Smooth, slow to adapt |
Ultimately, this should be the user's decision. Default to `weighted_average`. The goal of async is not to do temporal ensembling, but to provide a solution when we want to decouple inference and execution.
---
## 12. Configuration
### 12.1 Server Configuration (Manifest)
Servers are configured via a YAML manifest that declares which models to pre-warm & clients to serve:
```yaml
cluster_uuid: jupiter
experiment_tag: fabio2
server:
- model_id: pi0
model_version: v1
pretrained_path: lerobot/pi0-cookie-v1
application_tag: cookie
device: cuda:0
fps: 30
endpoint: tcp/192.168.1.50:7447
clients:
- client_uuid: cookie-worker-4269
```
### 12.2 Client Configuration
Clients are configured via draccus dataclass (CLI-compatible):
```python
@dataclass
class AsyncClientConfig:
# Zenoh routing
cluster_uuid: str
experiment_tag: str
model_id: str
model_version: str
application_tag: str
client_uuid: str
endpoint: str
# Robot
robot: RobotConfig
# Control
fps: int = 30
actions_per_chunk: int = 50
aggregate_fn_name: str = "weighted_average"
jpeg_quality: int = 90
# Fail-safe
max_empty_cycles_before_warning: int = 10
# Datset recording
dataset_repo_id: str | None = None # None = no logging
# Task
task: str = ""
```
---
## 14. Data Logging Integration
The client records observations and executed actions into a local `LeRobotDataset` for deferred upload to the training dataset:
```python
# In control_loop, after executing an action:
if self.dataset is not None:
frame = build_dataset_frame(
self.dataset.features,
processed_observation,
prefix=OBS_STR,
)
frame["action"] = executed_action_tensor
self.dataset.add_frame(frame)
```
-498
View File
@@ -1,498 +0,0 @@
# Decoupled VLA Inference & Edge Control v2: Async Network Inference for `lerobot-rollout`
> **Status**: supersedes the v1 proposal in full. v1 was written against the standalone `src/lerobot/async_inference/` prototype, before `lerobot-rollout` existed. This revision re-grounds the design in the current codebase, keeps v1's decisions that survived contact with it (marked **KEPT** throughout), reverses the ones that didn't, and adds the safety, multi-tenancy, and operations specifications v1 lacked.
## 1. Executive Summary
This document specifies a production-grade system for decoupling GPU-bound policy inference from high-frequency robot control, targeting power users running **hundreds of robots** against centralized GPU clusters. The system keeps v1's **Model-as-a-Service (MaaS)** paradigm and **Zenoh** transport, but changes the integration architecture fundamentally:
- **The client is not a standalone CLI.** It is `--inference.type=remote`, a new `InferenceEngine` backend inside `lerobot-rollout` (`src/lerobot/rollout/inference/`). Every rollout strategy (base, sentry, highlight, dagger, episodic) gets network inference for free — including dataset recording, DAgger pause/resume, Rerun visualization, and safe teardown.
- **The client is weightless.** No policy weights, no policy processors on the edge. `--policy.path` resolves to a config-only `PreTrainedConfig` (no weight download) used for pre-flight validation and action ordering.
- **The server is stateless per request.** All RTC chunk state (leftover prefixes, latency tracking, delay computation) lives client-side in the existing `ActionQueue`/`LatencyTracker` machinery — the client ships prefixes + a delay hint with each observation. A server crash loses zero control state; reconnects and horizontal scaling are trivial.
- **Multi-tenancy is engineered, not assumed.** The real hazards are stateful processor pipelines and episode-scoped policy state — not `predict_action_chunk` purity (which holds for ACT/Pi0/Pi0.5/SmolVLA but _not_ diffusion). The server uses per-session processor instances, a chunk-stateless allowlist, and an exclusive serving mode for policies that need it.
- **The legacy module dies.** `src/lerobot/async_inference/` (~1,900 lines, pickle-over-gRPC, single-client, four confirmed bugs) is deleted in the same PR that lands the new backend. No deprecation cycle: the module is experimental, its CLI undocumented in the main flow, and every config field has a mapped successor (§13.4).
---
## 2. Motivation (unchanged from v1) — **KEPT**
LeRobot's standard control loop runs policy inference and robot I/O in the same process. This breaks down when:
- **The policy is too large for edge hardware** (Pi0-class models need a dedicated GPU).
- **Multiple robots need the same policy** (redundant GPU allocation per robot).
- **Inference latency exceeds the control deadline** (e.g. 150 ms inference on a 33 ms control tick).
Decoupling solves all three: the edge runs a tight CPU loop; a GPU server performs inference for N clients.
What changed since v1: the _local_ version of this decoupling already shipped. `RTCInferenceEngine` (`src/lerobot/rollout/inference/rtc.py`) runs inference in a background thread against a thread-safe `ActionQueue` with latency-aware chunk merging. **The network system is that same architecture with the thread boundary replaced by a network boundary.** This is the design's central simplification: reuse, don't reinvent.
---
## 3. Gap Analysis: v1 Proposal vs. Modern Codebase
| Topic | v1 assumed | Modern reality | Verdict |
| ----------------------------------------- | --------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | --------------------------------------- |
| Client architecture | Standalone robot-client CLI (§5.1 of v1) | `InferenceEngine` ABC seam in `lerobot-rollout` (`rollout/inference/base.py`); strategies are backend-agnostic | **Superseded** — backend, not CLI |
| Chunk blending | Configurable aggregation zoo (`weighted_average`, …) | `ActionQueue` replace-with-delay-trim (RTC) / append (non-RTC) (`policies/rtc/action_queue.py:147-217`) | **Superseded** — drop blending entirely |
| Latency compensation | Hand-rolled RTT trim (`expired_steps = int(rtt/dt)`, v1 §8.2) | `ActionQueue.merge(..., real_delay, idx_before)` + `LatencyTracker` already do this, validated | **Superseded** |
| Multi-tenancy invariant | "`predict_action_chunk()` pure ⇒ safe to share" | Processor state + episode-scoped policy state are the real hazards (§7) | **Incomplete** — fixed in §8.3 |
| Data logging | Client-side `build_dataset_frame` + `add_frame` sketch (v1 §14) | Recording strategies (sentry/episodic/dagger) already log obs + executed actions | **Superseded** — free via rollout |
| MaaS pre-warm, no dynamic loading | ✓ | Still right; legacy `SendPolicyInstructions` is a pickle/RCE + capacity-planning disaster | **KEPT** |
| JPEG observation compression | ✓ | Still right (§10.1) | **KEPT** |
| Status/capability validation before start | ✓ (Zenoh queryable) | Still right; extended into a hard sync-safety contract (§8.4) | **KEPT, extended** |
| Time-based send threshold (v1 G14) | ✓ | Adopted as `buffer_time_s` | **KEPT** |
| Zenoh pub/sub data plane | ✓ | Confirmed; QoS corrected (§6.3), control plane moved to queryables, liveliness added | **KEPT, hardened** |
| MessagePack serialization | ✓ | Endorsed (zenoh's `ext` serializer cannot encode numpy); must be version-gated (§10.4) | **KEPT, with schema discipline** |
| QoS table (v1 §6.2) | "obs best-effort, actions reliable" | Conflates transport reliability with congestion control; BLOCK on actions is dangerous | **Revised** (§6.3) |
| Bugs BUG-1…BUG-4, gaps G1…G14 | Listed as work items | Every one resolved _structurally_ by this design (§13.5 mapping) | **Resolved by design** |
---
## 4. Critical Pushbacks on v1
Each pushback: claim → evidence → consequence for this design.
**P1 — A standalone client duplicates `lerobot-rollout`.**
v1 §5.1 assigns the client: observation capture, action execution at frequency, fail-safe, data logging. Every one of those is already owned by rollout strategies and `send_next_action` (`rollout/strategies/core.py:269-304`), which tolerates `None` actions, runs the interpolator, and routes through the canonical robot processors. A standalone client re-implements loop timing, recording, DAgger UX, Rerun, and teardown safety — and then drifts. _Consequence_: the client is `RemoteInferenceEngine`, registered as `--inference.type=remote` next to `sync` and `rtc`.
**P2 — The aggregation-function zoo fabricates actions no policy predicted.**
`0.3*old + 0.7*new` produces hybrid actions that exist in no policy's output distribution; the logged action becomes unexplainable (bad for the reproducibility story) and the implementation hosted a real lock-release race (BUG-2, `async_inference/robot_client.py:236-267`). RTC's prefix-conditioned chunk generation is the principled mechanism for smooth chunk transitions; plain append covers non-RTC chunking. _Consequence_: `ActionQueue` replace/append are the only two merge semantics. The zoo is deleted.
**P3 — "predict_action_chunk pure ⇒ multi-tenant safe" is incomplete.**
Verified in-tree: (a) `RelativeActionsProcessorStep` caches `_last_state` at preprocess (`processor/relative_action_processor.py:131`) and the postprocessor reads it back (`:189`) — a shared pipeline across clients is a race; (b) `DiffusionPolicy.predict_action_chunk` reads `self._queues`, which only `select_action` populates (`policies/diffusion/modeling_diffusion.py:90-108`) — it is **not** chunk-stateless; (c) SAC/SARM have no `predict_action_chunk` at all. _Consequence_: per-session processor instances (mandatory), a chunk-stateless allowlist, `serving_mode: exclusive` for diffusion-family, refusal at startup for SAC/SARM, and `policy.reset()` is **never** called in shared mode (§8.3).
**P4 — v1 re-derives latency compensation that already exists, on top of broken clocks.**
v1 §8 specifies an in-flight RTT dict and manual stale-step trimming. `ActionQueue.merge(original, processed, real_delay, idx_before)` already trims `real_delay` stale steps and cross-validates against actions consumed in flight (`action_queue.py:219-246`). Worse, the legacy code compares wall clocks across machines (`robot_client.py:420` stamps `time.time()` "to compare timestamps across client and server"; `policy_server.py:178` compares it) — NTP skew is the same order as the latencies being measured. _Consequence_: the **monotonic iron rule** (§11): instants never cross machines; client timestamps are opaque echoed tokens; servers report only durations. `delay_steps = ceil((rtt + inference)/dt)` is computed client-side from client-local `perf_counter` samples and shipped per request.
**P5 — One-in-flight per client is a correctness requirement, not a tuning choice.**
At send time the client snapshots `idx_before = queue.get_action_index()` and the leftover prefixes; `merge` validates against them. Two in-flight requests carry conflicting snapshots — the second merge corrupts both RTC replace mode and append mode. The local RTC thread is also strictly one-inference-at-a-time; one-in-flight preserves exact parity. _Consequence_: the worker publishes one observation, waits for its chunk (or timeout), then sends the next. v1 §8.1's out-of-order in-flight dict is dead weight; a late chunk is accepted only if it answers the _latest_ outstanding `seq_id`, otherwise dropped.
**P6 — v1's QoS table conflates transport reliability with congestion behavior.**
"Reliable delivery for actions" sounds right but the dangerous knob is congestion control: a publisher configured `BLOCK` on the action topic can stall the **server's** publish path on one robot's dead uplink (Zenoh blocks up to `wait_before_close`, then may close the transport). A dropped action chunk is _recoverable by design_ — the client's queue keeps the robot moving and the next chunk replaces it. _Consequence_ (§6.3): actions = `reliability=RELIABLE` (hop-level) + `congestion_control=DROP` + `express=True` + `priority=INTERACTIVE_HIGH`; observations = `DROP` + `DATA`. If WAN loss proves material, upgrade the action topic to Zenoh Advanced Pub/Sub (cache + recovery, zenoh ≥ 1.5) rather than BLOCK.
**P7 — Schema-less MessagePack invites silent version drift across a 300-robot fleet.**
msgpack stays (zenoh's `ext` serializer cannot encode numpy/dataclasses, and the team's choice stands), but naked msgpack dicts across heterogeneous fleet versions fail at runtime, on the robot. _Consequence_ (§10.4): a packed little-endian **attachment header** (`schema_version`, `seq_id`, `episode_id`, `client_mono_ns` — the rmw_zenoh pattern) so routing/correlation never deserializes the body; `schema_version` negotiated at the session handshake; additive-only evolution; golden codec tests. Protobuf-over-ZBytes is the documented fallback if drift bites in practice.
**P8 — "Deterministic rollout reproducibility" is unattainable on real robots.**
No seed controls hardware, sensor noise, or network jitter; RTC's latency-driven trimming is inherently timing-dependent. _Consequence_: the contract is **fully logged + replayable** (§12): recording strategies already persist observations and executed actions; the remote engine adds `(session_id, seq_id, episode_id)` provenance so client datasets join server audit logs mechanically.
**P9 — v1 has no safety specification.**
"Log a warning when the buffer empties" is not a fail-safe for a 300-robot fleet. _Consequence_ (§9): a staleness bound (`max_action_age_s` — never execute an action older than X relative to its source observation), an explicit fallback ladder (`hold` / `repeat_last` / `zero` — zero-command required for future velocity-controlled robots), and a DEAD state that triggers the existing strategy shutdown path (return-to-initial-pose, disconnect) via the same `shutdown_event` mechanism RTC uses (`rtc.py:359-360`).
**P10 — Capacity must be formula-driven, not "a user decision".**
v1 §4 says clients-per-server "is a user decision". With `t` = server time per request, `r` = per-client request rate, `H` = RTC execution horizon, `dt` = control period:
`N_max = min( 0.8 / (r·t), (H·dt/2 RTT_net) / t )`
→ ACT @ 20 ms, 1 Hz: ~40 clients/GPU. Pi0 @ 150 ms, 1 Hz: ~5 clients/GPU. 300 robots on Pi0 ≈ 60 GPU pods. _Consequence_: the manifest carries `max_sessions`; the server rejects session opens beyond it (with current load in the reply) so clients retry another replica. Micro-batching is deferred — blocked on a real API issue (`predict_action_chunk` takes a _scalar_ `inference_delay`; batched clients have different delays) — behind a `Scheduler` seam so it can land later without redesign (§8.5).
**P11 — Discovery ≠ multicast.**
Zenoh's multicast scouting does not cross WAN, NAT, or most k8s CNIs. _Consequence_: multicast scouting disabled; clients use static `connect.endpoints` (DNS name of the router) + gossip; presence and liveness come from Zenoh **liveliness tokens** (§6.4), not discovery. "Discovery" for a robot fleet is configuration.
---
## 5. System Topology
![MaaS topology](MaaS_async_inference_diagram.png)
_(Diagram unchanged from v1 — the topology survives; transport/QoS/session details in it are superseded by §6.)_
- **Router tier**: one or more `zenohd` routers (k8s Deployment + Service, TLS on 7447). Robots **dial out** to the router (NAT-friendly: labs only need outbound 7447/443). GPU servers join as peers via cluster DNS.
- **Server**: one process = one `(model_repo, revision, dtype, device)` on one GPU, pre-warmed from a YAML manifest (**KEPT** from v1, amended: `pin_task: bool` — VLA prompts may vary per session unless pinned).
- **Client**: one robot running `lerobot-rollout --inference.type=remote`. Weightless: config-only policy metadata.
- **Identity**: `client_uuid` per robot; `session_id` per connection epoch; both in every log line on both sides.
---
## 6. Zenoh Design
All Zenoh claims below were verified against zenoh / zenoh-python 1.x (eclipse-zenoh 1.9.0). Pin: `eclipse-zenoh>=1.9,<2.0`; keep `zenohd` on the same minor as the Python binding. Wheels cover manylinux x86_64/aarch64/armv7l/armv6l + macOS — Raspberry Pi edge clients are covered.
### 6.1 Key-expression schema
```
@lerobot/<model_id>/<revision>/<task_slug>/<client_uuid>/obs client → server
@lerobot/<model_id>/<revision>/<task_slug>/<client_uuid>/action server → client
@lerobot/<model_id>/<revision>/<task_slug>/status queryable (capabilities)
@lerobot/<model_id>/<revision>/<task_slug>/session queryable (open/validate)
@lerobot/<model_id>/<revision>/<task_slug>/<client_uuid>/reset queryable (episode boundary)
@lerobot/<model_id>/<revision>/<task_slug>/<client_uuid>/alive liveliness token (client)
@lerobot/<model_id>/<revision>/<task_slug>/server/alive liveliness token (server)
```
Rules (hard, enforced by a `sanitize_keyexpr()` helper):
- Root at the **verbatim chunk** `@lerobot` — verbatim chunks are only matched by identical chunks, so third-party `**` subscribers on a shared router can never scrape the tree.
- Sanitize every user-supplied segment (model ids, task strings, uuids): non-empty, no `* $ ? # /`, no leading/trailing/double `/`. A task string containing `/` must be slugified before it becomes a key chunk.
- Server subscribes with a **single-depth** wildcard (`.../*/obs`) — never `**` (it would also match `status`, `alive`, …).
- v1's `cluster/experiment` prefix segments are dropped from the key schema; they return as free-form `tags` metadata in the session handshake (telemetry/labeling, not routing). Routing topology belongs to deployment (which router you dial), not to key depth.
### 6.2 Data plane vs. control plane (the rmw_zenoh split)
- **Data plane = pub/sub** (KEPT from v1): observations up, action chunks down, correlated by `seq_id` in **attachments** (§10.4). Pub/sub rather than query-per-inference because: a timed-out query's late reply is _dropped by the transport_ (wasted inference), whereas a late pub/sub chunk is still mergeable if it answers the latest outstanding seq; and pub/sub leaves room for server-initiated messages (drain notices). The one-in-flight discipline (P5) is enforced in the client worker, not by the transport.
- **Control plane = queryables** (request/reply with explicit timeouts; the pattern rmw*zenoh uses for ROS 2 services): `status` (pre-flight capability fetch, 2 s timeout), `session` (open/validate → ack with capabilities + `session_id`), `reset` (episode boundary — \_acknowledged*, so episodic strategies know the server-side episode state is clean). Always pass an explicit `timeout` to `session.get()` — the config default is 10 s, far too long for our watchdogs.
- **Episode ordering**: under one-in-flight there is no obs/reset race window in the data plane, but as belt-and-braces the first observation of each episode also carries `episode_start=True` + the new `episode_id` in its header.
### 6.3 QoS (revised from v1 §6.2 — see P6)
| Topic | reliability | congestion_control | express | priority | Why |
| ------------------ | ----------- | ---------------------- | -------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `obs` | default | **DROP** | false | DATA | Intentional drop already happened at the client's one-slot holder; if the uplink stalls, dropping a frame protects the control loop. |
| `action` | RELIABLE | **DROP** (never BLOCK) | **true** | INTERACTIVE_HIGH | Hop-level reliability over TCP; express skips batching for the small (450 KB) latency-critical payload; DROP so one dead robot uplink can never stall the server's publish path. Chunk loss is recoverable: the client buffer rides through it. |
| control queryables | RELIABLE | default | — | — | Correctness over latency; explicit timeouts bound them. |
Upgrade path if WAN chunk loss proves material: `AdvancedPublisher`/`AdvancedSubscriber` (zenoh ≥ 1.5) with a small cache + heartbeat-based recovery **on the action topic only**. Hop-by-hop RELIABLE is not end-to-end reliability — Zenoh has no broker persistence; a disconnected subscriber's data is gone. The design assumes this (client state machine, §9).
### 6.4 Liveliness (presence + watchdogs)
- Client declares a liveliness token on `.../<client_uuid>/alive`. The server liveliness-subscribes with `history=True`: token appear → ensure session state; token drop → GC the session (mailbox, processor instances) after a grace period.
- Server declares `.../server/alive`. The client liveliness-subscribes: on drop → treat as RECONNECTING (§9), hold/fallback per config, re-run the `status`/`session` handshake when the token reappears.
- Tune the transport lease down from its default so ungraceful-death detection is seconds, not tens of seconds (verify the default in the pinned version; it is config `transport/link/tx/lease`).
- Liveliness cannot detect a _hung-but-connected_ server. The client's per-request timeout (`request_timeout_s`) is the authoritative watchdog — this is the structural fix for legacy BUG-3 (no deadlines on `GetActions`).
### 6.5 Threading constraints (zenoh-python facts that shape both processes)
- **No asyncio API** in zenoh-python — both client and server are thread-based. This matches the existing RTC engine pattern exactly.
- Each callback-based subscriber spawns a dedicated Python thread; **blocking Zenoh calls inside callbacks are disallowed**. Callbacks must be deposit-only (write a slot, set an event, return).
- Channel handlers (`FifoChannel`, `RingChannel`) are Rust-side; `try_recv()` polls without spawning Python threads. `RingChannel(1)` is native latest-only semantics.
- No zero-copy path for our payloads (SHM API is `@_unstable` and same-host-only; `ZBytes` copy behavior undocumented). At ~200 KB × a few Hz per robot, one memcpy is irrelevant.
### 6.6 Router deployment
- `zenohd` official image as a k8s Deployment (1N replicas; routers mesh and reroute around failures) behind a `LoadBalancer`/`NodePort` Service exposing TLS 7447. No official Helm chart exists — roll-your-own manifests.
- `scouting.multicast.enabled: false`; `scouting.gossip.enabled: true`; clients/servers use static `connect.endpoints`.
- **Auth**: mTLS per robot (`transport.link.tls` with `enable_mtls`) + router **ACL** keyed on `cert_common_names`: a robot's cert may only `put` to `@lerobot/**/<its-uuid>/obs` and receive on `.../<its-uuid>/action`. Caveat (flagged): ACL config reloads require a router restart — plan cert/ACL changes as rolling router restarts.
- Security review input: the third-party Zenoh protocol security analysis (Census Labs, 2025) should be read before exposing 7447 publicly.
---
## 7. The Statelessness Boundary (the load-bearing section)
**Where the network cut goes.** The local RTC pipeline is:
```
obs (robot-processed dict)
→ build_dataset_frame(hw_features, obs, "observation") CLIENT (cheap, hardware-coupled)
─────────────────────────── network ───────────────────────────
→ prepare_observation_for_inference(...) SERVER (policy-coupled, heavy)
→ per-session preprocessor(...) SERVER (stateful within the request)
→ policy.predict_action_chunk(obs, inference_delay, prefix) SERVER (pure for allowlisted policies)
→ per-session postprocessor(...) SERVER (reads state cached at preprocess)
─────────────────────────── network ───────────────────────────
→ ActionQueue.merge(original, processed, real_delay, idx_before) CLIENT
```
Three consequences:
1. **The server needs no cross-request state.** `RelativeActionsProcessorStep` writes `_last_state` at preprocess and the postprocessor reads it back _within the same request_. Per-session pipeline instances + one-request-at-a-time-per-session give correctness with zero persistent state.
2. **RTC state stays client-side**, exactly where `RTCInferenceEngine` already keeps it. Each request ships: `inference_delay_steps = ceil(L_max/dt)` (from the client `LatencyTracker`, whose samples are full network-inclusive cycle times — RTT compensation falls out for free), `prefix_model = queue.get_left_over()[:H]`, and `prefix_robot = queue.get_processed_left_over()[:H]` (needed for server-side relative-prefix re-anchoring, mirroring `rtc.py:287-305`). The response returns **both** the model-space and robot-space chunks because `merge` needs both. ≤ `execution_horizon × action_dim` float32 each — a few hundred bytes.
3. **G9 dies structurally.** No bespoke client resize (`F.interpolate` in legacy `helpers.py`), no client-side normalization. Clients ship native camera resolution; the server's canonical processor path does everything — serve-time preprocessing is byte-identical to train-time.
**What the server _does_ hold** (and what it means):
- Per-session processor instances (cheap; normalization stat tensors shared read-only).
- Per-session episode counter + stats. Episode reset = reset the session's pipelines, clear its mailbox. **`policy.reset()` is never called in shared mode** — it is global to the shared policy instance and unnecessary for chunk-pure policies (ACT's ensembler and Pi0/SmolVLA's queues live in `select_action`, not `predict_action_chunk` — verified).
- Policies that are _not_ chunk-pure get `serving_mode: exclusive` (§8.3).
---
## 8. The Inference Server: `lerobot-policy-server`
New package `src/lerobot/policy_server/`; console script `lerobot-policy-server --manifest manifest.yaml`.
### 8.1 Process model — **KEPT** from v1, amended
One process = one model+task on one GPU, loaded and warmed at startup (`warmup_inferences` dummy forwards; covers torch.compile). Multi-GPU nodes run N processes (`CUDA_VISIBLE_DEVICES` pinning). Dynamic model loading (`SendPolicyInstructions`) is **rejected**: pickle/RCE surface, arbitrary-download surface, and it destroys capacity planning. Amendment: `pin_task: false` (default) lets VLA clients set the task per session; `pin_task: true` rejects mismatched tasks at session open.
### 8.2 Concurrency (pure threads — no asyncio in zenoh-python)
```
zenoh subscriber (.../*/obs) inference worker (1 thread, owns GPU)
deposit-only callback: loop:
slots[client_uuid] = sample ──► pick next session with pending obs (RR ring)
(per-client latest-only) decode JPEG → per-session preprocess
predict_action_chunk(delay, prefix)
control queryables (status/session/ per-session postprocess → encode
reset): validate, mutate session publisher.put(.../<uuid>/action)
registry, reply (publishing from the worker thread is fine)
```
- **Per-client latest-only mailbox**: a wildcard subscriber with a deposit-only callback writing per-client slots (scales to dynamic fleets), or — when the manifest enumerates clients — one `RingChannel(1)` subscriber per client polled via `try_recv()`. Either way: newest observation wins; a superseded request is counted (`superseded_seqs` in the next response) so drops are visible. This deletes legacy BUG-4 (`observations_similar` + `must_go`) by construction — the **client** decides when to request; the server never second-guesses observation content.
- **Single inference worker**: torch releases the GIL inside `forward`, callbacks stay responsive. Strict round-robin over sessions with pending observations: each gets exactly one inference per cycle; starvation is structurally impossible. Overload degrades into longer cycle times → larger (but correct) client `delay_steps` → eventually the client staleness bound trips and the robot holds — safe by construction.
### 8.3 Chunk-stateless allowlist and serving modes
At startup the server classifies the loaded policy:
| Class | Policies (verified) | Mode |
| --------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| chunk-stateless | ACT, Pi0, Pi0.5, SmolVLA (and any policy whose `predict_action_chunk` touches no instance state) | `shared`: N sessions, per-session pipelines, `policy.reset()` never called |
| chunk-stateful | Diffusion family (`predict_action_chunk` reads `select_action`-fed `self._queues`) | `exclusive`: `max_sessions=1` enforced; episode reset additionally calls `policy.reset()`; second session open → rejected with a self-explanatory error |
| no chunk API | SAC, SARM | refused at startup |
Implemented as a registry in `policy_server/validation.py`; the cleaner follow-up is a `supports_stateless_chunking` class attribute on `PreTrainedPolicy` (needs a pass over policy families — roadmap §14).
### 8.4 Session open & capability validation (fail fast, fail loud)
`session` queryable payload: `client_uuid`, `policy_type`, `fps`, feature summary (post-rename observation feature names + shapes, ordered action keys), `schema_version`, RTC intent, `tags`. Checks:
| Check | Rule | On mismatch |
| -------------------------- | --------------------------------------------------------------- | ---------------------------------------------------------------------------------- |
| Action names **and order** | must equal server's `action_feature_names` exactly | **hard reject** — this is the sync-safety contract mapping chunk columns to motors |
| Camera names | client set must cover `policy.config.input_features` image keys | hard reject |
| Resolution | any H×W accepted (server resizes canonically) | warn if aspect ratio differs from training |
| State dim | flattened dim must match | hard reject |
| `schema_version` | client within server's supported range | hard reject |
| fps | vs. manifest `trained_fps` | warn (reject only when `strict_fps: true`) |
| Task | when `pin_task: true`, must equal `default_task` | reject |
| RTC | client RTC requires policy RTC kwargs support | downgrade to append mode + warning |
| Capacity | `active_sessions < max_sessions` | reject with current load → client retries another replica |
Reply: `session_id`, model info (repo, revision — consider a checkpoint hash, §15), `action_feature_names`, `chunk_size`, `trained_fps`, `supports_rtc`, `serving_mode`, `warmed_up`, `schema_version`, warnings. **rename_map is applied client-side** so the wire format is canonical policy-feature keys across heterogeneous robots (also a prerequisite for future batching).
### 8.5 Scheduler seam (micro-batching later, not in v1)
The worker calls a `Scheduler.select(ready: list[Session]) -> list[Session]`; v1 ships `RoundRobin` (`return ready[:1]`). Cross-session batching is blocked on the policy API (`inference_delay` is scalar; batched clients have different delays/prefixes) — when that lands, a `MicroBatch` scheduler groups same-shape sessions. The seam costs nothing now and prevents a redesign later.
### 8.6 Manifest
```yaml
model:
{
repo_or_path: lerobot/pi0_towels,
revision: main,
dtype: bfloat16,
device: cuda,
}
default_task: "fold the towel"
pin_task: false
serving_mode: shared # forced to exclusive for chunk-stateful policies
max_sessions: 5 # from the §P10 formula: Pi0 @150ms, 1 Hz refresh
warmup_inferences: 2
strict_fps: false
zenoh:
connect_endpoints: ["tls/router.gpu-cluster.internal:7447"]
tls:
{
connect_certificate: ...,
connect_private_key: ...,
root_ca_certificate: ...,
}
health_port: 9100 # HTTP health + Prometheus metrics
debug: { capture_dir: null, capture_max: 256 }
```
Draccus dataclass in `policy_server/manifest.py`; YAML via `--manifest`, individual overrides via CLI.
---
## 9. The Edge Client: `RemoteInferenceEngine`
New file `src/lerobot/rollout/inference/remote.py`, registered `@InferenceEngineConfig.register_subclass("remote")`.
### 9.1 Threading model
| Thread | Role |
| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Main (strategy loop) | `notify_observation(obs)` → lock-protected latest-only slot (identical to `rtc.py` `_obs_holder`). `get_action()``ActionQueue.get()` + staleness check. **Never any I/O.** Structurally fixes legacy BUG-1 (blocking send inside the 33 ms loop). |
| Network worker (1 daemon thread) | Cycle: wait until `queue_remaining·dt ≤ buffer_time_s` and active → snapshot `idx_before`, prefixes, `delay_steps = ceil(L_max/dt)` → encode (JPEG q=`jpeg_quality`) → `publisher.put(obs, attachment=header)` → await chunk on the action subscriber channel (timeout `request_timeout_s`) → `merge(original, processed, ceil(L/dt), idx_before)``latency_tracker.add(L)`. Owns the state machine, reconnects, and control queries. One-in-flight (P5). |
| Zenoh action subscriber | `FifoChannel(2)` handler drained by the worker (no Python callback thread on the hot path); liveliness subscriber callback is deposit-only (sets an event). |
Reused unchanged: `ActionQueue` (`policies/rtc/action_queue.py`), `LatencyTracker`, `ActionInterpolator` (lives in strategies — `interpolation_multiplier` works with remote for free). Deleted concepts: aggregation zoo, `observations_similar`, `must_go`, `TimedObservation`/`TimedAction` pickles.
### 9.2 Fail-safe state machine
```
ok no chunk for degraded_after_s
CONNECTING ─────► STREAMING ───────────────────────────────► DEGRADED
│ ▲ ▲ │ queue empty OR max_action_age_s hit │
│ │ backoff, │ └───────────────────────────────────► STALLED ◄──┘
│ │ re-handshake │ first successful merge │
│ └─ RECONNECTING ◄── timeout streak / server liveliness drop ◄─┘
│ │ offline > max_offline_s, capability/schema mismatch, auth failure
└──────► DEAD (failed=True → shutdown_event → strategy teardown: return-to-initial-pose)
```
- **DEGRADED**: requests failing but the queue still holds actions — the robot keeps executing; chunks _are_ the fault-tolerance buffer (13 s of coverage makes blips and clean server drains invisible).
- **STALLED**: queue empty or staleness bound hit → apply `fallback`: `hold` (`get_action``None`; `send_next_action` already tolerates it), `repeat_last`, or `zero` (required for velocity-controlled robots, where "send nothing" means "keep last velocity").
- **Staleness bound** (sync safety): every merge records `(chunk_start_index, t_send)`; `get_action` refuses any action whose source observation is older than `max_action_age_s` (default 3.0 s ≈ 90 steps @ 30 fps). Bounds open-loop execution after a network stall.
- **DEAD**: only after `max_offline_s` (default 60 s) or a hard contract violation (capability/schema mismatch on reconnect — e.g. the server restarted with a different model; never execute wrong-model chunks). Uses the exact mechanism RTC uses (`failed=True` + global `shutdown_event`) so existing teardown runs unchanged.
- **Watchdog layering**: per-request timeout (hung server — the BUG-3 fix) → server liveliness token (dead server/router) → staleness bound (the robot-side invariant that holds regardless of why data stopped).
- **Pause/resume (DAgger)**: `pause()` stops the worker publishing (slot keeps refreshing, ignored); queue intact — parity with `RTCInferenceEngine.pause`. DAgger's existing `interpolator.reset(); engine.reset(); engine.resume()` sequence works unchanged.
- **`reset()` (episode boundary)**: clear `ActionQueue` + staleness bookkeeping, bump `episode_id`, fire the acked `reset` query (1 s timeout, failure logged — the server has nothing it _must_ do thanks to per-request statelessness), flag `episode_start` on the next observation. `LatencyTracker` intentionally survives reset (latency is episode-invariant; parity with local RTC).
- **`ready`** = session opened ∧ capabilities validated ∧ server `warmed_up`. First-chunk gating is implicit (`get_action``None` until the first merge).
### 9.3 Weightless client — exact integration changes
- `rollout/context.py`: `PolicyContext.{policy, preprocessor, postprocessor}` become `| None`. For remote configs, skip step 1 (weight load / PEFT / `.to(device)` / torch.compile / `init_rtc_processor`) and step 6 (`make_pre_post_processors`). Verified safe: strategies only consume `ctx.policy.inference`. Keep steps 25 (robot processors, hardware, features, dataset) — they are robot-derived. Keep the visual pre-flight check (`context.py:309-324`): `--policy.path` already loads config-only (`rollout/configs.py:324-328`, no weight download) and failing before dialing the server is free. `use_torch_compile` / explicit `--device` → warn-and-ignore for remote.
- `rollout/inference/factory.py`: signature loosens to `policy: PreTrainedPolicy | None` (+ `policy_config: PreTrainedConfig`); `sync`/`rtc` branches guard `policy is None`; the `remote` branch lazy-imports (`eclipse-zenoh` stays an optional extra).
- The authoritative validation moves to session open (§8.4); the local check becomes a fast-fail convenience.
### 9.4 Config
```python
@InferenceEngineConfig.register_subclass("remote")
@dataclass
class RemoteInferenceConfig(InferenceEngineConfig):
connect_endpoint: str = "tls/localhost:7447" # zenoh router endpoint
tls_cert: str | None = None; tls_key: str | None = None; tls_ca: str | None = None
client_uuid: str = "" # "" → uuid4 at start()
jpeg_quality: int = 90 # 0 = raw (LAN/debug)
buffer_time_s: float = 0.5 # send next obs when queue playback ≤ this (v1 G14) — KEPT
max_action_age_s: float = 3.0 # staleness bound (safety)
degraded_after_s: float = 1.0
request_timeout_s: float = 5.0
reconnect_initial_backoff_s: float = 0.5
reconnect_max_backoff_s: float = 10.0
max_offline_s: float = 60.0
fallback: FallbackBehavior = FallbackBehavior.HOLD # hold | repeat_last | zero
rtc: RTCConfig = field(default_factory=RTCConfig) # enabled → replace mode; horizon caps prefix
tags: dict[str, str] = field(default_factory=dict) # ex-cluster/experiment labels
```
```bash
# Remote RTC + sentry recording (the reproducibility path)
lerobot-rollout \
--strategy.type=sentry \
--policy.path=lerobot/pi0_towels \ # config-only: no weights downloaded
--inference.type=remote \
--inference.connect_endpoint=tls/router.gpu-cluster.internal:7447 \
--inference.rtc.execution_horizon=10 \
--robot.type=so100_follower --robot.port=/dev/ttyACM0 \
--robot.cameras="{front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
--dataset.repo_id=user/rollout_fleet_a --dataset.single_task="fold the towel"
```
---
## 10. Wire Schema
### 10.1 Payload anatomy & rates — **KEPT** (JPEG) with numbers
Upstream per request: joints (24128 B) + JPEG frames (480p q90 ≈ 4090 KB each; 720p ≈ 110230 KB) + RTC prefixes (≤ a few KB) → 60450 KB depending on cameras. Downstream: `2 × chunk_size × action_dim × 4 B` + metadata → 350 KB. Effective request rate is self-clocked by `buffer_time_s` to ~14 Hz per robot (not the 30 Hz control rate). 300 robots ≈ 0.310 Mbps each — the wire is never the bottleneck; bandwidth budgeting is about camera count/resolution, and each GPU pod only ever sees its own ≤ `max_sessions` clients. Zenoh fragments >64 KiB payloads transparently; multi-MB messages are fine.
### 10.2 Attachment header (fixed-layout, packed little-endian — parsed without touching the body)
| Field | Type | Notes |
| ---------------- | ---- | -------------------------------------------------------------- |
| `schema_version` | u16 | negotiated at session open |
| `msg_type` | u8 | OBS / CHUNK / EVENT |
| `seq_id` | u64 | per-session monotonic; echoed in the chunk |
| `episode_id` | u32 | bumped by `reset()` |
| `client_mono_ns` | i64 | client `monotonic_ns()`; **opaque to the server, echoed back** |
| `session_epoch` | u32 | bumped per (re)connect; stale-epoch chunks dropped |
### 10.3 msgpack bodies
**ObservationMsg** (client → server): `state: {names_ref, data: f32 LE bytes}`, `images: {name: {codec: jpeg|raw, bytes, (h,w,c) if raw}}`, `task: str`, `inference_delay_steps: int`, `prefix_model: tensor?`, `prefix_robot: tensor?` (tensors = raw LE bytes + dtype + shape), `episode_start: bool`.
**ActionChunkMsg** (server → client): `seq_id_echo`, `client_mono_ns_echo`, `chunk_model: tensor`, `chunk_robot: tensor`, `queue_wait_ms: f32`, `inference_ms: f32`, `superseded_seqs: u32`, `server_load: f32`.
**Status / SessionOpen / SessionAck / ResetMsg**: as specified in §8.4.
### 10.4 Schema discipline (P7)
`schema_version` gates at handshake; evolution is additive-only (new optional msgpack keys; unknown keys ignored); attachment layout changes require a version bump; golden codec round-trip tests (tensor exactness, JPEG RGB-channel-order regression — a silent BGR swap poisons every VLA in the fleet) are part of the test suite. **No pickle anywhere** — KEPT from v1 and now structural: nothing in the schema can carry code.
---
## 11. Latency Budget & the Clock Iron Rule
| Stage | LAN | WAN (50 ms RTT) |
| ------------------------------ | --------------- | --------------- |
| JPEG encode ×3 (edge CPU) | 29 ms | 29 ms |
| Serialize | <1 ms | <1 ms |
| Uplink (tx + ½RTT) | ~2 ms | ~54 ms |
| Server queue wait | 0 → 1×inference | 0 → 1×inference |
| Decode + canonical preprocess | 410 ms | 410 ms |
| **Inference** | **15150 ms** | **15150 ms** |
| Postprocess + downlink + merge | ~2 ms | ~27 ms |
| **Total (Pi0-class)** | **~110175 ms** | **~190250 ms** |
Inference is 6085 % of end-to-end on LAN; the entire transport+serialization stack is <10 ms. WAN adds propagation + uplink bandwidth — identical under any transport. At 30 fps this lands `delay_steps` ≈ 48, comfortably inside RTC execution horizons: WAN degrades smoothness parameters, never correctness. _This table is the standing answer to transport-performance bikeshedding._
**Clock iron rule** (P4): wall-clock instants never cross machines. Client stamps `monotonic_ns`, the server echoes it opaquely; `RTT = now echo`. The server reports only **durations** (`queue_wait_ms`, `inference_ms`) measured on its own monotonic clock; `network_time = RTT queue_wait inference` for diagnostics. The schema has no field in which a foreign wall-clock instant can be compared — the legacy `time.time()` bug is unrepresentable.
---
## 12. Reproducibility & Audit (P8)
The contract is **fully logged + replayable**, not "deterministic":
- **Client = source of truth.** Recording strategies already persist observations + executed actions to `LeRobotDataset`. The remote engine logs, per executed action, the `(session_id, seq_id, episode_id)` of its source chunk plus the echoed `queue_wait_ms`/`inference_ms` (dataset-extras columns are a follow-up; client logs in v1).
- **Server audit line per request** (structured JSON): `{ts, session_id, client_uuid, seq_id, episode_id, queue_wait_ms, inference_ms, chunk_range, superseded_seqs, outcome}`.
- **Optional bounded capture**: `debug.capture_dir` writes a ring of request/response pairs (safetensors) for byte-exact offline replay through the same server pipeline.
- **Runbook — "robot #217 stuttered at 14:03"**: (1) Grafana `session_staleness{client="217"}` — spike ⇒ server side, flat ⇒ client/network. (2) Server side: audit lines — `queue_wait_ms` rising across _all_ sessions ⇒ overloaded replica (check `active_sessions` vs `max_sessions`); `superseded_seqs` streak on 217 only ⇒ that client over-requesting; `outcome=error` ⇒ adjacent stack trace. (3) Client side: state-machine transitions + reconnects in the client log; dataset rows show which seq's chunk was executing and where `None` ticks occurred. Every hop shares `(session_id, seq_id)` — the join is mechanical.
---
## 13. Integration & Migration Plan
### 13.1 New
| Path | Content |
| --------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `src/lerobot/policy_server/{__init__,schema,codec,manifest,session,scheduler,validation,server}.py` | wire schema constants, msgpack/attachment codecs, manifest dataclasses, `Session` + mailbox, `Scheduler` seam, capability rules + chunk-stateless registry, zenoh servicer + inference worker + drain + HTTP health/metrics |
| `src/lerobot/rollout/inference/remote.py` | `RemoteInferenceEngine` (~600 lines; mirrors `rtc.py` structure) |
| `src/lerobot/scripts/lerobot_policy_server.py` + `[project.scripts]` entry | thin `main()` |
| `docker/Dockerfile.policy-server` | CUDA runtime base + uv; manifest via ConfigMap |
| `docs/source/remote_inference.mdx` (+ `_toctree.yml`) | replaces `async.mdx` |
### 13.2 Modified
`rollout/inference/factory.py` (config + Optional-typed signature + lazy import) · `rollout/context.py` (weightless branch) · `rollout/inference/__init__.py` · `scripts/lerobot_rollout.py` docstring · `pyproject.toml`: `[async]` extra becomes `eclipse-zenoh>=1.9,<2.0` + `msgpack` (grpcio/matplotlib leave it; grpcio remains under `[hilserl]`/`dev` for the RL stack).
### 13.3 Removed — same landing PR
`src/lerobot/async_inference/` · `tests/async_inference/` · `docs/source/async.mdx` + its `_toctree.yml` entry · the `AsyncInference` service + `Observation`/`Actions`/`PolicySetup` messages from `src/lerobot/transport/services.proto` (regenerate pb2; **`LearnerService` untouched** — `transport/` is shared with HIL-SERL (`src/lerobot/rl/`); the RL test suite gates this change).
### 13.4 Legacy config → successor mapping
| Legacy (`RobotClientConfig`/`PolicyServerConfig`) | Successor |
| ------------------------------------------------- | ---------------------------------------------------------- |
| `server_address` | `--inference.connect_endpoint` (zenoh router) |
| `policy_type`, `pretrained_name_or_path` | `--policy.path` (config-only) + server manifest |
| `chunk_size_threshold` (01 ratio) | `--inference.buffer_time_s` (seconds) |
| `actions_per_chunk` | server manifest (validated at session open) |
| `aggregate_fn_name` + `AGGREGATE_FUNCTIONS` | **dropped**`ActionQueue` replace/append |
| `policy_device`, `client_device` | **dropped** — server concern / chunks arrive CPU f32 |
| `debug_visualize_queue_size` | **dropped** — Rerun (`--display_data`) + engine stats |
| `PolicyServerConfig.{host,port}` | manifest `zenoh.connect_endpoints` |
| `inference_latency`, `obs_queue_timeout` | **dropped** — latency client-measured; no server obs queue |
| `SendPolicyInstructions` | **dropped** — MaaS manifest + session validation |
| `observations_similar` / `must_go` | **dropped** — latest-only slots + client send gate |
| pickle envelopes | **dropped** — msgpack + attachment headers |
### 13.5 Legacy bugs/gaps → structural resolution
BUG-1 → worker thread owns all I/O. BUG-2 → aggregation deleted; `ActionQueue` is internally locked. BUG-3 → per-request timeout + liveliness. BUG-4 → client-side send gating; server newest-wins. G1 → per-session registry. G2 → manifest. G4 → msgpack+attachments. G5 → monotonic echo + `delay_steps`. G7 → recording strategies. G8 → mTLS + ACL. G9 → server-side canonical processors. G11 → `status` queryable. G12 → Prometheus + audit logs. G13 → `lerobot-policy-server` console script. G14 → `buffer_time_s`.
### 13.6 Tests
- **Unit**: codec round-trips (tensor exact; JPEG RGB-order regression), capability-validation matrix (§8.4 as parametrized cases), scheduler fairness + newest-wins supersession (mock policy with configurable sleep), manifest parsing, key-expr sanitization.
- **Loopback integration** (CPU, fast CI): client+server in one process over zenoh peer-to-peer (or a localhost `zenohd` started by the fixture), tiny-ACT, fake 2-camera robot, N=8 concurrent sessions. The headline regression: two sessions with different joint states must not cross-contaminate `RelativeActionsProcessorStep` postprocessing — the test that proves the multi-tenancy claim.
- **Chaos**: kill the server mid-episode → client returns `None`, never raises into the control loop, `failed` stays False within `max_offline_s`, resumes on restart; `docker kill zenohd` → liveliness flap → safe state → re-handshake (explicitly tests re-declaration behavior, flagged unverified upstream); SIGTERM drain → in-flight chunk completes, clients reconnect invisibly.
- **Golden parity**: remote RTC vs local `RTCInferenceEngine` on identical observation sequences → byte-identical merged queues (the re-anchoring contract test). Gate for any real-robot remote-RTC use.
---
## 14. Roadmap
1. **PR1 — schema & codecs** (no torch deps): `policy_server/{schema,codec,manifest}.py`, key-expr sanitizer, golden codec tests.
2. **PR2 — server core**: session registry, scheduler, validation/allowlist, inference worker with mock policy, loopback harness.
3. **PR3 — client engine**: `RemoteInferenceEngine`, factory/context weightless integration, loopback integration + chaos + golden-parity tests.
4. **PR4 — ops & docs**: Dockerfile, health/metrics, drain, ACL examples, `remote_inference.mdx`, rollout docstring.
5. **Landing PR — legacy deletion**: remove `async_inference/` + tests + docs + proto service (RL suite gates), `[async]` extra swap.
6. **Pre-release field validation**: one real robot on a lossy network (watchdog default tuning); JPEG q90 vs raw A/B on one policy (train/serve shift).
7. **Future**: micro-batching (needs per-sample `inference_delay` across policy families), client-side downscale-to-policy-resolution (config-only shapes make it possible), Advanced Pub/Sub on the action topic, per-robot quotas, dataset provenance columns, `supports_stateless_chunking` attribute upstreamed to policy classes.
---
## 15. Open Risks
| Risk | Mitigation / decision needed |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Re-anchoring parity (server-side relative-prefix re-anchor vs `rtc.py`) | Golden parity test (§13.6) is a hard gate before robot use; likely failure mode is normalizer dtype/device drift |
| First-chunk over-trim when idle: `merge` trims `ceil(L/dt)` even when nothing was consumed (queue empty at episode start) — wasteful at network latencies (600 ms ⇒ 18 steps) | Proposed clamp `real_delay = min(real_delay, last_index - idx_before)` touches the shared `ActionQueue` used by local RTC — needs sign-off + regression tests |
| JPEG train/serve distribution shift | Unmeasured; A/B before locking q90 default (roadmap §14.6) |
| Watchdog defaults untuned (`request_timeout_s=5`, `degraded_after_s=1`, `max_action_age_s=3`) | Field validation on wired and Wi-Fi; consider named profiles |
| Capability check can pass while semantics differ (different finetune, different normalization stats, identical feature names) | Add checkpoint hash/revision pinning to SessionAck — decide in PR2 |
| zenoh-python long-session maturity: re-declaration after router restart partially verified; SHM unstable; no asyncio | Chaos tests own this; thread-based design avoids the asyncio gap entirely |
| Router ACL reload requires restart | Operational runbook: cert/ACL changes = rolling router restart |
| `fallback=zero` has no consumer until velocity actions land in rollout (only `.pos` features routed today) | Validate the enum against robot capabilities when velocity support lands |
| Per-client mailbox memory under fleet-scale wildcard subscription | One decoded-obs slot per client is small; add an LRU GC tied to liveliness drops |
+288
View File
@@ -0,0 +1,288 @@
# Video benchmark
## Questions
What is the optimal trade-off between:
- maximizing loading time with random access,
- minimizing memory space on disk,
- maximizing success rate of policies,
- compatibility across devices/platforms for decoding videos (e.g. video players, web browsers).
How to encode videos?
- Which video codec (`-vcodec`) to use? h264, h265, AV1?
- What pixel format to use (`-pix_fmt`)? `yuv444p` or `yuv420p`?
- How much compression (`-crf`)? No compression with `0`, intermediate compression with `25` or extreme with `50+`?
- Which frequency to chose for key frames (`-g`)? A key frame every `10` frames?
How to decode videos?
- Which `decoder`? `torchvision`, `torchaudio`, `ffmpegio`, `decord`, or `nvc`?
- What scenarios to use for the requesting timestamps during benchmark? (`timestamps_mode`)
## Variables
**Image content & size**
We don't expect the same optimal settings for a dataset of images from a simulation, or from real-world in an apartment, or in a factory, or outdoor, or with lots of moving objects in the scene, etc. Similarly, loading times might not vary linearly with the image size (resolution).
For these reasons, we run this benchmark on four representative datasets:
- `lerobot/pusht_image`: (96 x 96 pixels) simulation with simple geometric shapes, fixed camera.
- `lerobot/aloha_mobile_shrimp_image`: (480 x 640 pixels) real-world indoor, moving camera.
- `lerobot/paris_street`: (720 x 1280 pixels) real-world outdoor, moving camera.
- `lerobot/kitchen`: (1080 x 1920 pixels) real-world indoor, fixed camera.
Note: The datasets used for this benchmark need to be image datasets, not video datasets.
**Data augmentations**
We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
### Encoding parameters
| parameter | values |
| ----------- | ------------------------------------------------------------ |
| **vcodec** | `libx264`, `libx265`, `libsvtav1` |
| **pix_fmt** | `yuv444p`, `yuv420p` |
| **g** | `1`, `2`, `3`, `4`, `5`, `6`, `10`, `15`, `20`, `40`, `None` |
| **crf** | `0`, `5`, `10`, `15`, `20`, `25`, `30`, `40`, `50`, `None` |
Note that `crf` value might be interpreted differently by various video codecs. In other words, the same value used with one codec doesn't necessarily translate into the same compression level with another codec. In fact, the default value (`None`) isn't the same amongst the different video codecs. Importantly, it is also the case for many other ffmpeg arguments like `g` which specifies the frequency of the key frames.
For a comprehensive list and documentation of these parameters, see the ffmpeg documentation depending on the video codec used:
- h264: https://trac.ffmpeg.org/wiki/Encode/H.264
- h265: https://trac.ffmpeg.org/wiki/Encode/H.265
- AV1: https://trac.ffmpeg.org/wiki/Encode/AV1
### Decoding parameters
**Decoder**
We tested two video decoding backends from torchvision:
- `pyav`
- `video_reader` (requires to build torchvision from source)
**Requested timestamps**
Given the way video decoding works, once a keyframe has been loaded, the decoding of subsequent frames is fast.
This of course is affected by the `-g` parameter during encoding, which specifies the frequency of the keyframes. Given our typical use cases in robotics policies which might request a few timestamps in different random places, we want to replicate these use cases with the following scenarios:
- `1_frame`: 1 frame,
- `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
- `6_frames`: 6 consecutive frames (e.g. `[t + i / fps for i in range(6)]`)
Note that this differs significantly from a typical use case like watching a movie, in which every frame is loaded sequentially from the beginning to the end and it's acceptable to have big values for `-g`.
Additionally, because some policies might request single timestamps that are a few frames apart, we also have the following scenario:
- `2_frames_4_space`: 2 frames with 4 consecutive frames of spacing in between (e.g `[t, t + 5 / fps]`),
However, due to how video decoding is implemented with `pyav`, we don't have access to an accurate seek so in practice this scenario is essentially the same as `6_frames` since all 6 frames between `t` and `t + 5 / fps` will be decoded.
## Metrics
**Data compression ratio (lower is better)**
`video_images_size_ratio` is the ratio of the memory space on disk taken by the encoded video over the memory space taken by the original images. For instance, `video_images_size_ratio=25%` means that the video takes 4 times less memory space on disk compared to the original images.
**Loading time ratio (lower is better)**
`video_images_load_time_ratio` is the ratio of the time it takes to decode frames from the video at a given timestamps over the time it takes to load the exact same original images. Lower is better. For instance, `video_images_load_time_ratio=200%` means that decoding from video is 2 times slower than loading the original images.
**Average Mean Square Error (lower is better)**
`avg_mse` is the average mean square error between each decoded frame and its corresponding original image over all requested timestamps, and also divided by the number of pixels in the image to be comparable when switching to different image sizes.
**Average Peak Signal to Noise Ratio (higher is better)**
`avg_psnr` measures the ratio between the maximum possible power of a signal and the power of corrupting noise that affects the fidelity of its representation. Higher PSNR indicates better quality.
**Average Structural Similarity Index Measure (higher is better)**
`avg_ssim` evaluates the perceived quality of images by comparing luminance, contrast, and structure. SSIM values range from -1 to 1, where 1 indicates perfect similarity.
One aspect that can't be measured here with those metrics is the compatibility of the encoding across platforms, in particular on web browser, for visualization purposes.
h264, h265 and AV1 are all commonly used codecs and should not pose an issue. However, the chroma subsampling (`pix_fmt`) format might affect compatibility:
- `yuv420p` is more widely supported across various platforms, including web browsers.
- `yuv444p` offers higher color fidelity but might not be supported as broadly.
<!-- **Loss of a pretrained policy (higher is better)** (not available)
`loss_pretrained` is the result of evaluating with the selected encoding/decoding settings a policy pretrained on original images. It is easier to understand than `avg_l2_error`.
**Success rate after retraining (higher is better)** (not available)
`success_rate` is the result of training and evaluating a policy with the selected encoding/decoding settings. It is the most difficult metric to get but also the very best. -->
## How the benchmark works
The benchmark evaluates both encoding and decoding of video frames on the first episode of each dataset.
**Encoding:** for each `vcodec` and `pix_fmt` pair, we use a default value for `g` and `crf` upon which we change a single value (either `g` or `crf`) to one of the specified values (we don't test every combination of those as this would be computationally too heavy).
This gives a unique set of encoding parameters which is used to encode the episode.
**Decoding:** Then, for each of those unique encodings, we iterate through every combination of the decoding parameters `backend` and `timestamps_mode`. For each of them, we record the metrics of a number of samples (given by `--num-samples`). This is parallelized for efficiency and the number of processes can be controlled with `--num-workers`. Ideally, it's best to have a `--num-samples` that is divisible by `--num-workers`.
Intermediate results saved for each `vcodec` and `pix_fmt` combination in csv tables.
These are then all concatenated to a single table ready for analysis.
## Caveats
We tried to measure the most impactful parameters for both encoding and decoding. However, for computational reasons we can't test out every combination.
Additional encoding parameters exist that are not included in this benchmark. In particular:
- `-preset` which allows for selecting encoding presets. This represents a collection of options that will provide a certain encoding speed to compression ratio. By leaving this parameter unspecified, it is considered to be `medium` for libx264 and libx265 and `8` for libsvtav1.
- `-tune` which allows to optimize the encoding for certain aspects (e.g. film quality, fast decoding, etc.).
See the documentation mentioned above for more detailed info on these settings and for a more comprehensive list of other parameters.
Similarly on the decoding side, other decoders exist but are not implemented in our current benchmark. To name a few:
- `torchaudio`
- `ffmpegio`
- `decord`
- `nvc`
Note as well that since we are mostly interested in the performance at decoding time (also because encoding is done only once before uploading a dataset), we did not measure encoding times nor have any metrics regarding encoding.
However, besides the necessity to build ffmpeg from source, encoding did not pose any issue and it didn't take a significant amount of time during this benchmark.
## Install
Building ffmpeg from source is required to include libx265 and libaom/libsvtav1 (av1) video codecs ([compilation guide](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu)).
**Note:** While you still need to build torchvision with a conda-installed `ffmpeg<4.3` to use the `video_reader` decoder (as described in [#220](https://github.com/huggingface/lerobot/pull/220)), you also need another version which is custom-built with all the video codecs for encoding. For the script to then use that version, you can prepend the command above with `PATH="$HOME/bin:$PATH"`, which is where ffmpeg should be built.
## Adding a video decoder
Right now, we're only benchmarking the two video decoder available with torchvision: `pyav` and `video_reader`.
You can easily add a new decoder to benchmark by adding it to this function in the script:
```diff
def decode_video_frames(
video_path: str,
timestamps: list[float],
tolerance_s: float,
backend: str,
) -> torch.Tensor:
if backend in ["pyav", "video_reader"]:
return decode_video_frames_torchvision(
video_path, timestamps, tolerance_s, backend
)
+ elif backend == ["your_decoder"]:
+ return your_decoder_function(
+ video_path, timestamps, tolerance_s, backend
+ )
else:
raise NotImplementedError(backend)
```
## Example
For a quick run, you can try these parameters:
```bash
python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
lerobot/aloha_mobile_shrimp_image \
--vcodec libx264 libx265 \
--pix-fmt yuv444p yuv420p \
--g 2 20 None \
--crf 10 40 None \
--timestamps-modes 1_frame 2_frames \
--backends pyav video_reader \
--num-samples 5 \
--num-workers 5 \
--save-frames 0
```
## Results
### Reproduce
We ran the benchmark with the following parameters:
```bash
# h264 and h265 encodings
python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
lerobot/aloha_mobile_shrimp_image \
lerobot/paris_street \
lerobot/kitchen \
--vcodec libx264 libx265 \
--pix-fmt yuv444p yuv420p \
--g 1 2 3 4 5 6 10 15 20 40 None \
--crf 0 5 10 15 20 25 30 40 50 None \
--timestamps-modes 1_frame 2_frames 6_frames \
--backends pyav video_reader \
--num-samples 50 \
--num-workers 5 \
--save-frames 1
# av1 encoding (only compatible with yuv420p and pyav decoder)
python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
lerobot/aloha_mobile_shrimp_image \
lerobot/paris_street \
lerobot/kitchen \
--vcodec libsvtav1 \
--pix-fmt yuv420p \
--g 1 2 3 4 5 6 10 15 20 40 None \
--crf 0 5 10 15 20 25 30 40 50 None \
--timestamps-modes 1_frame 2_frames 6_frames \
--backends pyav \
--num-samples 50 \
--num-workers 5 \
--save-frames 1
```
The full results are available [here](https://docs.google.com/spreadsheets/d/1OYJB43Qu8fC26k_OyoMFgGBBKfQRCi4BIuYitQnq3sw/edit?usp=sharing)
### Parameters selected for LeRobotDataset
Considering these results, we chose what we think is the best set of encoding parameter:
- vcodec: `libsvtav1`
- pix-fmt: `yuv420p`
- g: `2`
- crf: `30`
Since we're using av1 encoding, we're choosing the `pyav` decoder as `video_reader` does not support it (and `pyav` doesn't require a custom build of `torchvision`).
### Summary
These tables show the results for `g=2` and `crf=30`, using `timestamps-modes=6_frames` and `backend=pyav`
| video_images_size_ratio | vcodec | pix_fmt | | | |
| --------------------------------- | ---------- | ------- | --------- | --------- | --------- |
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | **16.97%** | 17.58% | 18.57% | 18.86% | 22.06% |
| lerobot/aloha_mobile_shrimp_image | 2.14% | 2.11% | 1.38% | **1.37%** | 5.59% |
| lerobot/paris_street | 2.12% | 2.13% | **1.54%** | **1.54%** | 4.43% |
| lerobot/kitchen | 1.40% | 1.39% | **1.00%** | **1.00%** | 2.52% |
| video_images_load_time_ratio | vcodec | pix_fmt | | | |
| --------------------------------- | ------- | ------- | -------- | ------- | --------- |
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | 6.45 | 5.19 | **1.90** | 2.12 | 2.47 |
| lerobot/aloha_mobile_shrimp_image | 11.80 | 7.92 | 0.71 | 0.85 | **0.48** |
| lerobot/paris_street | 2.21 | 2.05 | 0.36 | 0.49 | **0.30** |
| lerobot/kitchen | 1.46 | 1.46 | 0.28 | 0.51 | **0.26** |
| | | vcodec | pix_fmt | | | |
| --------------------------------- | -------- | -------- | ------------ | -------- | --------- | ------------ |
| | | libx264 | | libx265 | | libsvtav1 |
| repo_id | metric | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | avg_mse | 2.90E-04 | **2.03E-04** | 3.13E-04 | 2.29E-04 | 2.19E-04 |
| | avg_psnr | 35.44 | 37.07 | 35.49 | **37.30** | 37.20 |
| | avg_ssim | 98.28% | **98.85%** | 98.31% | 98.84% | 98.72% |
| lerobot/aloha_mobile_shrimp_image | avg_mse | 2.76E-04 | 2.59E-04 | 3.17E-04 | 3.06E-04 | **1.30E-04** |
| | avg_psnr | 35.91 | 36.21 | 35.88 | 36.09 | **40.17** |
| | avg_ssim | 95.19% | 95.18% | 95.00% | 95.05% | **97.73%** |
| lerobot/paris_street | avg_mse | 6.89E-04 | 6.70E-04 | 4.03E-03 | 4.02E-03 | **3.09E-04** |
| | avg_psnr | 33.48 | 33.68 | 32.05 | 32.15 | **35.40** |
| | avg_ssim | 93.76% | 93.75% | 89.46% | 89.46% | **95.46%** |
| lerobot/kitchen | avg_mse | 2.50E-04 | 2.24E-04 | 4.28E-04 | 4.18E-04 | **1.53E-04** |
| | avg_psnr | 36.73 | 37.33 | 36.56 | 36.75 | **39.12** |
| | avg_ssim | 95.47% | 95.58% | 95.52% | 95.53% | **96.82%** |
+488
View File
@@ -0,0 +1,488 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Assess the performance of video decoding in various configurations.
This script will benchmark different video encoding and decoding parameters.
See the provided README.md or run `python benchmark/video/run_video_benchmark.py --help` for usage info.
"""
import argparse
import datetime as dt
import itertools
import random
import shutil
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from threading import Lock
import einops
import numpy as np
import pandas as pd
import PIL
import torch
from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
from tqdm import tqdm
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.video_utils import (
decode_video_frames,
encode_video_frames,
)
from lerobot.utils.constants import OBS_IMAGE
from lerobot.utils.utils import TimerManager
BASE_ENCODING = OrderedDict(
[
("vcodec", "libx264"),
("pix_fmt", "yuv444p"),
("g", 2),
("crf", None),
# TODO(aliberts): Add fastdecode
# ("fastdecode", 0),
]
)
# TODO(rcadene, aliberts): move to `utils.py` folder when we want to refactor
def parse_int_or_none(value) -> int | None:
if value.lower() == "none":
return None
try:
return int(value)
except ValueError as e:
raise argparse.ArgumentTypeError(f"Invalid int or None: {value}") from e
def check_datasets_formats(repo_ids: list) -> None:
for repo_id in repo_ids:
dataset = LeRobotDataset(repo_id)
if len(dataset.meta.video_keys) > 0:
raise ValueError(
f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}"
)
def get_directory_size(directory: Path) -> int:
total_size = 0
for item in directory.rglob("*"):
if item.is_file():
total_size += item.stat().st_size
return total_size
def load_original_frames(imgs_dir: Path, timestamps: list[float], fps: int) -> torch.Tensor:
frames = []
for ts in timestamps:
idx = int(ts * fps)
frame = PIL.Image.open(imgs_dir / f"frame-{idx:06d}.png")
frame = torch.from_numpy(np.array(frame))
frame = frame.type(torch.float32) / 255
frame = einops.rearrange(frame, "h w c -> c h w")
frames.append(frame)
return torch.stack(frames)
def save_decoded_frames(
imgs_dir: Path, save_dir: Path, frames: torch.Tensor, timestamps: list[float], fps: int
) -> None:
if save_dir.exists() and len(list(save_dir.glob("frame-*.png"))) == len(timestamps):
return
save_dir.mkdir(parents=True, exist_ok=True)
for i, ts in enumerate(timestamps):
idx = int(ts * fps)
frame_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy()
PIL.Image.fromarray(frame_hwc).save(save_dir / f"frame-{idx:06d}_decoded.png")
shutil.copyfile(imgs_dir / f"frame-{idx:06d}.png", save_dir / f"frame-{idx:06d}_original.png")
def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None:
episode_index = 0
ep_num_images = dataset.meta.episodes["length"][episode_index]
if imgs_dir.exists() and len(list(imgs_dir.glob("frame-*.png"))) == ep_num_images:
return
imgs_dir.mkdir(parents=True, exist_ok=True)
hf_dataset = dataset.hf_dataset.with_format(None)
# We only save images from the first camera
img_keys = [key for key in hf_dataset.features if key.startswith(OBS_IMAGE)]
imgs_dataset = hf_dataset.select_columns(img_keys[0])
for i, item in enumerate(
tqdm(imgs_dataset, desc=f"saving {dataset.repo_id} first episode images", leave=False)
):
img = item[img_keys[0]]
img.save(str(imgs_dir / f"frame-{i:06d}.png"), quality=100)
if i >= ep_num_images - 1:
break
def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> list[float]:
# Start at 5 to allow for 2_frames_4_space and 6_frames
idx = random.randint(5, ep_num_images - 1)
match timestamps_mode:
case "1_frame":
frame_indexes = [idx]
case "2_frames":
frame_indexes = [idx - 1, idx]
case "2_frames_4_space":
frame_indexes = [idx - 5, idx]
case "6_frames":
frame_indexes = [idx - i for i in range(6)][::-1]
case _:
raise ValueError(timestamps_mode)
return [idx / fps for idx in frame_indexes]
def benchmark_decoding(
imgs_dir: Path,
video_path: Path,
timestamps_mode: str,
backend: str,
ep_num_images: int,
fps: int,
num_samples: int = 50,
num_workers: int = 4,
save_frames: bool = False,
) -> dict:
def process_sample(sample: int, lock: Lock):
time_benchmark = TimerManager(log=False)
timestamps = sample_timestamps(timestamps_mode, ep_num_images, fps)
num_frames = len(timestamps)
result = {
"psnr_values": [],
"ssim_values": [],
"mse_values": [],
}
with time_benchmark, lock:
frames = decode_video_frames(video_path, timestamps=timestamps, tolerance_s=5e-1, backend=backend)
result["load_time_video_ms"] = (time_benchmark.last * 1000) / num_frames
with time_benchmark:
original_frames = load_original_frames(imgs_dir, timestamps, fps)
result["load_time_images_ms"] = (time_benchmark.last * 1000) / num_frames
frames_np, original_frames_np = frames.numpy(), original_frames.numpy()
for i in range(num_frames):
result["mse_values"].append(mean_squared_error(original_frames_np[i], frames_np[i]))
result["psnr_values"].append(
peak_signal_noise_ratio(original_frames_np[i], frames_np[i], data_range=1.0)
)
result["ssim_values"].append(
structural_similarity(original_frames_np[i], frames_np[i], data_range=1.0, channel_axis=0)
)
if save_frames and sample == 0:
save_dir = video_path.with_suffix("") / f"{timestamps_mode}_{backend}"
save_decoded_frames(imgs_dir, save_dir, frames, timestamps, fps)
return result
load_times_video_ms = []
load_times_images_ms = []
mse_values = []
psnr_values = []
ssim_values = []
# A sample is a single set of decoded frames specified by timestamps_mode (e.g. a single frame, 2 frames, etc.).
# For each sample, we record metrics (loading time and quality metrics) which are then averaged over all samples.
# As these samples are independent, we run them in parallel threads to speed up the benchmark.
# Use a single shared lock for all worker threads
shared_lock = Lock()
with ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = [executor.submit(process_sample, i, shared_lock) for i in range(num_samples)]
for future in tqdm(as_completed(futures), total=num_samples, desc="samples", leave=False):
result = future.result()
load_times_video_ms.append(result["load_time_video_ms"])
load_times_images_ms.append(result["load_time_images_ms"])
psnr_values.extend(result["psnr_values"])
ssim_values.extend(result["ssim_values"])
mse_values.extend(result["mse_values"])
avg_load_time_video_ms = float(np.array(load_times_video_ms).mean())
avg_load_time_images_ms = float(np.array(load_times_images_ms).mean())
video_images_load_time_ratio = avg_load_time_video_ms / avg_load_time_images_ms
return {
"avg_load_time_video_ms": avg_load_time_video_ms,
"avg_load_time_images_ms": avg_load_time_images_ms,
"video_images_load_time_ratio": video_images_load_time_ratio,
"avg_mse": float(np.mean(mse_values)),
"avg_psnr": float(np.mean(psnr_values)),
"avg_ssim": float(np.mean(ssim_values)),
}
def benchmark_encoding_decoding(
dataset: LeRobotDataset,
video_path: Path,
imgs_dir: Path,
encoding_cfg: dict,
decoding_cfg: dict,
num_samples: int,
num_workers: int,
save_frames: bool,
overwrite: bool = False,
seed: int = 1337,
) -> list[dict]:
fps = dataset.fps
if overwrite or not video_path.is_file():
tqdm.write(f"encoding {video_path}")
encode_video_frames(
imgs_dir=imgs_dir,
video_path=video_path,
fps=fps,
vcodec=encoding_cfg["vcodec"],
pix_fmt=encoding_cfg["pix_fmt"],
g=encoding_cfg.get("g"),
crf=encoding_cfg.get("crf"),
# fast_decode=encoding_cfg.get("fastdecode"),
overwrite=True,
)
episode_index = 0
ep_num_images = dataset.meta.episodes["length"][episode_index]
width, height = tuple(dataset[0][dataset.meta.camera_keys[0]].shape[-2:])
num_pixels = width * height
video_size_bytes = video_path.stat().st_size
images_size_bytes = get_directory_size(imgs_dir)
video_images_size_ratio = video_size_bytes / images_size_bytes
random.seed(seed)
benchmark_table = []
for timestamps_mode in tqdm(
decoding_cfg["timestamps_modes"], desc="decodings (timestamps_modes)", leave=False
):
for backend in tqdm(decoding_cfg["backends"], desc="decodings (backends)", leave=False):
benchmark_row = benchmark_decoding(
imgs_dir,
video_path,
timestamps_mode,
backend,
ep_num_images,
fps,
num_samples,
num_workers,
save_frames,
)
benchmark_row.update(
**{
"repo_id": dataset.repo_id,
"resolution": f"{width} x {height}",
"num_pixels": num_pixels,
"video_size_bytes": video_size_bytes,
"images_size_bytes": images_size_bytes,
"video_images_size_ratio": video_images_size_ratio,
"timestamps_mode": timestamps_mode,
"backend": backend,
},
**encoding_cfg,
)
benchmark_table.append(benchmark_row)
return benchmark_table
def main(
output_dir: Path,
repo_ids: list[str],
vcodec: list[str],
pix_fmt: list[str],
g: list[int],
crf: list[int],
# fastdecode: list[int],
timestamps_modes: list[str],
backends: list[str],
num_samples: int,
num_workers: int,
save_frames: bool,
):
check_datasets_formats(repo_ids)
encoding_benchmarks = {
"g": g,
"crf": crf,
# "fastdecode": fastdecode,
}
decoding_benchmarks = {
"timestamps_modes": timestamps_modes,
"backends": backends,
}
headers = ["repo_id", "resolution", "num_pixels"]
headers += list(BASE_ENCODING.keys())
headers += [
"timestamps_mode",
"backend",
"video_size_bytes",
"images_size_bytes",
"video_images_size_ratio",
"avg_load_time_video_ms",
"avg_load_time_images_ms",
"video_images_load_time_ratio",
"avg_mse",
"avg_psnr",
"avg_ssim",
]
file_paths = []
for video_codec in tqdm(vcodec, desc="encodings (vcodec)"):
for pixel_format in tqdm(pix_fmt, desc="encodings (pix_fmt)", leave=False):
benchmark_table = []
for repo_id in tqdm(repo_ids, desc="encodings (datasets)", leave=False):
dataset = LeRobotDataset(repo_id)
imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_")
# We only use the first episode
save_first_episode(imgs_dir, dataset)
for duet in [
dict(zip(encoding_benchmarks.keys(), unique_combination, strict=False))
for unique_combination in itertools.product(*encoding_benchmarks.values())
]:
encoding_cfg = BASE_ENCODING.copy()
encoding_cfg["vcodec"] = video_codec
encoding_cfg["pix_fmt"] = pixel_format
for key, value in duet.items():
encoding_cfg[key] = value
args_path = Path("_".join(str(value) for value in encoding_cfg.values()))
video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4"
benchmark_table += benchmark_encoding_decoding(
dataset,
video_path,
imgs_dir,
encoding_cfg,
decoding_benchmarks,
num_samples,
num_workers,
save_frames,
)
# Save intermediate results
benchmark_df = pd.DataFrame(benchmark_table, columns=headers)
now = dt.datetime.now()
csv_path = (
output_dir
/ f"{now:%Y-%m-%d}_{now:%H-%M-%S}_{video_codec}_{pixel_format}_{num_samples}-samples.csv"
)
benchmark_df.to_csv(csv_path, header=True, index=False)
file_paths.append(csv_path)
del benchmark_df
# Concatenate all results
df_list = [pd.read_csv(csv_path) for csv_path in file_paths]
concatenated_df = pd.concat(df_list, ignore_index=True)
concatenated_path = output_dir / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_all_{num_samples}-samples.csv"
concatenated_df.to_csv(concatenated_path, header=True, index=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--output-dir",
type=Path,
default=Path("outputs/video_benchmark"),
help="Directory where the video benchmark outputs are written.",
)
parser.add_argument(
"--repo-ids",
type=str,
nargs="*",
default=[
"lerobot/pusht_image",
"lerobot/aloha_mobile_shrimp_image",
"lerobot/paris_street",
"lerobot/kitchen",
],
help="Datasets repo-ids to test against. First episodes only are used. Must be images.",
)
parser.add_argument(
"--vcodec",
type=str,
nargs="*",
default=["h264", "hevc", "libsvtav1"],
help="Video codecs to be tested",
)
parser.add_argument(
"--pix-fmt",
type=str,
nargs="*",
default=["yuv444p", "yuv420p"],
help="Pixel formats (chroma subsampling) to be tested",
)
parser.add_argument(
"--g",
type=parse_int_or_none,
nargs="*",
default=[1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None],
help="Group of pictures sizes to be tested.",
)
parser.add_argument(
"--crf",
type=parse_int_or_none,
nargs="*",
default=[0, 5, 10, 15, 20, 25, 30, 40, 50, None],
help="Constant rate factors to be tested.",
)
# parser.add_argument(
# "--fastdecode",
# type=int,
# nargs="*",
# default=[0, 1],
# help="Use the fastdecode tuning option. 0 disables it. "
# "For libx264 and libx265/hevc, only 1 is possible. "
# "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
# )
parser.add_argument(
"--timestamps-modes",
type=str,
nargs="*",
default=[
"1_frame",
"2_frames",
"2_frames_4_space",
"6_frames",
],
help="Timestamps scenarios to be tested.",
)
parser.add_argument(
"--backends",
type=str,
nargs="*",
default=["torchcodec", "pyav"],
help="Torchvision decoding backend to be tested.",
)
parser.add_argument(
"--num-samples",
type=int,
default=50,
help="Number of samples for each encoding x decoding config.",
)
parser.add_argument(
"--num-workers",
type=int,
default=10,
help="Number of processes for parallelized sample processing.",
)
parser.add_argument(
"--save-frames",
type=int,
default=0,
help="Whether to save decoded frames or not. Enter a non-zero number for true.",
)
args = parser.parse_args()
main(**vars(args))
+1 -1
View File
@@ -35,7 +35,7 @@ USER root
ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
cuda-nvcc-12-8 cuda-cudart-dev-12-8 \
cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
libvulkan1 vulkan-tools \
&& mkdir -p /usr/share/vulkan/icd.d \
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
+11 -7
View File
@@ -18,8 +18,9 @@
# docker build -f docker/Dockerfile.internal -t lerobot-internal .
# Configure the base image for CI with GPU access
ARG CUDA_VERSION=12.8.1
ARG OS_VERSION=24.04
# TODO(Steven): Bump these versions
ARG CUDA_VERSION=12.4.1
ARG OS_VERSION=22.04
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
# Define Python version argument
@@ -35,13 +36,16 @@ ENV DEBIAN_FRONTEND=noninteractive \
# Install Python, system dependencies, and uv (as root)
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git curl \
libglib2.0-0 libgl1 libegl1 ffmpeg \
software-properties-common build-essential git curl \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
cmake pkg-config ninja-build \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-dev \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-dev \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& mv /root/.local/bin/uv /usr/local/bin/uv \
&& useradd --create-home --shell /bin/bash user_lerobot \
-82
View File
@@ -1,82 +0,0 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This Dockerfile builds a GPU inference pod for `lerobot-policy-server`
# (remote inference over Zenoh). It starts from an NVIDIA CUDA base image;
# the cu128 PyTorch wheels bundle their own CUDA runtime (driver floor 570.86,
# see pyproject.toml [tool.uv]).
# docker build -f docker/Dockerfile.policy-server -t lerobot-policy-server .
# docker run --gpus all -v ./server.yaml:/etc/lerobot/server.yaml lerobot-policy-server
#
# Extra policy-family dependencies (e.g. pi0/smolvla need transformers) can be
# added at build time:
# docker build -f docker/Dockerfile.policy-server \
# --build-arg LEROBOT_EXTRAS="async pi0" -t lerobot-policy-server .
# Configure the base image (same CUDA family as Dockerfile.internal)
ARG CUDA_VERSION=12.8.1
ARG OS_VERSION=24.04
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
# Define Python version and lerobot extras arguments
ARG PYTHON_VERSION=3.12
ARG LEROBOT_EXTRAS="async"
# Configure environment variables
ENV DEBIAN_FRONTEND=noninteractive \
PATH=/lerobot/.venv/bin:$PATH
# Install system dependencies and uv (as root).
# Kept lean: no hardware/teleop libraries — this image only serves policies.
RUN apt-get update && apt-get install -y --no-install-recommends \
git curl ca-certificates libglib2.0-0 ffmpeg \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& mv /root/.local/bin/uv /usr/local/bin/uv \
&& useradd --create-home --shell /bin/bash user_lerobot \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Create application directory and set permissions
WORKDIR /lerobot
RUN chown -R user_lerobot:user_lerobot /lerobot
# Switch to the non-root user
USER user_lerobot
# Model checkpoints are cached under HF_HOME — mount it as a volume
# (or a PVC in Kubernetes) so warm restarts skip the Hub download.
ENV HOME=/home/user_lerobot \
HF_HOME=/home/user_lerobot/.cache/huggingface \
HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
TORCH_HOME=/home/user_lerobot/.cache/torch \
TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton
# Create the virtual environment (Python provisioned by uv)
RUN uv venv --python ${PYTHON_VERSION}
# Install lerobot from the build context with the async extra
# (eclipse-zenoh + msgpack — see pyproject.toml [project.optional-dependencies])
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
COPY --chown=user_lerobot:user_lerobot src/ src/
RUN uv sync --locked --no-cache $(printf -- '--extra %s ' ${LEROBOT_EXTRAS})
# HTTP health + Prometheus metrics (manifest `health_port`, 0 disables)
EXPOSE 9100
# The manifest is typically mounted as a ConfigMap (Kubernetes) or a bind
# mount (docker run -v) at /etc/lerobot/server.yaml; any field can also be
# overridden on the command line, e.g. --model.repo_or_path=lerobot/pi0_towels
ENTRYPOINT ["lerobot-policy-server"]
CMD ["--manifest", "/etc/lerobot/server.yaml"]
+9 -31
View File
@@ -3,16 +3,12 @@
title: LeRobot
- local: installation
title: Installation
- local: cheat-sheet
title: Cheat sheet
title: Get started
- sections:
- local: il_robots
title: Imitation Learning for Robots
- local: lelab
title: LeLab - Lerobot GUI
- local: bring_your_own_policies
title: Adding a Policy
title: Bring Your Own Policies
- local: integrate_hardware
title: Bring Your Own Hardware
- local: hilserl
@@ -28,12 +24,6 @@
- local: rename_map
title: Using Rename Map and Empty Cameras
title: "Tutorials"
- sections:
- local: hardware_guide
title: Compute Hardware Guide
- local: torch_accelerators
title: PyTorch accelerators
title: "Compute & Hardware"
- sections:
- local: lerobot-dataset-v3
title: Using LeRobotDataset
@@ -41,12 +31,8 @@
title: Porting Large Datasets
- local: using_dataset_tools
title: Using the Dataset Tools
- local: language_and_recipes
title: Language Columns and Recipes
- local: tools
title: Tools
- local: video_encoding_parameters
title: Video encoding parameters
- local: dataset_subtask
title: Using Subtasks in the Dataset
- local: streaming_video_encoding
title: Streaming Video Encoding
title: "Datasets"
@@ -61,12 +47,6 @@
title: π₀-FAST (Pi0Fast)
- local: pi05
title: π₀.₅ (Pi05)
- local: molmoact2
title: MolmoAct2
- local: vla_jepa
title: VLA-JEPA
- local: eo1
title: EO-1
- local: groot
title: NVIDIA GR00T N1.5
- local: xvla
@@ -79,16 +59,12 @@
- sections:
- local: sarm
title: SARM
- local: robometer
title: ROBOMETER
- local: topreward
title: TOPReward
title: "Reward Models"
- sections:
- local: inference
title: Policy Deployment (lerobot-rollout)
- local: remote_inference
title: Remote Inference (lerobot-policy-server)
- local: async
title: Use Async Inference
- local: rtc
title: Real-Time Chunking (RTC)
title: "Inference"
@@ -155,8 +131,6 @@
title: OMX
- local: openarm
title: OpenArm
- local: rebot_b601
title: reBot B601-DM
title: "Robots"
- sections:
- local: phone_teleop
@@ -166,6 +140,10 @@
- local: cameras
title: Cameras
title: "Sensors"
- sections:
- local: torch_accelerators
title: PyTorch accelerators
title: "Supported Hardware"
- sections:
- local: notebooks
title: Notebooks
+10 -6
View File
@@ -79,13 +79,17 @@ If your local computer doesn't have a powerful GPU, you can utilize Google Colab
Once training is complete, you can evaluate your ACT policy using the `lerobot-record` command with your trained policy. This will run inference and record evaluation episodes:
```bash
lerobot-rollout \
--strategy.type=base \
--policy.path=${HF_USER}/act_policy \
--robot.type=so101_follower \
lerobot-record \
--robot.type=so100_follower \
--robot.port=/dev/ttyACM0 \
--robot.id=my_robot \
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
--display_data=true \
--task="Your task description" \ # can be skipped for ACT
--duration=60
--dataset.repo_id=${HF_USER}/eval_act_your_dataset \
--dataset.num_episodes=10 \
--dataset.single_task="Your task description" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--policy.path=${HF_USER}/act_policy
```
+313
View File
@@ -0,0 +1,313 @@
# Asynchronous Inference
With our [SmolVLA](https://huggingface.co/papers/2506.01844) we introduced a new way to run inference on real-world robots, **decoupling action prediction from action execution**.
In this tutorial, we'll show how to use asynchronous inference (_async inference_) using a finetuned version of SmolVLA, and all the policies supported by LeRobot.
**Try async inference with all the policies** supported by LeRobot!
**What you'll learn:**
1. Why asynchronous inference matters and how it compares to, more traditional, sequential inference.
2. How to spin-up a `PolicyServer` and connect a `RobotClient` from the same machine, and even over the network.
3. How to tune key parameters (`actions_per_chunk`, `chunk_size_threshold`) for your robot and policy.
If you get stuck, hop into our [Discord community](https://discord.gg/s3KuuzsPFb)!
In a nutshell: with _async inference_, your robot keeps acting while the policy server is already busy computing the next chunk of actions---eliminating "wait-for-inference" lags and unlocking smoother, more reactive behaviours.
This is fundamentally different from synchronous inference (sync), where the robot stays idle while the policy computes the next chunk of actions.
---
## Getting started with async inference
You can read more information on asynchronous inference in our [blogpost](https://huggingface.co/blog/async-robot-inference). This guide is designed to help you quickly set up and run asynchronous inference in your environment.
First, install `lerobot` with the `async` tag, to install the extra dependencies required to run async inference.
```shell
pip install -e ".[async]"
```
Then, spin up a policy server (in one terminal, or in a separate machine) specifying the host address and port for the client to connect to.
You can spin up a policy server running:
```shell
python -m lerobot.async_inference.policy_server \
--host=127.0.0.1 \
--port=8080
```
This will start a policy server listening on `127.0.0.1:8080` (`localhost`, port 8080). At this stage, the policy server is empty, as all information related to which policy to run and with which parameters are specified during the first handshake with the client. Spin up a client with:
```shell
python -m lerobot.async_inference.robot_client \
--server_address=127.0.0.1:8080 \ # SERVER: the host address and port of the policy server
--robot.type=so100_follower \ # ROBOT: your robot type
--robot.port=/dev/tty.usbmodem585A0076841 \ # ROBOT: your robot port
--robot.id=follower_so100 \ # ROBOT: your robot id, to load calibration file
--robot.cameras="{ laptop: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}, phone: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}}" \ # POLICY: the cameras used to acquire frames, with keys matching the keys expected by the policy
--task="dummy" \ # POLICY: The task to run the policy on (`Fold my t-shirt`). Not necessarily defined for all policies, such as `act`
--policy_type=your_policy_type \ # POLICY: the type of policy to run (smolvla, act, etc)
--pretrained_name_or_path=user/model \ # POLICY: the model name/path on server to the checkpoint to run (e.g., lerobot/smolvla_base)
--policy_device=mps \ # POLICY: the device to run the policy on, on the server (cuda, mps, xpu, cpu)
--actions_per_chunk=50 \ # POLICY: the number of actions to output at once
--chunk_size_threshold=0.5 \ # CLIENT: the threshold for the chunk size before sending a new observation to the server
--aggregate_fn_name=weighted_average \ # CLIENT: the function to aggregate actions on overlapping portions
--debug_visualize_queue_size=True # CLIENT: whether to visualize the queue size at runtime
```
In summary, you need to specify instructions for:
- `SERVER`: the address and port of the policy server
- `ROBOT`: the type of robot to connect to, the port to connect to, and the local `id` of the robot
- `POLICY`: the type of policy to run, and the model name/path on server to the checkpoint to run. You also need to specify which device should the sever be using, and how many actions to output at once (capped at the policy max actions value).
- `CLIENT`: the threshold for the chunk size before sending a new observation to the server, and the function to aggregate actions on overlapping portions. Optionally, you can also visualize the queue size at runtime, to help you tune the `CLIENT` parameters.
Importantly,
- `actions_per_chunk` and `chunk_size_threshold` are key parameters to tune for your setup.
- `aggregate_fn_name` is the function to aggregate actions on overlapping portions. You can either add a new one to a registry of functions, or add your own in `robot_client.py` (see [here](NOTE:addlinktoLOC))
- `debug_visualize_queue_size` is a useful tool to tune the `CLIENT` parameters.
## Done! You should see your robot moving around by now 😉
## Async vs. synchronous inference
Synchronous inference relies on interleaving action chunk prediction and action execution. This inherently results in _idle frames_, frames where the robot awaits idle the policy's output: a new action chunk.
In turn, inference is plagued by evident real-time lags, where the robot simply stops acting due to the lack of available actions.
With robotics models increasing in size, this problem risks becoming only more severe.
<p align="center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/async-inference/sync.png"
width="80%"
></img>
</p>
<p align="center">
<i>Synchronous inference</i> makes the robot idle while the policy is
computing the next chunk of actions.
</p>
To overcome this, we design async inference, a paradigm where action planning and execution are decoupled, resulting in (1) higher adaptability and, most importantly, (2) no idle frames.
Crucially, with async inference, the next action chunk is computed _before_ the current one is exhausted, resulting in no idleness.
Higher adaptability is ensured by aggregating the different action chunks on overlapping portions, obtaining an up-to-date plan and a tighter control loop.
<p align="center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/async-inference/async.png"
width="80%"
></img>
</p>
<p align="center">
<i>Asynchronous inference</i> results in no idleness because the next chunk is
computed before the current chunk is exhausted.
</p>
---
## Start the Policy Server
Policy servers are wrappers around a `PreTrainedPolicy` interfacing them with observations coming from a robot client.
Policy servers are initialized as empty containers which are populated with the requested policy specified in the initial handshake between the robot client and the policy server.
As such, spinning up a policy server is as easy as specifying the host address and port. If you're running the policy server on the same machine as the robot client, you can use `localhost` as the host address.
<hfoptions id="start_policy_server">
<hfoption id="Command">
```bash
python -m lerobot.async_inference.policy_server \
--host=127.0.0.1 \
--port=8080
```
</hfoption>
<hfoption id="API example">
<!-- prettier-ignore-start -->
```python
from lerobot.async_inference.configs import PolicyServerConfig
from lerobot.async_inference.policy_server import serve
config = PolicyServerConfig(
host="localhost",
port=8080,
)
serve(config)
```
<!-- prettier-ignore-end -->
</hfoption>
</hfoptions>
This listens on `localhost:8080` for an incoming connection from the associated`RobotClient`, which will communicate which policy to run during the first client-server handshake.
---
## Launch the Robot Client
`RobotClient` is a wrapper around a `Robot` instance, which `RobotClient` connects to the (possibly remote) `PolicyServer`.
The `RobotClient` streams observations to the `PolicyServer`, and receives action chunks obtained running inference on the server (which we assume to have better computational resources than the robot controller).
<hfoptions id="start_robot_client">
<hfoption id="Command">
```bash
python -m lerobot.async_inference.robot_client \
--server_address=127.0.0.1:8080 \ # SERVER: the host address and port of the policy server
--robot.type=so100_follower \ # ROBOT: your robot type
--robot.port=/dev/tty.usbmodem585A0076841 \ # ROBOT: your robot port
--robot.id=follower_so100 \ # ROBOT: your robot id, to load calibration file
--robot.cameras="{ laptop: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}, phone: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}}" \ # POLICY: the cameras used to acquire frames, with keys matching the keys expected by the policy
--task="dummy" \ # POLICY: The task to run the policy on (`Fold my t-shirt`). Not necessarily defined for all policies, such as `act`
--policy_type=your_policy_type \ # POLICY: the type of policy to run (smolvla, act, etc)
--pretrained_name_or_path=user/model \ # POLICY: the model name/path on server to the checkpoint to run (e.g., lerobot/smolvla_base)
--policy_device=mps \ # POLICY: the device to run the policy on, on the server
--actions_per_chunk=50 \ # POLICY: the number of actions to output at once
--chunk_size_threshold=0.5 \ # CLIENT: the threshold for the chunk size before sending a new observation to the server
--aggregate_fn_name=weighted_average \ # CLIENT: the function to aggregate actions on overlapping portions
--debug_visualize_queue_size=True # CLIENT: whether to visualize the queue size at runtime
```
</hfoption>
<hfoption id="API example">
<!-- prettier-ignore-start -->
```python
import threading
from lerobot.robots.so_follower import SO100FollowerConfig
from lerobot.cameras.opencv import OpenCVCameraConfig
from lerobot.async_inference.configs import RobotClientConfig
from lerobot.async_inference.robot_client import RobotClient
from lerobot.async_inference.helpers import visualize_action_queue_size
# 1. Create the robot instance
"""Check out the cameras available in your setup by running `python lerobot/find_cameras.py`"""
# these cameras must match the ones expected by the policy
# check the config.json on the Hub for the policy you are using
camera_cfg = {
"top": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=30),
"side": OpenCVCameraConfig(index_or_path=1, width=640, height=480, fps=30)
}
robot_cfg = SO100FollowerConfig(
port="/dev/tty.usbmodem585A0076841",
id="follower_so100",
cameras=camera_cfg
)
# 3. Create client configuration
client_cfg = RobotClientConfig(
robot=robot_cfg,
server_address="localhost:8080",
policy_device="mps",
client_device="cpu",
policy_type="smolvla",
pretrained_name_or_path="<user>/smolvla_async",
chunk_size_threshold=0.5,
actions_per_chunk=50, # make sure this is less than the max actions of the policy
)
# 4. Create and start client
client = RobotClient(client_cfg)
# 5. Specify the task
task = "Don't do anything, stay still"
if client.start():
# Start action receiver thread
action_receiver_thread = threading.Thread(target=client.receive_actions, daemon=True)
action_receiver_thread.start()
try:
# Run the control loop
client.control_loop(task)
except KeyboardInterrupt:
client.stop()
action_receiver_thread.join()
# (Optionally) plot the action queue size
visualize_action_queue_size(client.action_queue_size)
```
<!-- prettier-ignore-end -->
</hfoption>
</hfoptions>
The following two parameters are key in every setup:
<table>
<thead>
<tr>
<th>Hyperparameter</th>
<th>Default</th>
<th>What it does</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>actions_per_chunk</code>
</td>
<td>50</td>
<td>
How many actions the policy outputs at once. Typical values: 10-50.
</td>
</tr>
<tr>
<td>
<code>chunk_size_threshold</code>
</td>
<td>0.7</td>
<td>
When the queue is ≤ 50% full, the client sends a fresh observation.
Value in [0, 1].
</td>
</tr>
</tbody>
</table>
<Tip>
Different values of `actions_per_chunk` and `chunk_size_threshold` do result
in different behaviours.
</Tip>
On the one hand, increasing the value of `actions_per_chunk` will result in reducing the likelihood of ending up with no actions to execute, as more actions will be available when the new chunk is computed.
However, larger values of `actions_per_chunk` might also result in less precise actions, due to the compounding errors consequent to predicting actions over longer timespans.
On the other hand, increasing the value of `chunk_size_threshold` will result in sending out to the `PolicyServer` observations for inference more often, resulting in a larger number of updates action chunks, overlapping on significant portions. This results in high adaptability, in the limit predicting one action chunk for each observation, which is in turn only marginally consumed while a new one is produced.
This option does also put more pressure on the inference pipeline, as a consequence of the many requests. Conversely, values of `chunk_size_threshold` close to 0.0 collapse to the synchronous edge case, whereby new observations are only sent out whenever the current chunk is exhausted.
We found the default values of `actions_per_chunk` and `chunk_size_threshold` to work well in the experiments we developed for the [SmolVLA paper](https://huggingface.co/papers/2506.01844), but recommend experimenting with different values to find the best fit for your setup.
### Tuning async inference for your setup
1. **Choose your computational resources carefully.** [PI0](https://huggingface.co/lerobot/pi0) occupies 14GB of memory at inference time, while [SmolVLA](https://huggingface.co/lerobot/smolvla_base) requires only ~2GB. You should identify the best computational resource for your use case keeping in mind smaller policies require less computational resources. The combination of policy and device used (CPU-intensive, using MPS, or the number of CUDA cores on a given NVIDIA GPU) directly impacts the average inference latency you should expect.
2. **Adjust your `fps` based on inference latency.** While the server generates a new action chunk, the client is not idle and is stepping through its current action queue. If the two processes happen at fundamentally different speeds, the client might end up with an empty queue. As such, you should reduce your fps if you consistently run out of actions in queue.
3. **Adjust `chunk_size_threshold`**.
- Values closer to `0.0` result in almost sequential behavior. Values closer to `1.0` → send observation every step (more bandwidth, relies on good world-model).
- We found values around 0.5-0.6 to work well. If you want to tweak this, spin up a `RobotClient` setting the `--debug_visualize_queue_size` to `True`. This will plot the action queue size evolution at runtime, and you can use it to find the value of `chunk_size_threshold` that works best for your setup.
<p align="center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/async-inference/queues.png"
width="80%"
></img>
</p>
<p align="center">
<i>
The action queue size is plotted at runtime when the
`--debug_visualize_queue_size` flag is passed, for various levels of
`chunk_size_threshold` (`g` in the SmolVLA paper).
</i>
</p>
---
## Conclusion
Asynchronous inference represents a significant advancement in real-time robotics control, addressing the fundamental challenge of inference latency that has long plagued robotics applications. Through this tutorial, you've learned how to implement a complete async inference pipeline that eliminates idle frames and enables smoother, more reactive robot behaviors.
**Key Takeaways:**
- **Paradigm Shift**: Async inference decouples action prediction from execution, allowing robots to continue acting while new action chunks are computed in parallel
- **Performance Benefits**: Eliminates "wait-for-inference" lags that are inherent in synchronous approaches, becoming increasingly important as policy models grow larger
- **Flexible Architecture**: The server-client design enables distributed computing, where inference can run on powerful remote hardware while maintaining real-time robot control
- **Tunable Parameters**: Success depends on properly configuring `actions_per_chunk` and `chunk_size_threshold` for your specific hardware, policy, and task requirements
- **Universal Compatibility**: Works with all LeRobot-supported policies, from lightweight ACT models to vision-language models like SmolVLA
Start experimenting with the default parameters, monitor your action queue sizes, and iteratively refine your setup to achieve optimal performance for your specific use case.
If you want to discuss this further, hop into our [Discord community](https://discord.gg/s3KuuzsPFb), or open an issue on our [GitHub repository](https://github.com/huggingface/lerobot/issues).
+81 -220
View File
@@ -1,37 +1,60 @@
# Adding a Policy
# Bring Your Own Policies
This guide walks you through implementing a custom policy and getting it to work with LeRobot's training, evaluation, and deployment tools. There are two paths:
This tutorial explains how to integrate your own custom policy implementations into the LeRobot ecosystem, allowing you to leverage all LeRobot tools for training, evaluation, and deployment while using your own algorithms.
- **Plugin (out-of-tree)** — ship your policy as a standalone `lerobot_policy_*` package. Faster, no PR required, easy to iterate. Right for experimentation, internal use, or when you want to publish independently.
- **In-tree (contributed to LeRobot)** — land your policy directly in `src/lerobot/policies/`. Requires a PR, but makes your policy a first-class citizen of the library.
## Step 1: Create a Policy Package
The plugin route is usually the right starting point — promote to in-tree once the policy has stabilized and there's clear value in shipping it with the library.
Your custom policy should be organized as an installable Python package following LeRobot's plugin conventions.
Either way, the building blocks are the same: a configuration class, a policy class, and a processor factory. The first half of this guide covers those shared pieces; the second half covers the path-specific scaffolding ([Path A](#path-a-out-of-tree-plugin), [Path B](#path-b-contributing-in-tree)).
### Package Structure
A note on tone: robot-learning is an actively evolving field, and "what a policy looks like" can shift with each new architecture. The conventions described here exist because they let `lerobot-train` and `lerobot-eval` work uniformly across very different models. When a new policy genuinely doesn't fit them, raise it (in your PR, or an issue) — the conventions are not sacred.
Create a package with the prefix `lerobot_policy_` (IMPORTANT!) followed by your policy name:
---
```bash
lerobot_policy_my_custom_policy/
├── pyproject.toml
└── src/
└── lerobot_policy_my_custom_policy/
├── __init__.py
├── configuration_my_custom_policy.py
├── modeling_my_custom_policy.py
└── processor_my_custom_policy.py
```
## Anatomy of a policy
### Package Configuration
Three building blocks make up every policy. The names below use `my_policy` as a placeholder — replace with your policy's name. That name is load-bearing: it must match the string you pass to `@PreTrainedConfig.register_subclass`, the `MyPolicy.name` class attribute, and the `make_<name>_pre_post_processors` factory function (more on each below).
Set up your `pyproject.toml`:
### Configuration class
```toml
[project]
name = "lerobot_policy_my_custom_policy"
version = "0.1.0"
dependencies = [
# your policy-specific dependencies
]
requires-python = ">= 3.12"
Inherit from [`PreTrainedConfig`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/configs/policies.py) and register your policy type. Here is a template — customize the parameters and methods as needed for your policy's architecture and training requirements.
[build-system]
build-backend = # your-build-backend
requires = # your-build-system
```
## Step 2: Define the Policy Configuration
Create a configuration class that inherits from [`PreTrainedConfig`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/configs/policies.py) and registers your policy type:
Here is a template to get you started, customize the parameters and methods as needed for your policy's architecture and training requirements.
```python
# configuration_my_policy.py
# configuration_my_custom_policy.py
from dataclasses import dataclass, field
from lerobot.configs import PreTrainedConfig
from lerobot.optim import AdamWConfig
from lerobot.optim import CosineDecayWithWarmupSchedulerConfig
@PreTrainedConfig.register_subclass("my_policy")
@PreTrainedConfig.register_subclass("my_custom_policy")
@dataclass
class MyPolicyConfig(PreTrainedConfig):
"""Configuration class for MyPolicy.
class MyCustomPolicyConfig(PreTrainedConfig):
"""Configuration class for MyCustomPolicy.
Args:
n_obs_steps: Number of observation steps to use as input
@@ -54,20 +77,16 @@ class MyPolicyConfig(PreTrainedConfig):
raise ValueError("n_action_steps cannot exceed horizon")
def validate_features(self) -> None:
"""Validate input/output feature compatibility.
Call this explicitly from your policy's __init__ — the base class does not.
"""
"""Validate input/output feature compatibility."""
if not self.image_features:
raise ValueError("MyPolicy requires at least one image feature.")
raise ValueError("MyCustomPolicy requires at least one image feature.")
if self.action_feature is None:
raise ValueError("MyPolicy requires 'action' in output_features.")
raise ValueError("MyCustomPolicy requires 'action' in output_features.")
def get_optimizer_preset(self) -> AdamWConfig:
return AdamWConfig(lr=self.optimizer_lr, weight_decay=self.optimizer_weight_decay)
def get_scheduler_preset(self):
"""Return a LRSchedulerConfig from lerobot.optim, or None."""
return None
@property
@@ -82,7 +101,8 @@ class MyPolicyConfig(PreTrainedConfig):
@property
def action_delta_indices(self) -> list[int]:
"""Relative timestep offsets for the action chunk the dataset loader returns."""
"""Relative timestep offsets for the action chunk the dataset loader returns.
"""
return list(range(self.horizon))
@property
@@ -90,34 +110,32 @@ class MyPolicyConfig(PreTrainedConfig):
return None
```
The string you pass to `@register_subclass` must match `MyPolicy.name` (next section) and is what users supply as `--policy.type` on the CLI. Default to `AdamW` from `lerobot.optim` for `get_optimizer_preset` unless you genuinely need otherwise.
## Step 3: Implement the Policy Class
### Policy class
Inherit from [`PreTrainedPolicy`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/pretrained.py) and set two class attributes — both are checked by `__init_subclass__`:
Create your policy implementation by inheriting from [`PreTrainedPolicy`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/pretrained.py):
```python
# modeling_my_policy.py
# modeling_my_custom_policy.py
import torch
import torch.nn as nn
from typing import Any
from lerobot.policies import PreTrainedPolicy
from lerobot.utils.constants import ACTION
from .configuration_my_policy import MyPolicyConfig
from .configuration_my_custom_policy import MyCustomPolicyConfig
class MyPolicy(PreTrainedPolicy):
config_class = MyPolicyConfig # must match the string in @register_subclass
name = "my_policy"
class MyCustomPolicy(PreTrainedPolicy):
config_class = MyCustomPolicyConfig # must match the string in @register_subclass
name = "my_custom_policy"
def __init__(self, config: MyPolicyConfig, dataset_stats: dict[str, Any] = None):
def __init__(self, config: MyCustomPolicyConfig, dataset_stats: dict[str, Any] = None):
super().__init__(config, dataset_stats)
config.validate_features() # not called automatically by the base class
self.config = config
self.model = ... # your nn.Module here
def reset(self):
"""Reset per-episode state. Called by lerobot-eval at the start of each episode."""
"""Reset episode state."""
...
def get_optim_params(self) -> dict:
@@ -129,51 +147,35 @@ class MyPolicy(PreTrainedPolicy):
...
def select_action(self, batch: dict[str, torch.Tensor], **kwargs) -> torch.Tensor:
"""Return a single action for the current timestep (called every step at inference)."""
"""Return a single action for the current timestep (called at inference)."""
...
def forward(self, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, dict | None]:
def forward(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
"""Compute the training loss.
Returns `(loss, output_dict)`. `output_dict` may be `None`; everything in it must be
logging-friendly Python natives (no tensors with gradients).
`batch["action_is_pad"]` is a bool mask of shape (B, horizon) that marks
timesteps padded because the episode ended before `horizon` steps; you
timesteps padded because the episode ended before `horizon` steps, you
can exclude those from your loss.
"""
actions = batch[ACTION]
action_is_pad = batch.get("action_is_pad")
...
return loss, {"some_loss_component": some_loss_component.item()}
return {"loss": ...}
```
The methods called by the train/eval loops:
## Step 4: Add Data Processors
| Method | Used by | What it does |
| ----------------------------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `reset() -> None` | `lerobot-eval` | Clear per-episode state at the start of each episode. |
| `select_action(batch, **kwargs) -> Tensor` | `lerobot-eval` | Return the next action `(B, action_dim)`. Called every step. |
| `predict_action_chunk(batch, **kwargs) -> Tensor` | the policy itself | Return an action chunk `(B, chunk_size, action_dim)`. Currently abstract on the base class — raise `NotImplementedError` if your policy doesn't chunk. |
| `forward(batch, reduction="mean") -> tuple[Tensor, dict \| None]` | `lerobot-train` | Return `(loss, output_dict)`. Accept `reduction="none"` if you want to support per-sample weighting. |
| `get_optim_params() -> dict` | the optimizer | Return `self.parameters()` for simple policies; return a named parameter dict for [multi-optimizer policies](https://github.com/huggingface/lerobot/blob/ecd38c50d7d15b4184cf42649ff1185ee2e11eeb/src/lerobot/policies/sac/modeling_sac.py#L61-L73). |
| `update() -> None` _(optional)_ | `lerobot-train` | Called after each optimizer step _if defined_. Use for EMA, target nets, replay buffers (TDMPC uses this). |
Batches are flat dictionaries keyed by the constants in [`lerobot.utils.constants`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/utils/constants.py): `OBS_STATE` (`observation.state.<motor>`), `OBS_IMAGES` (`observation.images.<camera>`), `OBS_LANGUAGE`, `ACTION`, etc. Reuse the constants — don't invent new prefixes.
### Processor functions
LeRobot uses `PolicyProcessorPipeline`s to normalize inputs and de-normalize outputs around your policy. For a concrete reference, see [`processor_act.py`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/act/processor_act.py) or [`processor_diffusion.py`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/diffusion/processor_diffusion.py).
Create processor functions. For a concrete reference, see [processor_act.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/act/processor_act.py) or [processor_diffusion.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/diffusion/processor_diffusion.py).
```python
# processor_my_policy.py
# processor_my_custom_policy.py
from typing import Any
import torch
from lerobot.processor import PolicyAction, PolicyProcessorPipeline
def make_my_policy_pre_post_processors(
def make_my_custom_policy_pre_post_processors(
config,
dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
) -> tuple[
@@ -185,48 +187,11 @@ def make_my_policy_pre_post_processors(
return preprocessor, postprocessor
```
**Important function naming:** LeRobot discovers your processor by name. The function **must** be called `make_{policy_name}_pre_post_processors` (matching the string you passed to `@PreTrainedConfig.register_subclass`).
**Important - function naming:** LeRobot discovers your processor by name. The function **must** be called `make_{policy_name}_pre_post_processors` (matching the string you passed to `@PreTrainedConfig.register_subclass`).
---
## Step 5: Package Initialization
## Path A: Out-of-tree plugin
The fastest way to ship a policy: package it as a standalone Python distribution and install it alongside LeRobot. No PR required, you own the release cycle, and you can publish to PyPI under your own namespace.
### Package structure
Create a package with the prefix `lerobot_policy_` (IMPORTANT!) followed by your policy name:
```bash
lerobot_policy_my_policy/
├── pyproject.toml
└── src/
└── lerobot_policy_my_policy/
├── __init__.py
├── configuration_my_policy.py
├── modeling_my_policy.py
└── processor_my_policy.py
```
### `pyproject.toml`
```toml
[project]
name = "lerobot_policy_my_policy"
version = "0.1.0"
dependencies = [
# your policy-specific dependencies
]
requires-python = ">= 3.12"
[build-system]
build-backend = # your-build-backend
requires = # your-build-system
```
### Package `__init__.py`
Expose your classes in the package's `__init__.py` and guard against missing `lerobot`:
Expose your classes in the package's `__init__.py`:
```python
# __init__.py
@@ -239,148 +204,44 @@ except ImportError:
"lerobot is not installed. Please install lerobot to use this policy package."
)
from .configuration_my_policy import MyPolicyConfig
from .modeling_my_policy import MyPolicy
from .processor_my_policy import make_my_policy_pre_post_processors
from .configuration_my_custom_policy import MyCustomPolicyConfig
from .modeling_my_custom_policy import MyCustomPolicy
from .processor_my_custom_policy import make_my_custom_policy_pre_post_processors
__all__ = [
"MyPolicyConfig",
"MyPolicy",
"make_my_policy_pre_post_processors",
"MyCustomPolicyConfig",
"MyCustomPolicy",
"make_my_custom_policy_pre_post_processors",
]
```
### Install and use
## Step 6: Installation and Usage
### Install Your Policy Package
```bash
cd lerobot_policy_my_policy
cd lerobot_policy_my_custom_policy
pip install -e .
# Or install from PyPI if published
pip install lerobot_policy_my_policy
pip install lerobot_policy_my_custom_policy
```
### Use Your Policy
Once installed, your policy automatically integrates with LeRobot's training and evaluation tools:
```bash
lerobot-train \
--policy.type my_policy \
--policy.type my_custom_policy \
--env.type pusht \
--steps 200000
```
---
## Path B: Contributing in-tree
When your policy has stabilized and there's clear value in shipping it with the library, you can land it directly in LeRobot. Read the general [contribution guide](./contributing) and the [PR template](https://github.com/huggingface/lerobot/blob/main/.github/PULL_REQUEST_TEMPLATE.md) first — that's where you'll find the testing/quality expectations every PR has to meet (`pre-commit run -a`, `pytest`, the community-review rule, etc.). What's below is the policy-specific layer on top of that.
### In-tree layout
```
src/lerobot/policies/my_policy/
├── __init__.py # re-exports config + modeling + processor factory
├── configuration_my_policy.py # MyPolicyConfig + @register_subclass
├── modeling_my_policy.py # MyPolicy(PreTrainedPolicy)
├── processor_my_policy.py # make_my_policy_pre_post_processors
└── README.md # symlink → ../../../../docs/source/policy_my_policy_README.md
```
Two notes:
- The `README.md` next to the source is a **symlink** into `docs/source/policy_<name>_README.md` — the actual file lives under `docs/`. Existing policies (act, smolvla, diffusion, …) all do this; copy one of those symlinks. The policy README is conventionally minimal: paper link + BibTeX citation.
- The user-facing tutorial — what to install, how to train, hyperparameters, benchmark numbers — lives separately at `docs/source/<my_policy>.mdx` and is registered in `_toctree.yml` under "Policies".
The file names are load-bearing: the factory does lazy imports by name, and the processor is discovered by the `make_<policy_name>_pre_post_processors` convention.
### Wiring
Three places need to know about your policy. All by name.
1. **`policies/__init__.py`** — re-export `MyPolicyConfig` and add it to `__all__`. **Don't** re-export the modeling class; it loads lazily through the factory (so `import lerobot` stays fast).
2. **`factory.py:get_policy_class`** — add a branch returning `MyPolicy` from a lazy import.
3. **`factory.py:make_policy_config`** and **`factory.py:make_pre_post_processors`** — same idea, two more branches.
Mirror an existing policy that's structurally similar to yours; the diff is small.
### Heavy / optional dependencies
Most policies need a heavy backbone (transformers, diffusers, a specific VLM SDK). The convention is **two-step gating**: a `TYPE_CHECKING`-guarded import at module top, and a `require_package` runtime check in the constructor. [`modeling_diffusion.py`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/diffusion/modeling_diffusion.py) is the canonical reference:
```python
from typing import TYPE_CHECKING
from lerobot.utils.import_utils import _diffusers_available, require_package
if TYPE_CHECKING or _diffusers_available:
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
else:
DDIMScheduler = None # keeps the symbol bindable at import time
class DiffusionPolicy(PreTrainedPolicy):
def __init__(self, config):
require_package("diffusers", extra="diffusion")
super().__init__(config)
...
```
This way:
- `import lerobot.policies` keeps working without the extra installed (the symbol is just bound to `None`).
- Type checkers see the real symbol.
- Instantiating the policy without the extra raises a clear `ImportError` pointing at `pip install 'lerobot[diffusion]'`.
Add a matching extra to [`pyproject.toml`](https://github.com/huggingface/lerobot/blob/main/pyproject.toml) `[project.optional-dependencies]` and include it in the `all` extra so `pip install 'lerobot[all]'` keeps installing everything.
### Benchmarks and a published checkpoint
A new policy is much easier to review — and far more useful — when it ships with a working checkpoint and at least one number you can reproduce.
**Pick at least one in-tree benchmark.** LeRobot ships sim benchmarks with per-benchmark Docker images (LIBERO, LIBERO-plus, Meta-World, RoboTwin 2.0, RoboCasa365, RoboCerebra, RoboMME, VLABench and more). Pick the one that matches your policy's modality — VLAs usually go to LIBERO or VLABench; image-only BC to LIBERO or Meta-World. The full list lives under [Benchmarks](./libero) in the docs sidebar.
**Push the checkpoint & processors** to the Hub under `lerobot/<policy>_<benchmark>` (or your namespace if you don't have write access; a maintainer can mirror it). Use `PreTrainedPolicy.push_model_to_hub` so the repo gets `config.json`, `model.safetensors`, and a model card.
**Report results in your policy's MDX**, with the exact `lerobot-eval` command and hardware so anyone can re-run:
```markdown
## Results
Evaluated on LIBERO with `lerobot/<policy>_libero`:
| Suite | Success rate | n_episodes |
| -------------- | -----------: | ---------: |
| libero_spatial | 87.5% | 50 |
| libero_object | 93.0% | 50 |
| libero_goal | 81.5% | 50 |
| libero_10 | 62.0% | 50 |
| **average** | **81.0%** | 200 |
Reproduce: `lerobot-eval --policy.path=lerobot/<policy>_libero --env.type=libero --env.task=libero_spatial --eval.n_episodes=50` (1× A100 40 GB).
```
Use `n_episodes ≥ 50` per suite for stable success-rate estimates.
If your policy is real-robot-only and no sim benchmark applies, swap the sim eval for: a public training dataset on the Hub, the `lerobot-train` command, the checkpoint, and a real-robot success rate over ≥10 episodes via `lerobot-rollout --policy.path=...`.
### PR checklist
The general expectations are in [`CONTRIBUTING.md`](https://github.com/huggingface/lerobot/blob/main/CONTRIBUTING.md) and the [PR template](https://github.com/huggingface/lerobot/blob/main/.github/PULL_REQUEST_TEMPLATE.md). On top of those, reviewers will look for:
- [ ] `MyPolicy` and `MyPolicyConfig` cover the surface above; `__init_subclass__` accepts the class.
- [ ] `factory.py` and `policies/__init__.py` are wired (lazy imports for modeling).
- [ ] `make_my_policy_pre_post_processors` follows the naming convention.
- [ ] Optional deps live behind a `[project.optional-dependencies]` extra and the `TYPE_CHECKING + require_package` guard.
- [ ] `tests/policies/` updated; backward-compat artifact committed & policy-specific tests.
- [ ] `src/lerobot/policies/<name>/README.md` symlinked into `docs/source/policy_<name>_README.md`; user-facing `docs/source/<name>.mdx` written and added to `_toctree.yml`.
- [ ] At least one reproducible benchmark eval in the policy MDX with a published checkpoint (sim benchmark, or real-robot dataset + checkpoint).
The fastest way to get a clean PR is to copy the directory of the existing policy closest to yours, rename, and replace contents method by method. Don't wait until everything is polished — open a draft PR early and iterate with us; reviewers would much rather give feedback on a half-finished branch than a fully-merged one.
---
## Examples and community contributions
## Examples and Community Contributions
Check out these example policy implementations:
- [DiTFlow Policy](https://github.com/danielsanjosepro/lerobot_policy_ditflow) Diffusion Transformer policy with flow-matching objective. Try it out in this example: [DiTFlow Example](https://github.com/danielsanjosepro/test_lerobot_policy_ditflow)
- [DiTFlow Policy](https://github.com/danielsanjosepro/lerobot_policy_ditflow) - Diffusion Transformer policy with flow-matching objective. Try it out in this example: [DiTFlow Example](https://github.com/danielsanjosepro/test_lerobot_policy_ditflow)
Thanks for taking the time to bring a new policy into LeRobot. Every architecture that lands in `main` — and every plugin published by the community — makes the library a little more useful for the next person, and a little more representative of where robot learning is going. We're looking forward to seeing what you ship. 🤗
Share your policy implementations with the community! 🤗
-139
View File
@@ -1,139 +0,0 @@
# Cheat sheet
All of the LeRobot commands in one place. If you forgot how to use a specific command or want to learn about a new one you can do it here.
> [!WARNING]
> For all of the commands listed below remember to change the ports/names/ids to your own values!
> [!TIP]
> Another great way to look at all the commands and get them configured for your specific setup is to use this [Jupyter Notebook](https://github.com/huggingface/lerobot/blob/main/examples/notebooks/quickstart.ipynb).
### Setup and installation
For installation please look at [LeRobot Installation](https://huggingface.co/docs/lerobot/main/en/installation).
### Useful tools
###### Find port
Use this to identify which serial ports your robots are connected to. Follow the instructions in your terminal: you will be asked to unplug the USB cable and press Enter. The script will then detect and print the correct serial port for that robot.
```bash
lerobot-find-port
```
###### Find cameras
Quickly find camera indices and verify their output. This command prints camera information to the terminal and saves test frames from each detected camera to `lerobot/outputs/captured_images`
```bash
lerobot-find-cameras
```
### Calibration
In most cases you will need to perform calibration just once for each robot and teleoperation device. Before performing the calibration make sure that all the joints are roughly in the middle position.
```bash
lerobot-calibrate \
--robot.type=so101_follower \
--robot.port=/dev/ttyACM0 \
--robot.id=my_follower_arm
```
Make sure that you use the same IDs used during calibration later for the other scripts. That's how LeRobot finds the calibration files.
### Teleoperation
Teleoperating with two cameras and displaying the data with Rerun.
```bash
lerobot-teleoperate \
--robot.type=so101_follower \
--robot.port=/dev/ttyACM0 \
--robot.id=my_follower_arm \
--robot.cameras="{ top: {type: opencv, index_or_path: 1, width: 640, height: 480, fps: 30}, wrist: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30} }" \
--teleop.type=so101_leader \
--teleop.port=/dev/ttyACM1 \
--teleop.id=my_leader_arm \
--display_data=true
```
### Recording a dataset
The dataset is automatically uploaded to the server and saved under repo_id, make sure you are logged in to your HF account with CLI:
`hf auth login`
You can get the token from: [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
```bash
lerobot-record \
--robot.type=so101_follower \
--robot.port=/dev/ttyACM0 \
--robot.id=my_follower_arm \
--robot.cameras="{ top: {type: opencv, index_or_path: 1, width: 640, height: 480, fps: 30}, wrist: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30} }" \
--teleop.type=so101_leader \
--teleop.port=/dev/ttyACM1 \
--teleop.id=my_leader_arm \
--dataset.repo_id=${HF_USER}/so101_dataset_test \
--dataset.num_episodes=30 \
--dataset.single_task="put the red brick in a bowl" \
--dataset.streaming_encoding=true \
--display_data=true
```
While collecting the dataset you can control the process with your keyboard:
Control the data recording flow using keyboard shortcuts:
- Press **Right Arrow (`→`)**: Save episode and move to the next.
- Press **Left Arrow (`←`)**: Delete current episode and retry.
- Press **Escape (`ESC`)**: Stop, encode videos, and upload.
### Training
Depending on your hardware training the policy might take a few hours. That's how you train simple `ACT` policy:
```bash
lerobot-train \
--dataset.repo_id=${HF_USER}/so101_dataset_test \
--policy.type=act \
--output_dir=outputs/train/act_so101_test \
--job_name=act_so101_test \
--policy.device=cuda \
--wandb.enable=true \
--policy.repo_id=${HF_USER}/policy_test \
--steps=20000
```
- Policy Types: `act`, `diffusion`, `smolvla`, `pi05`
- Devices: `cuda` (NVIDIA), `mps` (Apple Silicon), `cpu`
If you want to fine-tune a specific model you can provide the path to the model. In this case path is enough and type can be skipped.
```bash
lerobot-train \
--dataset.repo_id=${HF_USER}/so101_dataset_test \
--policy.path=username/the_policy_to_finetune \
--policy.device=cuda \
--policy.repo_id=${HF_USER}/policy_test \
--output_dir=outputs/train/act_so101_test \
--steps=20000
```
### Inference
Inference means running the trained policy/model on a robot. For that we use `lerobot-rollout`. You will need to provide a path to your policy. It can be a local path or a path to Hugging Face for example "lerobot/folding_latest". Your cameras configuration needs to match what was used when collecting the dataset. Duration is in seconds if unspecified, it will run forever.
> [!TIP]
> If you are using the previous release V0.5.1 instead of `lerobot-rollout` you need to use `lerobot-record`. More information [here](https://huggingface.co/docs/lerobot/v0.5.1/en/il_robots#run-inference-and-evaluate-your-policy).
```bash
lerobot-rollout \
--strategy.type=base \
--policy.path=${HF_USER}/my_policy \
--robot.type=so101_follower \
--robot.port=/dev/ttyACM1 \
--robot.cameras="{ up: {type: opencv, index_or_path: /dev/video1, width: 640, height: 480, fps: 30}, side: {type: opencv, index_or_path: /dev/video5, width: 640, height: 480, fps: 30}}" \
--task="Put lego brick into the transparent box" \
--duration=60
```
+277
View File
@@ -0,0 +1,277 @@
# Using Subtasks in LeRobot Datasets
Subtask support in robotics datasets has proven effective in improving robot reasoning and understanding. Subtasks are particularly useful for:
- **Hierarchical policies**: Building policies that include subtask predictions to visualize robot reasoning in real time
- **Reward modeling**: Helping reward models understand task progression (e.g., SARM-style stage-aware reward models)
- **Task decomposition**: Breaking down complex manipulation tasks into atomic, interpretable steps
LeRobotDataset now supports subtasks as part of its dataset structure, alongside tasks.
## What are Subtasks?
While a **task** describes the overall goal (e.g., "Pick up the apple and place it in the basket"), **subtasks** break down the execution into finer-grained steps:
1. "Approach the apple"
2. "Grasp the apple"
3. "Lift the apple"
4. "Move to basket"
5. "Release the apple"
Each frame in the dataset can be annotated with its corresponding subtask, enabling models to learn and predict these intermediate stages.
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/subtask-asset.png"
alt="An overview of subtask annotation showing how frames are labeled with intermediate subtask stages"
width="80%"
/>
<p>
<em>Figure: Overview of subtask annotation.</em>
</p>
**Reference:** _Subtask-learning based for robot self-assembly in flexible collaborative assembly in manufacturing_, Original Article, Published: 19 April 2022.
## Dataset Structure
Subtask information is stored in the dataset metadata:
```
my-dataset/
├── data/
│ └── ...
├── meta/
│ ├── info.json
│ ├── stats.json
│ ├── tasks.parquet
│ ├── subtasks.parquet # Subtask index → subtask string mapping
│ └── episodes/
│ └── ...
└── videos/
└── ...
```
### Subtasks Parquet File
The `meta/subtasks.parquet` file maps subtask indices to their natural language descriptions:
| subtask_index | subtask (index column) |
| ------------- | ---------------------- |
| 0 | "Approach the apple" |
| 1 | "Grasp the apple" |
| 2 | "Lift the apple" |
| ... | ... |
### Frame-Level Annotations
Each frame in the dataset can include a `subtask_index` field that references the subtasks parquet file:
```python
# Example frame data in the parquet file
{
"index": 42,
"timestamp": 1.4,
"episode_index": 0,
"task_index": 0,
"subtask_index": 2, # References "Lift the apple"
"observation.state": [...],
"action": [...],
}
```
## Annotating Datasets with Subtasks
We provide a HuggingFace Space for easily annotating any LeRobotDataset with subtasks:
**[https://huggingface.co/spaces/lerobot/annotate](https://huggingface.co/spaces/lerobot/annotate)**
After completing your annotation:
1. Click "Push to Hub" to upload your annotated dataset
2. You can also run the annotation space locally by following the instructions at [github.com/huggingface/lerobot-annotate](https://github.com/huggingface/lerobot-annotate)
## Loading Datasets with Subtasks
When you load a dataset with subtask annotations, the subtask information is automatically available:
```python
from lerobot.datasets import LeRobotDataset
# Load a dataset with subtask annotations
dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
# Access a sample
sample = dataset[100]
# The sample includes both task and subtask information
print(sample["task"]) # "Collect the fruit"
print(sample["subtask"]) # "Grasp the apple"
print(sample["task_index"]) # tensor(0)
print(sample["subtask_index"]) # tensor(2)
```
### Checking for Subtask Support
You can check if a dataset has subtask annotations:
```python
# Check if subtasks are available
has_subtasks = (
"subtask_index" in dataset.features
and dataset.meta.subtasks is not None
)
if has_subtasks:
print(f"Dataset has {len(dataset.meta.subtasks)} unique subtasks")
print("Subtasks:", list(dataset.meta.subtasks.index))
```
## Using Subtasks for Training
### With the Tokenizer Processor
The `TokenizerProcessor` automatically handles subtask tokenization for Vision-Language Action (VLA) models:
```python
from lerobot.processor import TokenizerProcessorStep
# Create a tokenizer processor step
tokenizer_processor = TokenizerProcessorStep(
tokenizer_name_or_path="google/paligemma-3b-pt-224",
padding="max_length",
max_length=64,
)
# The processor will automatically tokenize subtasks if present in the batch
# and add them to the observation under:
# - "observation.subtask.tokens"
# - "observation.subtask.attention_mask"
```
When subtasks are available in the batch, the tokenizer processor adds:
- `observation.subtask.tokens`: Tokenized subtask text
- `observation.subtask.attention_mask`: Attention mask for the subtask tokens
### DataLoader with Subtasks
```python
import torch
from lerobot.datasets import LeRobotDataset
dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=16,
shuffle=True,
)
for batch in dataloader:
# Access subtask information in the batch
subtasks = batch["subtask"] # List of subtask strings
subtask_indices = batch["subtask_index"] # Tensor of subtask indices
# Use for training hierarchical policies or reward models
print(f"Batch subtasks: {set(subtasks)}")
```
## Example Datasets with Subtask Annotations
Try loading a dataset with subtask annotations:
```python
from lerobot.datasets import LeRobotDataset
# Example dataset with subtask annotations
dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
# Explore the subtasks
print("Available subtasks:")
for subtask_name in dataset.meta.subtasks.index:
print(f" - {subtask_name}")
# Get subtask distribution
subtask_counts = {}
for i in range(len(dataset)):
sample = dataset[i]
subtask = sample["subtask"]
subtask_counts[subtask] = subtask_counts.get(subtask, 0) + 1
print("\nSubtask distribution:")
for subtask, count in sorted(subtask_counts.items(), key=lambda x: -x[1]):
print(f" {subtask}: {count} frames")
```
## Use Cases
### 1. Hierarchical Policy Training
Train policies that predict both actions and current subtask:
```python
class HierarchicalPolicy(nn.Module):
def __init__(self, num_subtasks):
super().__init__()
self.action_head = nn.Linear(hidden_dim, action_dim)
self.subtask_head = nn.Linear(hidden_dim, num_subtasks)
def forward(self, observations):
features = self.encoder(observations)
actions = self.action_head(features)
subtask_logits = self.subtask_head(features)
return actions, subtask_logits
```
### 2. Stage-Aware Reward Modeling (SARM)
Build reward models that understand task progression:
```python
# SARM predicts:
# - Stage: Which subtask is being executed (discrete)
# - Progress: How far along the subtask (continuous 0-1)
class SARMRewardModel(nn.Module):
def forward(self, observations):
features = self.encoder(observations)
stage_logits = self.stage_classifier(features)
progress = self.progress_regressor(features)
return stage_logits, progress
```
### 3. Progress Visualization
Monitor robot execution by tracking subtask progression:
```python
def visualize_execution(model, observations):
for t, obs in enumerate(observations):
action, subtask_logits = model(obs)
predicted_subtask = subtask_names[subtask_logits.argmax()]
print(f"t={t}: Executing '{predicted_subtask}'")
```
## API Reference
### LeRobotDataset Properties
| Property | Type | Description |
| --------------------------- | ---------------------- | ------------------------------------------ |
| `meta.subtasks` | `pd.DataFrame \| None` | DataFrame mapping subtask names to indices |
| `features["subtask_index"]` | `dict` | Feature spec for subtask_index if present |
### Sample Keys
When subtasks are available, each sample includes:
| Key | Type | Description |
| --------------- | -------------- | ------------------------------------ |
| `subtask_index` | `torch.Tensor` | Integer index of the current subtask |
| `subtask` | `str` | Natural language subtask description |
## Related Resources
- [SARM Paper](https://arxiv.org/pdf/2509.25358) - Stage-Aware Reward Modeling for Long Horizon Robot Manipulation
- [LeRobot Annotate Space](https://huggingface.co/spaces/lerobot/annotate) - Interactive annotation tool
- [LeRobotDataset v3.0](./lerobot-dataset-v3) - Dataset format documentation
+1 -1
View File
@@ -194,7 +194,7 @@ lerobot-record \
--dataset.single_task="Navigate around obstacles" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--display_data=true
```
-168
View File
@@ -1,168 +0,0 @@
# EO-1
EO-1 is a **Vision-Language-Action policy for robot control**. The LeRobot implementation integrates EO-1 with the standard LeRobot training, evaluation, processor interface.
## Model Overview
EO-1 uses a Qwen2.5-VL backbone for vision-language understanding and adds a continuous flow-matching action head for robot control. The policy formats each robot-control sample as a multimodal conversation: camera images are passed to Qwen2.5-VL, the robot state is represented with EO-1 state tokens, and the future action chunk is represented with EO-1 action tokens.
<img
src="https://huggingface.co/datasets/HaomingSong/lerobot-documentation-images/resolve/main/lerobot/eo_pipeline.png"
alt="An overview of EO-1"
width="85%"
/>
During training, EO-1 learns to denoise continuous action chunks at the action-token positions. During inference, it samples an action chunk, returns continuous actions, and executes `n_action_steps` from the chunk before sampling again.
### What the LeRobot Integration Covers
- Standard `policy.type=eo1` configuration through LeRobot
- Qwen2.5-VL image and text preprocessing through policy processors
- Continuous flow-matching action prediction
- Checkpoint save/load through LeRobot policy APIs
- Training with `lerobot-train` and evaluation with `lerobot-eval`
The broader EO-1 project also includes interleaved vision-text-action pretraining and multimodal reasoning workflows. This page focuses on the LeRobot robot-control policy path.
## Installation Requirements
1. Install LeRobot by following the [Installation Guide](./installation).
2. Install EO-1 dependencies by running:
```bash
pip install -e ".[eo1]"
```
3. If you want to train or evaluate on LIBERO, install the LIBERO dependencies too:
```bash
pip install -e ".[eo1,libero]"
```
EO-1 can use the standard PyTorch scaled-dot-product attention backend through `policy.attn_implementation=sdpa`. If your environment has a compatible `flash_attn` installation, you can request `policy.attn_implementation=flash_attention_2`.
## Data Requirements
EO-1 expects a LeRobot dataset with:
- At least one visual observation, for example `observation.images.image`
- `observation.state`
- `action`
- A language task instruction through the dataset `task` field
If your dataset uses different observation names, use `rename_map` to align them with the names expected by your training or evaluation setup.
## Usage
To use EO-1 in a LeRobot configuration, specify the policy type as:
```python
policy.type=eo1
```
By default, a new EO-1 policy initializes its backbone from:
```python
policy.vlm_base=Qwen/Qwen2.5-VL-3B-Instruct
```
Once a LeRobot-format EO-1 checkpoint is available, load it with:
```python
policy.path=your-org/your-eo1-checkpoint
```
## Training
### Training Command Example
```bash
lerobot-train \
--dataset.repo_id=your_org/your_dataset \
--policy.type=eo1 \
--policy.vlm_base=Qwen/Qwen2.5-VL-3B-Instruct \
--policy.dtype=bfloat16 \
--policy.attn_implementation=sdpa \
--policy.gradient_checkpointing=false \
--output_dir=./outputs/eo1_training \
--job_name=eo1_training \
--steps=300000 \
--batch_size=16 \
--policy.device=cuda
```
### Key Training Parameters
| Parameter | Default | Description |
| -------------------------------------- | ----------------------------- | ----------------------------------------------------------------------- |
| `policy.vlm_base` | `Qwen/Qwen2.5-VL-3B-Instruct` | Qwen2.5-VL checkpoint used to initialize a new policy |
| `policy.dtype` | `auto` | Backbone dtype request: `auto`, `bfloat16`, or `float32` |
| `policy.attn_implementation` | `None` | Optional Qwen attention backend, such as `sdpa` |
| `policy.gradient_checkpointing` | `false` | Reduces memory usage during training |
| `policy.chunk_size` | `8` | Number of future actions predicted per chunk |
| `policy.n_action_steps` | `8` | Number of actions consumed from a sampled chunk |
| `policy.num_denoise_steps` | `10` | Number of flow-matching denoising steps used during sampling |
| `policy.max_state_dim` | `32` | State padding dimension |
| `policy.max_action_dim` | `32` | Action padding dimension |
| `policy.force_fp32_autocast` | `true` | Keeps the flow head in fp32 even when the backbone uses mixed precision |
| `policy.supervise_padding_action_dims` | `true` | Controls whether padded action dimensions are supervised |
| `policy.supervise_padding_actions` | `true` | Controls whether padded future action rows are supervised |
## Evaluation
EO-1 can be evaluated through `lerobot-eval` once you have a LeRobot-format checkpoint:
```bash
lerobot-eval \
--policy.path=your-org/your-eo1-checkpoint \
--env.type=libero \
--env.task=libero_object \
--eval.batch_size=1 \
--eval.n_episodes=20
```
For datasets or environments whose camera names differ from the checkpoint configuration, pass a `rename_map`:
```bash
lerobot-eval \
--policy.path=your-org/your-eo1-checkpoint \
--env.type=libero \
--env.task=libero_object \
--rename_map='{"observation.images.image2":"observation.images.wrist_image"}'
```
## Configuration Notes
### Image Processing
EO-1 uses the Qwen2.5-VL processor. The `policy.image_min_pixels` and `policy.image_max_pixels` settings control the image resizing bounds before the visual tokens are passed into the backbone.
### State and Action Dimensions
The policy pads state and action vectors to `policy.max_state_dim` and `policy.max_action_dim` before the EO-1 flow head. Predictions are cropped back to the original action dimension before being returned by the policy.
### Attention Backend
Use `policy.attn_implementation=sdpa` for a portable setup. Use `flash_attention_2` only when `flash_attn` is installed and compatible with your environment.
## References
- [EO-1 project](https://github.com/EO-Robotics/EO1)
- [EO-1 paper](https://arxiv.org/abs/2508.21112)
- [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
## Citation
```bibtex
@article{eo1,
title={EO-1: Interleaved Vision-Text-Action Pretraining for General Robot Control},
author={Delin Qu and Haoming Song and Qizhi Chen and Zhaoqing Chen and Xianqiang Gao and Xinyi Ye and Qi Lv and Modi Shi and Guanghui Ren and Cheng Ruan and Maoqing Yao and Haoran Yang and Jiacheng Bao and Bin Zhao and Dong Wang},
journal={arXiv preprint},
year={2025},
url={https://arxiv.org/abs/2508.21112}
}
```
## License
This LeRobot integration follows the **Apache 2.0 License** used by LeRobot. Check the upstream EO-1 model and dataset pages for the licenses of released EO-1 checkpoints and data.
+6 -6
View File
@@ -105,12 +105,10 @@ These results demonstrate GR00T's strong generalization capabilities across dive
### Evaluate in your hardware setup
Once you have trained your model using your parameters you can run inference in your downstream task. Follow the instructions in [Policy Deployment (lerobot-rollout)](./inference). For example:
Once you have trained your model using your parameters you can run inference in your downstream task. Follow the instructions in [Imitation Learning for Robots](./il_robots). For example:
```bash
lerobot-rollout\
--strategy.type=sentry \
--strategy.upload_every_n_episodes=5 \
lerobot-record \
--robot.type=bi_so_follower \
--robot.left_arm_port=/dev/ttyACM1 \
--robot.right_arm_port=/dev/ttyACM0 \
@@ -121,12 +119,14 @@ lerobot-rollout\
}' \
--display_data=true \
--dataset.repo_id=<user>/eval_groot-bimanual \
--dataset.num_episodes=10 \
--dataset.single_task="Grab and handover the red cube to the other arm" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--policy.path=<user>/groot-bimanual \ # your trained model
--duration=600
--dataset.episode_time_s=30 \
--dataset.reset_time_s=10
```
## License
-98
View File
@@ -1,98 +0,0 @@
# Compute HW Guide for LeRobot Training
Rough sizing for training a LeRobot policy: how much VRAM each policy needs, what training time looks like, and where to run when local hardware isn't enough.
The numbers below are **indicative** — order-of-magnitude figures for picking hardware, not exact predictions. Throughput depends heavily on dataset I/O, image resolution, batch size, and number of GPUs.
## Memory by policy group
Policies cluster by backbone size; the groupings below give a single VRAM envelope per group instead of repeating numbers per policy. Memory scales roughly linearly with batch size; AdamW (the LeRobot default) carries optimizer state that adds ~30100% over a forward+backward pass alone.
| Group | Policies | Peak VRAM (BS 8, AdamW) | Suitable starter GPUs |
| ---------- | ------------------------------------------- | ----------------------: | --------------------------------- |
| Light BC | `act`, `vqbet`, `tdmpc` | ~26GB | Laptop GPU (RTX 3060), L4, A10G |
| Diffusion | `diffusion`, `multi_task_dit` | ~814GB | RTX 4070+ / L4 / A10G |
| Small VLA | `smolvla` | ~1016GB | RTX 4080+ / L4 / A10G |
| Large VLA | `pi0`, `pi0_fast`, `pi05`, `xvla`, `wall_x` | ~2440GB | A100 40 GB+ (24 GB tight at BS 1) |
| Multimodal | `groot`, `eo1` | ~2440GB | A100 40 GB+ |
| RL | `sac` | config-dep. | See [HIL-SERL guide](./hilserl) |
Memory-bound? Drop the batch size (~linear), use gradient accumulation to recover effective batch, or for SmolVLA leave `freeze_vision_encoder=True`.
## Training time
Robotics imitation learning typically converges in **510 epochs over the dataset**, not hundreds of thousands of raw steps. Once you know your epoch count, wall-clock is essentially:
```text
total_frames = sum of frames over all episodes # 50 ep × 30 fps × 30 s ≈ 45,000
steps_per_epoch = ceil(total_frames / (num_gpus × batch_size))
total_steps = epochs × steps_per_epoch
wall_clock ≈ total_steps × per_step_time
```
Per-step time depends on the policy and the GPU. The numbers in the table below are anchors — pick the row closest to your setup and scale linearly with `total_steps` if you train longer or shorter.
### Common scenarios
Indicative wall-clock for **5 epochs on a ~50-episode dataset (~45k frames at 30 fps × 30 s)**, default optimizer (AdamW), 640×480 images:
| Setup | Policy | Batch | Wall-clock |
| ------------------------------------ | -------------- | ----- | ---------: |
| Single RTX 4090 / RTX 3090 (24 GB) | `act` | 8 | ~3060min |
| Single RTX 4090 / RTX 3090 (24 GB) | `diffusion` | 8 | ~24h |
| Single L4 / A10G (24 GB) | `act` | 8 | ~12h |
| Single L4 / A10G (24 GB) | `smolvla` | 4 | ~36h |
| Single A100 40 GB | `smolvla` | 16 | ~12h |
| Single A100 40 GB | `pi0` / `pi05` | 4 | ~48h |
| 4× H100 80 GB cluster (`accelerate`) | `diffusion` | 32 | ~3060min |
| 4× H100 80 GB cluster (`accelerate`) | `smolvla` | 32 | ~12h |
| Apple Silicon M1/M2/M3 Max (MPS) | `act` | 4 | ~614h |
These are order-of-magnitude figures. Real runs deviate by ±50% depending on image resolution, dataset I/O, dataloader threading, and exact GPU SKU. They are useful as "is this run going to take an hour or a day?" intuition, not as SLAs.
### Multi-GPU matters a lot
`accelerate launch --num_processes=N` is the easiest way to cut training time. Each optimizer step processes `N × batch_size` samples in roughly the same wall-clock as a single-GPU step, so 4 GPUs ≈ 4× speedup for compute-bound runs. See the [Multi GPU training](./multi_gpu_training) guide for the full setup.
Reference data points on a 4×H100 80 GB cluster (`accelerate launch --num_processes=4`), 5000 steps, batch 32, AdamW, dataset [`imstevenpmwork/super_poulain_draft`](https://huggingface.co/datasets/imstevenpmwork/super_poulain_draft) (~50 episodes, ~640×480 images):
| Policy | Wall-clock | `update_s` | `dataloading_s` | GPU util | Notable flags |
| ----------- | ---------- | ---------: | --------------: | -------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `diffusion` | 16m 17s | 0.167 | 0.015 | ~90% | defaults (training from scratch) |
| `smolvla` | 27m 49s | 0.312 | 0.011 | ~80% | `--policy.path=lerobot/smolvla_base`, `freeze_vision_encoder=false`, `train_expert_only=false` |
| `pi05` | 3h 41m | 2.548 | 0.014 | ~95% | `--policy.pretrained_path=lerobot/pi05_base`, `gradient_checkpointing=true`, `dtype=bfloat16`, vision encoder + expert trained |
The `dataloading_s` vs. `update_s` ratio is the diagnostic that matters: when `dataloading_s` approaches `update_s`, more GPUs stop helping — your dataloader is the bottleneck and you should look at `--num_workers`, image resolution, and disk speed before adding compute.
### Schedule and checkpoints
If you shorten training (e.g. 5k10k steps on a small dataset), also shorten the LR schedule with `--policy.scheduler_decay_steps≈--steps`. Otherwise the LR stays near its peak and never decays. Same for `--save_freq`.
## Where to run
VRAM is the first filter. Within a tier, pick by budget and availability — the `$``$$$$` columns are relative; check current pricing on the provider you actually use.
| Class | VRAM | Tier | Comfortable for |
| -------------------------- | ----- | ------ | ----------------------------------------------------------- |
| RTX 3090 / 4090 (consumer) | 24 GB | `$` | Light BC, Diffusion, SmolVLA. Tight for VLAs at batch 1. |
| L4 / A10G (cloud) | 24 GB | `$$$` | Same envelope; common on Google Cloud, RunPod, AWS `g5/g6`. |
| A100 40 GB | 40 GB | `$$$` | Any policy at reasonable batch sizes. |
| A100 80 GB / H100 80 GB | 80 GB | `$$$$` | Multi-GPU clusters; large batches for VLAs. |
| **CPU only** | — | — | Don't train. Use Colab or rent a GPU. |
### Hugging Face Jobs
[Hugging Face Jobs](https://huggingface.co/docs/hub/jobs) lets you run training on managed HF infrastructure, billed by the second. The repo publishes a ready-to-use image: **`huggingface/lerobot-gpu:latest`**, rebuilt **every night at 02:00 UTC from `main`** ([`docker_publish.yml`](https://github.com/huggingface/lerobot/blob/main/.github/workflows/docker_publish.yml)) — so it tracks the current state of the repo, not a tagged release.
```bash
hf jobs run --flavor a10g-large huggingface/lerobot-gpu:latest \
bash -c "nvidia-smi && lerobot-train \
--policy.type=act --dataset.repo_id=<USER>/<DATASET> \
--policy.repo_id=<USER>/act_<task> --batch_size=8 --steps=50000"
```
Notes:
- The leading `nvidia-smi` is a quick sanity check that CUDA is visible inside the container — useful to fail fast if the flavor or driver mismatched.
- The default Job timeout is 30 minutes; pass `--timeout 4h` (or longer) for real training.
- `--flavor` maps onto the table above: `t4-small`/`t4-medium` (T4, ACT only), `l4x1`/`l4x4` (L4 24 GB), `a10g-small/large/largex2/largex4` (A10G 24 GB scaled out), `a100-large` (A100). For the current full catalogue + pricing see [https://huggingface.co/docs/hub/jobs](https://huggingface.co/docs/hub/jobs).
+37 -40
View File
@@ -62,7 +62,7 @@ pip install -e ".[hilserl]"
### Understanding Configuration
The training process begins with proper configuration for the HILSERl environment. The main configuration class is `GymManipulatorConfig` in `lerobot/rl/gym_manipulator.py`, which contains nested `HILSerlRobotEnvConfig` (defined in `lerobot/envs/configs.py`) and `DatasetConfig`. The configuration is organized into focused, nested sub-configs:
The training process begins with proper configuration for the HILSerl environment. The main configuration class is `GymManipulatorConfig` in `lerobot/rl/gym_manipulator.py`, which contains nested `HILSerlRobotEnvConfig` and `DatasetConfig`. The configuration is organized into focused, nested sub-configs:
<!-- prettier-ignore-start -->
```python
@@ -95,7 +95,6 @@ class HILSerlProcessorConfig:
class ObservationConfig:
add_joint_velocity_to_observation: bool = False # Add joint velocities to state
add_current_to_observation: bool = False # Add motor currents to state
add_ee_pose_to_observation: bool = False # Add end-effector pose to state
display_cameras: bool = False # Display camera feeds during execution
class ImagePreprocessingConfig:
@@ -327,22 +326,14 @@ lerobot-find-joint-limits \
Max joint positions [-20.0, -20.0, -20.0, -20.0, -20.0, -20.0]
Min joint positions [50.0, 50.0, 50.0, 50.0, 50.0, 50.0]
```
3. Use these values in your environment configuration under `env.processor.inverse_kinematics.end_effector_bounds` (see `InverseKinematicsConfig` in `lerobot/envs/configs.py`)
3. Use these values in the configuration of your teleoperation device (TeleoperatorConfig) under the `end_effector_bounds` field
**Example Configuration**
```json
{
"env": {
"processor": {
"inverse_kinematics": {
"end_effector_bounds": {
"max": [0.24, 0.2, 0.1],
"min": [0.16, -0.08, 0.03]
}
}
}
}
"end_effector_bounds": {
"max": [0.24, 0.20, 0.10],
"min": [0.16, -0.08, 0.03]
}
```
@@ -413,24 +404,30 @@ We support using a gamepad or a keyboard or the leader arm of the robot.
HIL-Serl learns actions in the end-effector space of the robot. Therefore, the teleoperation will control the end-effector's x,y,z displacements.
The end-effector transformation is applied by the processor pipeline (`InverseKinematicsRLStep`, `EEBoundsAndSafety`, `EEReferenceAndDelta`, `GripperVelocityToJoint`) configured under `env.processor.inverse_kinematics` (`InverseKinematicsConfig`) and `env.processor.gripper` / `env.processor.max_gripper_pos`. The defaults related to the end-effector space are:
For that we need to define a version of the robot that takes actions in the end-effector space. Check the robot class `SO100FollowerEndEffector` and its configuration `SO100FollowerEndEffectorConfig` for the default parameters related to the end-effector space.
<!-- prettier-ignore-start -->
```python
class InverseKinematicsConfig:
"""Configuration for inverse kinematics processing."""
class SO100FollowerEndEffectorConfig(SO100FollowerConfig):
"""Configuration for the SO100FollowerEndEffector robot."""
urdf_path: str | None = None
target_frame_name: str | None = None
# bounds for the end-effector in x,y,z direction
end_effector_bounds: dict[str, list[float]] | None = None
# maximum step size for the end-effector in x,y,z direction
end_effector_step_sizes: dict[str, float] | None = None
# Default bounds for the end-effector position (in meters)
end_effector_bounds: dict[str, list[float]] = field( # bounds for the end-effector in x,y,z direction
default_factory=lambda: {
"min": [-1.0, -1.0, -1.0], # min x, y, z
"max": [1.0, 1.0, 1.0], # max x, y, z
}
)
class HILSerlProcessorConfig:
...
# maximum gripper position that the gripper will be open at
max_gripper_pos: float | None = 100.0
max_gripper_pos: float = 50 # maximum gripper position that the gripper will be open at
end_effector_step_sizes: dict[str, float] = field( # maximum step size for the end-effector in x,y,z direction
default_factory=lambda: {
"x": 0.02,
"y": 0.02,
"z": 0.02,
}
)
```
<!-- prettier-ignore-end -->
@@ -609,11 +606,11 @@ This guide explains how to train a reward classifier for human-in-the-loop reinf
**Note**: Training a reward classifier is optional. You can start the first round of RL experiments by annotating the success manually with your gamepad or keyboard device.
The reward classifier implementation in `lerobot/rewards/classifier/modeling_classifier.py` uses a pretrained vision model to process the images. It can output either a single value for binary rewards to predict success/fail cases or multiple values for multi-class settings.
The reward classifier implementation in `modeling_classifier.py` uses a pretrained vision model to process the images. It can output either a single value for binary rewards to predict success/fail cases or multiple values for multi-class settings.
**Collecting a Dataset for the reward classifier**
Before training, you need to collect a dataset with labeled examples. Setting `mode: "record"` in your config and running `gym_manipulator.py` enables the process of collecting a dataset of observations, actions, and rewards.
Before training, you need to collect a dataset with labeled examples. The `record_dataset` function in `gym_manipulator.py` enables the process of collecting a dataset of observations, actions, and rewards.
To collect a dataset, you need to modify some parameters in the environment configuration based on HILSerlRobotEnvConfig.
@@ -661,7 +658,7 @@ Example configuration section for data collection:
},
"dataset": {
"repo_id": "hf_username/dataset_name",
"root": "data/your_dataset",
"dataset_root": "data/your_dataset",
"task": "reward_classifier_task",
"num_episodes_to_record": 20,
"replay_episode": null,
@@ -674,7 +671,7 @@ Example configuration section for data collection:
**Reward Classifier Configuration**
The reward classifier is configured using `lerobot/rewards/classifier/configuration_classifier.py`. Here are the key parameters:
The reward classifier is configured using `configuration_classifier.py`. Here are the key parameters:
- **model_name**: Base model architecture (e.g., we mainly use `"helper2424/resnet10"`)
- **model_type**: `"cnn"` or `"transformer"`
@@ -692,7 +689,7 @@ Example configuration for training the [reward classifier](https://huggingface.c
"repo_id": "hf_username/dataset_name",
"root": null
},
"reward_model": {
"policy": {
"type": "reward_classifier",
"model_name": "helper2424/resnet10",
"model_type": "cnn",
@@ -702,6 +699,7 @@ Example configuration for training the [reward classifier](https://huggingface.c
"dropout_rate": 0.1,
"learning_rate": 1e-4,
"device": "cuda",
"use_amp": true,
"input_features": {
"observation.images.front": {
"type": "VISUAL",
@@ -820,14 +818,13 @@ The LeRobot system uses a distributed actor-learner architecture for training. T
**Configuration Setup**
Create a training configuration file (example available [here](https://huggingface.co/datasets/lerobot/config_examples/resolve/main/rl/train_config.json)). The training config is based on the main `TrainRLServerPipelineConfig` class in `lerobot/rl/train_rl.py`.
Create a training configuration file (example available [here](https://huggingface.co/datasets/lerobot/config_examples/resolve/main/rl/train_config.json)). The training config is based on the main `TrainRLServerPipelineConfig` class in `lerobot/configs/train.py`.
1. Configure the policy settings (`type="gaussian_actor"`, `device`, etc.)
2. Configure the algorithm settings under the top-level `algorithm` block (`type="sac"`, learning rates, discount, etc., defined in `lerobot/rl/algorithms/sac/configuration_sac.py`).
3. Set `dataset` to your cropped dataset
4. Configure environment settings with crop parameters
5. Check the other parameters related to the Gaussian Actor in [configuration_gaussian_actor.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/gaussian_actor/configuration_gaussian_actor.py#L79).
6. Verify that the `policy` config is correct with the right `input_features` and `output_features` for your task.
1. Configure the policy settings (`type="sac"`, `device`, etc.)
2. Set `dataset` to your cropped dataset
3. Configure environment settings with crop parameters
4. Check the other parameters related to SAC in [configuration_sac.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/sac/configuration_sac.py#L79).
5. Verify that the `policy` config is correct with the right `input_features` and `output_features` for your task.
**Starting the Learner**
@@ -929,7 +926,7 @@ The ideal behaviour is that your intervention rate should drop gradually during
Some configuration values have a disproportionate impact on training stability and speed:
- **`temperature_init`** (`algorithm.temperature_init`) initial entropy temperature in SAC. Higher values encourage more exploration; lower values make the policy more deterministic early on. A good starting point is `1e-2`. We observed that setting it too high can make human interventions ineffective and slow down learning.
- **`temperature_init`** (`policy.temperature_init`) initial entropy temperature in SAC. Higher values encourage more exploration; lower values make the policy more deterministic early on. A good starting point is `1e-2`. We observed that setting it too high can make human interventions ineffective and slow down learning.
- **`policy_parameters_push_frequency`** (`policy.actor_learner_config.policy_parameters_push_frequency`) interval in _seconds_ between two weight pushes from the learner to the actor. The default is `4 s`. Decrease to **1-2 s** to provide fresher weights (at the cost of more network traffic); increase only if your connection is slow, as this will reduce sample efficiency.
- **`storage_device`** (`policy.storage_device`) device on which the learner keeps the policy parameters. If you have spare GPU memory, set this to `"cuda"` (instead of the default `"cpu"`). Keeping the weights on-GPU removes CPU→GPU transfer overhead and can significantly increase the number of learner updates per second.
+2 -2
View File
@@ -232,7 +232,7 @@ lerobot-record \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--display_data=true
```
@@ -278,6 +278,6 @@ lerobot-record \
--dataset.num_episodes=10 \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model
```
+104 -206
View File
@@ -68,13 +68,13 @@ from lerobot.teleoperators.so_leader import SO101Leader, SO101LeaderConfig
from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig
robot_config = SO101FollowerConfig(
port="/dev/tty.usbmodem5AB90687491",
id="my_follower_arm",
port="/dev/tty.usbmodem58760431541",
id="my_red_robot_arm",
)
teleop_config = SO101LeaderConfig(
port="/dev/tty.usbmodem5AB90689011",
id="my_leader_arm",
port="/dev/tty.usbmodem58760431551",
id="my_blue_leader_arm",
)
robot = SO101Follower(robot_config)
@@ -108,13 +108,13 @@ With `rerun`, you can teleoperate again while simultaneously visualizing the cam
<hfoption id="Command">
```bash
lerobot-teleoperate \
--robot.type=so101_follower \
--robot.port=/dev/tty.usbmodem5AB90687491 \
--robot.id=my_follower_arm \
--robot.cameras="{front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
--teleop.type=so101_leader \
--teleop.port=/dev/tty.usbmodem5AB90689011 \
--teleop.id=my_leader_arm \
--robot.type=koch_follower \
--robot.port=/dev/tty.usbmodem58760431541 \
--robot.id=my_awesome_follower_arm \
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}}" \
--teleop.type=koch_leader \
--teleop.port=/dev/tty.usbmodem58760431551 \
--teleop.id=my_awesome_leader_arm \
--display_data=true
```
</hfoption>
@@ -122,48 +122,34 @@ lerobot-teleoperate \
<!-- prettier-ignore-start -->
```python
import time
from lerobot.teleoperators.so_leader import SO101Leader, SO101LeaderConfig
from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig
from lerobot.cameras.opencv import OpenCVCameraConfig
from lerobot.utils.visualization_utils import init_rerun, log_rerun_data, shutdown_rerun
from lerobot.teleoperators.koch_leader import KochLeader, KochLeaderConfig
from lerobot.robots.koch_follower import KochFollower, KochFollowerConfig
robot_config = SO101FollowerConfig(
port="/dev/tty.usbmodem5AB90687491",
id="my_follower_arm",
cameras={
"wrist": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=30),
"top": OpenCVCameraConfig(index_or_path=1, width=640, height=480, fps=30)
}
camera_config = {
"front": OpenCVCameraConfig(index_or_path=0, width=1920, height=1080, fps=30)
}
robot_config = KochFollowerConfig(
port="/dev/tty.usbmodem585A0076841",
id="my_red_robot_arm",
cameras=camera_config
)
teleop_config = SO101LeaderConfig(
port="/dev/tty.usbmodem5AB90689011",
id="my_leader_arm",
teleop_config = KochLeaderConfig(
port="/dev/tty.usbmodem58760431551",
id="my_blue_leader_arm",
)
init_rerun(session_name="teleoperation")
robot = SO101Follower(robot_config)
teleop_device = SO101Leader(teleop_config)
robot = KochFollower(robot_config)
teleop_device = KochLeader(teleop_config)
robot.connect()
teleop_device.connect()
TARGET_HZ = 30
TIME_PER_FRAME = 1.0 / TARGET_HZ
while True:
start_time = time.perf_counter()
observation = robot.get_observation()
action = teleop_device.get_action()
robot.send_action(action)
log_rerun_data(observation=observation, action=action)
elapsed_time = time.perf_counter() - start_time
sleep_time = TIME_PER_FRAME - elapsed_time
if sleep_time > 0:
time.sleep(sleep_time)
```
<!-- prettier-ignore-end -->
@@ -207,7 +193,7 @@ lerobot-record \
--dataset.num_episodes=5 \
--dataset.single_task="Grab the black cube" \
--dataset.streaming_encoding=true \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--dataset.encoder_threads=2
```
</hfoption>
@@ -216,11 +202,10 @@ lerobot-record \
<!-- prettier-ignore-start -->
```python
from lerobot.cameras.opencv import OpenCVCameraConfig
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets import LeRobotDataset
from lerobot.utils.feature_utils import hw_to_dataset_features
from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig
from lerobot.teleoperators.so_leader.config_so_leader import SO101LeaderConfig
from lerobot.teleoperators.so_leader.so_leader import SO101Leader
from lerobot.robots.so_follower import SO100Follower, SO100FollowerConfig
from lerobot.teleoperators.so_leader import SO100Leader, SO100LeaderConfig
from lerobot.common.control_utils import init_keyboard_listener
from lerobot.utils.utils import log_say
from lerobot.utils.visualization_utils import init_rerun
@@ -233,56 +218,71 @@ EPISODE_TIME_SEC = 60
RESET_TIME_SEC = 10
TASK_DESCRIPTION = "My task description"
def main():
# Create robot configuration
robot_config = SO101FollowerConfig(
port="/dev/tty.usbmodem5AB90687491",
id="my_follower_arm",
cameras={
"wrist": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=30),
"top": OpenCVCameraConfig(index_or_path=1, width=640, height=480, fps=30)
}
)
# Create robot configuration
robot_config = SO100FollowerConfig(
id="my_awesome_follower_arm",
cameras={
"front": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=FPS) # Optional: fourcc="MJPG" for troubleshooting OpenCV async error.
},
port="/dev/tty.usbmodem58760434471",
)
teleop_config = SO101LeaderConfig(
port="/dev/tty.usbmodem5AB90689011",
id="my_leader_arm",
)
teleop_config = SO100LeaderConfig(
id="my_awesome_leader_arm",
port="/dev/tty.usbmodem585A0077581",
)
# Initialize the robot and teleoperator
robot = SO101Follower(robot_config)
teleop = SO101Leader(teleop_config)
# Initialize the robot and teleoperator
robot = SO100Follower(robot_config)
teleop = SO100Leader(teleop_config)
# Configure the dataset features
action_features = hw_to_dataset_features(robot.action_features, "action")
obs_features = hw_to_dataset_features(robot.observation_features, "observation")
dataset_features = {**action_features, **obs_features}
# Configure the dataset features
action_features = hw_to_dataset_features(robot.action_features, "action")
obs_features = hw_to_dataset_features(robot.observation_features, "observation")
dataset_features = {**action_features, **obs_features}
# Create the dataset
dataset = LeRobotDataset.create(
repo_id="<hf_username>/<dataset_repo_id>",
# Create the dataset
dataset = LeRobotDataset.create(
repo_id="<hf_username>/<dataset_repo_id>",
fps=FPS,
features=dataset_features,
robot_type=robot.name,
use_videos=True,
image_writer_threads=4,
)
# Initialize the keyboard listener and rerun visualization
_, events = init_keyboard_listener()
init_rerun(session_name="recording")
# Connect the robot and teleoperator
robot.connect()
teleop.connect()
# Create the required processors
teleop_action_processor, robot_action_processor, robot_observation_processor = make_default_processors()
episode_idx = 0
while episode_idx < NUM_EPISODES and not events["stop_recording"]:
log_say(f"Recording episode {episode_idx + 1} of {NUM_EPISODES}")
record_loop(
robot=robot,
events=events,
fps=FPS,
features=dataset_features,
robot_type=robot.name,
use_videos=True,
image_writer_threads=4,
teleop_action_processor=teleop_action_processor,
robot_action_processor=robot_action_processor,
robot_observation_processor=robot_observation_processor,
teleop=teleop,
dataset=dataset,
control_time_s=EPISODE_TIME_SEC,
single_task=TASK_DESCRIPTION,
display_data=True,
)
# Initialize the keyboard listener and rerun visualization
_, events = init_keyboard_listener()
init_rerun(session_name="recording")
# Connect the robot and teleoperator
robot.connect()
teleop.connect()
# Create the required processors
teleop_action_processor, robot_action_processor, robot_observation_processor = make_default_processors()
episode_idx = 0
while episode_idx < NUM_EPISODES and not events["stop_recording"]:
log_say(f"Recording episode {episode_idx + 1} of {NUM_EPISODES}")
# Reset the environment if not stopping or re-recording
if not events["stop_recording"] and (episode_idx < NUM_EPISODES - 1 or events["rerecord_episode"]):
log_say("Reset the environment")
record_loop(
robot=robot,
events=events,
@@ -291,50 +291,26 @@ def main():
robot_action_processor=robot_action_processor,
robot_observation_processor=robot_observation_processor,
teleop=teleop,
dataset=dataset,
control_time_s=EPISODE_TIME_SEC,
control_time_s=RESET_TIME_SEC,
single_task=TASK_DESCRIPTION,
display_data=True,
)
# Reset the environment if not stopping or re-recording
if not events["stop_recording"] and (episode_idx < NUM_EPISODES - 1 or events["rerecord_episode"]):
log_say("Reset the environment")
record_loop(
robot=robot,
events=events,
fps=FPS,
teleop_action_processor=teleop_action_processor,
robot_action_processor=robot_action_processor,
robot_observation_processor=robot_observation_processor,
teleop=teleop,
control_time_s=RESET_TIME_SEC,
single_task=TASK_DESCRIPTION,
display_data=True,
)
if events["rerecord_episode"]:
log_say("Re-recording episode")
events["rerecord_episode"] = False
events["exit_early"] = False
dataset.clear_episode_buffer()
continue
if events["rerecord_episode"]:
log_say("Re-recording episode")
events["rerecord_episode"] = False
events["exit_early"] = False
dataset.clear_episode_buffer()
continue
dataset.save_episode()
episode_idx += 1
dataset.save_episode()
episode_idx += 1
# finalize dataset
log_say("Finalizing dataset...")
dataset.finalize()
# Clean up
log_say("Stop recording")
robot.disconnect()
teleop.disconnect()
dataset.push_to_hub()
if __name__ == "__main__":
main()
# Clean up
log_say("Stop recording")
robot.disconnect()
teleop.disconnect()
dataset.push_to_hub()
```
<!-- prettier-ignore-end -->
@@ -372,7 +348,7 @@ The `record` function provides a suite of tools for capturing and managing data
##### 2. Checkpointing and Resuming
- Checkpoints are automatically created during recording.
- If an issue occurs or you want to record additional episodes in the same dataset, you can resume by re-running the same command with `--resume=true`. When resuming a recording, `--dataset.num_episodes` must be set to the **number of additional episodes to be recorded**, and not to the targeted total number of episodes in the dataset! Make sure that you also set `--dataset.root="local_path"`, it's a local path to save the new part of the dataset and is required to resume.
- If an issue occurs, you can resume by re-running the same command with `--resume=true`. When resuming a recording, `--dataset.num_episodes` must be set to the **number of additional episodes to be recorded**, and not to the targeted total number of episodes in the dataset !
- To start recording from scratch, **manually delete** the dataset directory.
##### 3. Recording Parameters
@@ -446,7 +422,7 @@ from lerobot.utils.utils import log_say
episode_idx = 0
robot_config = SO100FollowerConfig(port="/dev/tty.usbmodem5AB90687491", id="my_follower_arm")
robot_config = SO100FollowerConfig(port="/dev/tty.usbmodem58760434471", id="my_awesome_follower_arm")
robot = SO100Follower(robot_config)
robot.connect()
@@ -514,83 +490,6 @@ Additionally you can provide extra `tags` or specify a `license` for your model
If your local computer doesn't have a powerful GPU you could utilize Google Colab to train your model by following the [ACT training notebook](./notebooks#training-act).
#### Train using Hugging Face Jobs
Hugging Face jobs let's you easily select hardware and run the training in the cloud. So if you don't have a powerful GPU or you need more VRAM or just want to train a model much faster use HF Jobs! It's pay as you go and you simply pay for each second of use, you can see the pricing and additional information [here](https://huggingface.co/docs/hub/jobs).
To run the training use this command:
<hfoptions id="train_with_hf_jobs">
<hfoption id="Command">
```bash
hf jobs run \
--flavor a10g-small \
--timeout 4h \
--secrets HF_TOKEN \
huggingface/lerobot-gpu:latest \
-- \
python -m lerobot.scripts.lerobot_train \
--dataset.repo_id=username/dataset \
--policy.type=act \
--steps=5000 \
--batch_size=16 \
--policy.device=cuda \
--policy.repo_id=username/your_policy \
--log_freq=100
```
</hfoption>
<hfoption id="API example">
<!-- prettier-ignore-start -->
```python
from huggingface_hub import run_job, get_token
run_name = "act_so101_hf_jobs"
dataset_id = "username/dataset"
user_hub_id = "username"
command_args = [
"python", "-m", "lerobot.scripts.lerobot_train",
"--dataset.repo_id", dataset_id,
"--policy.type", "act",
"--steps", "5000",
"--batch_size", "16",
"--num_workers", "4",
"--policy.device", "cuda",
"--log_freq", "100",
"--save_freq", "1000",
"--save_checkpoint", "true",
"--wandb.enable", "false",
"--policy.repo_id", f"{user_hub_id}/{run_name}"
]
print(f"Submitting job '{run_name}' to Hugging Face Infrastructure...")
job_info = run_job(
image="huggingface/lerobot-gpu:latest",
command=command_args,
flavor="a10g-small",
timeout="4h",
secrets={"HF_TOKEN": get_token()}
)
print("\n🚀 Job successfully launched!")
print(f"🔹 Job ID: {job_info.id}")
print(f"🔗 Live UI Dashboard & Logs: {job_info.url}")
```
<!-- prettier-ignore-end -->
</hfoption>
</hfoptions>
You can modify the `--flavor` to use different hardware, for example: `t4-small`, `a100-large`, `h200`. Use `hf jobs hardware` to see the full list with pricing.
Depending on the model you want to train and the hardware you selected you can also modify the `--batch_size` and `--number_of_workers`.
For longer training sessions increase the timeout.
Once the training is started you can go to [Jobs](https://huggingface.co/settings/jobs) and see if your jobs is running as well as all the outputs. Sometimes it takes a few minutes to schedule your job so be patient.
After training the model will be pushed to hub and you can use it as any other model with LeRobot.
#### Upload policy checkpoints
Once training is done, upload the latest checkpoint with:
@@ -647,6 +546,5 @@ The `--strategy.type` flag selects the execution mode:
- `sentry`: Continuous recording with auto-upload (useful for large-scale evaluation)
- `highlight`: Ring buffer recording with keystroke save (useful for capturing interesting events)
- `dagger`: Human-in-the-loop data collection (see [HIL Data Collection](./hil_data_collection))
- `episodic`: Episode-oriented policy recording with reset phases between episodes
All strategies support `--inference.type=rtc` for smooth execution with slow VLA models (Pi0, Pi0.5, SmolVLA).
-38
View File
@@ -157,44 +157,6 @@ Foot pedal input is also supported via `--strategy.input_device=pedal`. Configur
| `--strategy.input_device` | Input device: `keyboard` or `pedal` (default: keyboard) |
| `--teleop.type` | **Required.** Teleoperator type |
### Episodic (`--strategy.type=episodic`)
Episode-oriented recording that mirrors the behavior of `lerobot-record`. The policy drives the robot for each episode; an optional teleoperator can drive the robot during the reset phase between episodes.
```bash
lerobot-rollout \
--strategy.type=episodic \
--policy.path=${HF_USER}/my_policy \
--robot.type=so100_follower \
--robot.port=/dev/ttyACM0 \
--teleop.type=so100_leader \
--teleop.port=/dev/ttyACM1 \
--dataset.repo_id=${HF_USER}/my_eval_data \
--dataset.num_episodes=20 \
--dataset.episode_time_s=30 \
--dataset.reset_time_s=10 \
--dataset.single_task="Pick up the red cube"
```
Teleop is optional — if omitted the robot holds its position during the reset phase.
**Keyboard controls:**
| Key | Action |
| ----------- | -------------------------------- |
| `→` (right) | End the current episode early |
| `←` (left) | Discard episode and re-record it |
| `ESC` | Stop the recording session |
| Flag | Description |
| ----------------------------------------------- | -------------------------------------------------------------------------- |
| `--dataset.num_episodes` | Number of episodes to record |
| `--dataset.episode_time_s` | Duration of each recording episode in seconds |
| `--dataset.reset_time_s` | Duration of the reset phase between episodes in seconds |
| `--teleop.type` | Optional. Teleoperator to drive the robot during resets |
| `--strategy.reset_to_initial_position` | Whether to reset the robot to its initial position between episodes |
| `--strategy.smooth_leader_to_follower_handover` | Whether to turn on or off the leader -> follower smooth handover behavior. |
---
## Inference Backends
-50
View File
@@ -207,56 +207,6 @@ pip install 'lerobot[feetech]' # Feetech motor support
_Multiple extras can be combined (e.g., `.[core_scripts,pi,pusht]`). For a full list of available extras, refer to `pyproject.toml`._
### PyTorch CUDA variant (Linux only)
On Linux, the install path determines which CUDA wheel you get. macOS and Windows installs use the PyPI default (MPS / CPU / CUDA-Windows wheel respectively) and can skip this section.
<!-- prettier-ignore-start -->
<hfoptions id="cuda_variant">
<hfoption id="uv-source">
**Source install via `uv` (`uv sync` or `uv pip install -e .`)**
`torch` and `torchvision` are pinned by the project to the **CUDA 12.8** PyTorch index (`https://download.pytorch.org/whl/cu128`, driver floor **570.86**) — covers Ampere/Ada/Hopper/Blackwell GPUs. No action needed for typical NVIDIA setups.
To override for a different CUDA variant:
```bash
uv pip install --force-reinstall torch torchvision \
--index-url https://download.pytorch.org/whl/cu126 # older drivers; or cu130 for Blackwell on driver ≥ 580
```
</hfoption>
<hfoption id="pip-conda">
**Source install via `pip`/`conda`, or `pip install lerobot` from PyPI**
PyPI default torch wheel is currently a cu130-bundled Linux wheel, driver floor **580.65**.
To pick a specific CUDA variant:
**Using `pip` or `conda`** — install torch first with an explicit index, then lerobot:
```bash
pip install --index-url https://download.pytorch.org/whl/cu128 torch torchvision
pip install -e ".[all]" # source
# — or —
pip install lerobot # from PyPI
```
**Using `uv` to install from PyPI** — one-liner via `--torch-backend` (uv ≥ 0.6):
```bash
uv pip install --torch-backend cu128 lerobot
```
Supported values include `auto`, `cpu`, `cu126`, `cu128`, `cu129`, `cu130`, plus various `rocm*` and `xpu`. Swap as needed for your driver.
</hfoption>
</hfoptions>
<!-- prettier-ignore-end -->
### Troubleshooting
If you encounter build errors, you may need to install additional system dependencies: `cmake`, `build-essential`, and `ffmpeg libs`.
-147
View File
@@ -1,147 +0,0 @@
# Language columns and recipes
Most LeRobot datasets ship with a single `task` string per episode — fine for
short, single-instruction skills, but not enough for the longer-horizon,
multi-modal robot policies the field is moving toward (high-level planning,
memory, interjections, VQA, tool use). To support those policies without
forking the dataset format, LeRobot extends `LeRobotDataset` with two optional
language columns and a small recipe layer that turns those rows into
chat-style training samples on the fly.
The design splits cleanly into three layers:
1. **Data in the dataset** — language annotations stored next to frames in
`data/chunk-*/file-*.parquet` as two optional columns (`language_persistent`
and `language_events`). Datasets without these columns keep their existing
behavior.
2. **Recipe** — a YAML file that declares which annotation rows to bind and
how to lay them out as chat turns (`role`, `content`, optional images,
optional tool calls). Recipes are pure config; no Python required to add a
new one.
3. **Training format** — at sample time, `RenderMessagesStep` resolves the
recipe against the per-frame annotations and emits HF-style `messages` plus
LeRobot-specific sidecars (`message_streams`, `target_message_indices`)
that policy processors consume.
This page describes each layer in turn.
## Layer 1 — language columns in the dataset
The two optional columns live next to frame data in
`data/chunk-*/file-*.parquet`:
- `language_persistent`: a list of rows broadcast across every frame in an episode for state that remains active, such as `subtask`, `plan`, and `memory`.
- `language_events`: a list of rows only on the exact frame where an event was emitted, such as `interjection`, `vqa`, and speech tool calls.
Both columns share the same row shape (event rows omit `timestamp` because the
frame the row sits on already provides it):
```text
role: string
content: string | null
style: string | null
timestamp: float32 # persistent rows only
camera: string | null # observation.images.* feature key, view-dependent rows only
tool_calls: list[Json] | null
```
The `camera` field tags rows whose `content` is grounded in a specific camera
view. Rows of view-dependent styles (`vqa` and `trace`) MUST set `camera` to
the matching `observation.images.*` feature key. Rows of every other style —
including `motion`, which describes robot-frame primitives in joint / Cartesian
terms — MUST leave `camera` as `null`. Pipeline writers and the validator
enforce this via `validate_camera_field(style, camera)`.
`meta/tasks.parquet` remains the canonical source for the task. The special `${task}` recipe binding always reads that task string and does not depend on language annotations.
### Architecture
The language stack itself has three internal modules backing layer 1:
1. `lerobot.datasets.language` defines the schema, style registry, and `column_for_style`.
2. `lerobot.datasets.language_render` resolves rows and renders messages.
3. `RenderMessagesStep` turns dataset samples into `messages`, `message_streams`, and `target_message_indices`.
`LeRobotDataset` stays recipe-agnostic. It passes `language_persistent` and `language_events` through when present, and unannotated datasets keep their existing behavior.
## Layer 2 — recipe anatomy
Recipes are YAML files backed by `TrainingRecipe` and `MessageTurn`. They
declare which annotation rows to pull (via `bindings`) and how to compose them
into chat turns (`messages`).
```yaml
messages:
- { role: user, content: "${task}", stream: high_level }
- { role: assistant, content: "${subtask}", stream: low_level, target: true }
```
A recipe can also branch into a weighted **blend** of sub-recipes. At sample
time, exactly one branch is selected deterministically from the sample index,
so different frames train different objectives (e.g. memory updates vs.
low-level execution vs. VQA) without any Python wiring.
### Temporal semantics
Persistent styles are active after emission until replaced:
- `active_at(t, style=subtask)`
- `nth_prev(style=memory, offset=1)`
- `nth_next(style=subtask, offset=1)`
Event styles only exist on their exact timestamp:
- `emitted_at(t, style=interjection)`
- `emitted_at(t, style=vqa, role=user, camera=observation.images.top)`
- `emitted_at(t, role=assistant, tool_name=say)`
Exact event matching has no tolerance window, so writers must stamp event rows with frame timestamps from the parquet data.
### View-dependent resolution
For view-dependent styles (`vqa` and `trace`), the resolver gains a
`camera=` filter parallel to `role=` and `tool_name=`. Datasets with multiple
cameras typically emit one (`vqa`, `user`) + (`vqa`, `assistant`) pair per
camera at the same timestamp; without `camera=`, those resolvers see two
matches and raise an ambiguity error. Recipes consume each camera through its
own binding plus a matching image block, e.g.
```yaml
ask_vqa_top:
bindings:
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
messages:
- role: user
stream: high_level
if_present: vqa_query
content:
- { type: image, feature: observation.images.top }
- { type: text, text: "${vqa_query}" }
- {
role: assistant,
content: "${vqa}",
stream: high_level,
target: true,
if_present: vqa,
}
```
Add one such sub-recipe per camera the dataset records.
## Layer 3 — training format
Rendered samples use HF-style chat messages plus LeRobot sidecars:
```python
sample["messages"]
sample["message_streams"]
sample["target_message_indices"]
```
The renderer does not apply a tokenizer chat template. Policy processors decide how to serialize the messages for their backbone, which keeps the same dataset usable across SmolVLA, Pi0.5, and any future VLM that expects OpenAI-style chat messages.
## Graceful absence
If both language columns are missing, `None`, or empty, `RenderMessagesStep` is a no-op.
If an event-scoped branch is selected on a frame without the required event row, rendering returns `None`, allowing a loader to retry another sample.
-29
View File
@@ -1,29 +0,0 @@
# LeLab - LeRobot Guide
LeLab is a graphical user interface built on top of the LeRobot library, designed to make robotics accessible without needing to memorize CLI commands. From a single app you can configure your robot, teleoperate it, collect datasets, train policies locally or on cloud GPUs via HF Jobs, and deploy trained models back onto your robot. It's the easiest way to go from an unboxed SO-101 to a working policy, and a great companion for anyone learning the LeRobot workflow. Source code and issues live on GitHub: [huggingface/leLab](https://github.com/huggingface/leLab).
> [!TIP]
> For now LeLab is compatible only with SO-ARM101
<Youtube id="VqyKUuW9V1g" />
### Installation
Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install and launch in one command:
```
uv tool install git+https://github.com/huggingface/leLab.git && lelab
```
After install, run `lelab` from your terminal anytime to start the app.
### Features
- **Add robots** — Select arm type (leader/follower), calibrate each joint from the middle position, and attach cameras.
- **Teleoperation** — Control the follower arm with the leader and see a live 3D visualization of the arms.
- **Dataset recording** — Define a task description, number of episodes, and episode/reset durations. Press spacebar to advance between episodes. 30+ episodes recommended.
- **Local training** — Train a policy directly on your own machine with a selected dataset, policy type, batch size, and step count.
- **Cloud training with HF Jobs** — Train on powerful GPUs via [HF Jobs](https://huggingface.co/docs/huggingface_hub/en/guides/jobs) with transparent pricing. Run `hf auth login` first. See the [Compute HW Guide](hardware_guide) for hardware/batch size tips.
- **Training visualization** — Watch progress live in the app, with checkpoints saved automatically.
- **Run trained policies** — Pick any model from your jobs list and run inference on your robot with one click.
- **Use community datasets** — Provide any Hugging Face dataset ID to train on datasets you didn't record yourself.
+2 -39
View File
@@ -10,7 +10,6 @@ This docs will guide you to:
- Stream datasets without downloading using `StreamingLeRobotDataset`
- Apply image transforms for data augmentation during training
- Migrate existing `v2.1` datasets to `v3.0`
- Experiment with other `LeRobotDataset` formats and implementations like Lance
## Whats new in `v3`
@@ -44,7 +43,7 @@ lerobot-record \
--dataset.num_episodes=5 \
--dataset.single_task="Grab the black cube" \
--dataset.streaming_encoding=true \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--dataset.encoder_threads=2
```
@@ -275,7 +274,7 @@ A converter aggregates perepisode files into larger shards and writes episode
pip install "https://github.com/huggingface/lerobot/archive/33cad37054c2b594ceba57463e8f11ee374fa93c.zip"
# Convert an existing v2.1 dataset hosted on the Hub:
python -m lerobot.scripts.convert_dataset_v21_to_v30 --repo-id=<HF_USER/DATASET_ID>
python -m lerobot.datasets.v30.convert_dataset_v21_to_v30 --repo-id=<HF_USER/DATASET_ID>
```
**What it does**
@@ -316,39 +315,3 @@ Dataset v3.0 uses incremental parquet writing with buffered metadata for efficie
- Ensures the dataset is valid for loading
Without calling `finalize()`, your parquet files will be incomplete and the dataset won't load properly.
## Other formats and implementations
### Lance
Lance is a useful format for multimodal AI datasets, especially for large-scale training requiring high performance IO and random access.
The `lerobot-lancedb` package implements `LeRobotLanceDataset` (for JPEG images) and `LeRobotLanceVideoDataset` (for mp4 videos).
Those two storage layouts both subclass LeRobotDataset and can provide data loading speed ups.
`LeRobotLanceDataset` is a drop-in replacement for `LeRobotDataset`:
```python
from lerobot.datasets import LeRobotDatasetMetadata
from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot_lancedb import LeRobotLanceDataset, LeRobotLanceVideoDataset
cfg = DiffusionConfig(...)
meta = LeRobotDatasetMetadata(root=local_dataset_path) # or use repo_id=... to load metadata from the Hub
delta_timestamps = {...}
# Use LeRobotLanceDataset for image datasets
dataset = LeRobotLanceDataset(
root=local_dataset_path, # or use repo_id=... to stream from the Hub
delta_timestamps=delta_timestamps,
return_uint8=True,
)
# Or use LeRobotLanceVideoDataset for video datasets:
dataset = LeRobotLanceVideoDataset(
root=local_dataset_path, # or use repo_id=... to stream from the Hub
delta_timestamps=delta_timestamps,
return_uint8=True,
)
```
Join the discussion on [Github](https://github.com/huggingface/lerobot/issues/3608) and explore the `lerobot-lancedb` documentation [here](https://lancedb.github.io/lerobot-lancedb/).
-433
View File
@@ -1,433 +0,0 @@
# MolmoAct2 Policy
MolmoAct2 is the LeRobot policy implementation of
[MolmoAct2](https://allenai.org/blog/molmoact2), ported into the LeRobot
training, evaluation, checkpointing, and dataset interfaces for easier use with
LeRobot datasets.
This implementation currently supports training and evaluation for the regular
MolmoAct2 model. MolmoAct2-Think, which supports adaptive depth reasoning, is
not included in this LeRobot policy yet and is coming soon.
For the original MolmoAct2 training code used for the experiments reported in
the paper, see [allenai/molmoact2](https://github.com/allenai/molmoact2).
## Installation Requirements
Install LeRobot with the MolmoAct2 optional dependencies:
```bash
pip install -e ".[molmoact2]"
```
To run the models in this repository, you need an NVIDIA GPU. The measurements
below were taken on a single NVIDIA H100 80GB with bf16 model loading, LIBERO with two RGB cameras. MolmoAct2 rows use `chunk_size=10`, action dim 7
padded to `expected_max_action_dim=32`, and `num_flow_timesteps=8`. Training measurements use
`gradient_checkpointing=true` and include the forward pass, backward pass,
gradient clipping, optimizer step, and optimizer state allocation. Values are
peak GPU memory sampled with `nvidia-smi`. Leave a few GiB of headroom for
dataloader workers, CUDA context, and fragmentation.
Multi-GPU training through `accelerate` increases throughput and global batch
size, but this LeRobot port does not currently expose the original MolmoAct2
`fsdp_devices` model-parallel training path. The current training script has
not been tested for multi-node training.
| Mode | Peak Memory, bs=8 | Peak Memory, bs=16 | Peak Memory, bs=32 |
| ------------------------------------------------ | ----------------: | -----------------: | -----------------: |
| Inference, continuous, CUDA graph enabled (bs=1) | 12.1 GiB | - | - |
| Fine-tuning, action expert only, continuous | 16.5 GiB | 18.3 GiB | 21.4 GiB |
| Fine-tuning, LoRA VLM, both action modes | 20.2 GiB | 26.8 GiB | 41.3 GiB |
| Fine-tuning, full model, both action modes | 48.3 GiB | 49.8 GiB | 60.1 GiB |
The repo has been tested with Ubuntu 22.04.
## Usage
To use MolmoAct2 in a LeRobot training config, set:
```python
policy.type=molmoact2
```
## Training
MolmoAct2 can be fine-tuned from either the released MolmoAct2 Hugging Face
checkpoint format or from a checkpoint already saved by LeRobot. Both routes use
the same LeRobot training loop, dataset transforms, checkpoint saving, and
logging. The difference is only how the initial policy weights and processor
state are loaded.
### Training With Original MolmoAct2 Weight
Use `policy.checkpoint_path` when starting from a released MolmoAct2 checkpoint,
for example `allenai/MolmoAct2` or `allenai/MolmoAct2-LIBERO`. LeRobot will load
the original HF model files, then build its own policy processor from the
dataset metadata and the policy options below.
The command below shows full fine-tuning on the merged LIBERO dataset. It uses
bf16 model loading, 8 flow timesteps, LeRobot dataset statistics, image
augmentation, and LeRobot's checkpointing/logging path.
```bash
accelerate launch \
--num_processes=8 \
--mixed_precision=bf16 \
-m lerobot.scripts.lerobot_train \
--dataset.repo_id=allenai/MolmoAct2-LIBERO-Dataset \
--dataset.root=/path/to/lerobot/data/allenai/MolmoAct2-LIBERO-Dataset \
--dataset.video_backend=pyav \
--dataset.image_transforms.enable=true \
--policy.type=molmoact2 \
--policy.checkpoint_path=allenai/MolmoAct2-LIBERO \
--policy.device=cuda \
--policy.action_mode=both \
--policy.chunk_size=10 \
--policy.n_action_steps=10 \
--policy.setup_type="single franka robotic arm in libero" \
--policy.control_mode="delta end-effector pose" \
--policy.image_keys='["observation.images.image","observation.images.wrist_image"]' \
--policy.model_dtype=bfloat16 \
--policy.num_flow_timesteps=8 \
--policy.gradient_checkpointing=true \
--policy.freeze_embedding=true \
--policy.normalize_gripper=false \
--policy.enable_knowledge_insulation=false \
--policy.push_to_hub=false \
--wandb.enable=true \
--wandb.entity=<wandb_entity> \
--wandb.project=<wandb_project> \
--job_name=<job_name> \
--output_dir=outputs/<job_name> \
--steps=10000 \
--batch_size=32 \
--num_workers=4 \
--log_freq=20 \
--eval_freq=-1 \
--save_checkpoint=true \
--save_freq=2000
```
### Training With LeRobot MolmoAct2 Weight
Use `policy.path` when starting from a MolmoAct2 checkpoint that was saved by
LeRobot, either from a local `pretrained_model` directory or from the Hub. This
restores the saved LeRobot policy config, model weights, processor, and
normalization statistics. You can still override training-time options such as
`batch_size`, `steps`, LoRA flags, or `policy.action_mode`.
```bash
accelerate launch \
--num_processes=8 \
--mixed_precision=bf16 \
-m lerobot.scripts.lerobot_train \
--dataset.repo_id=allenai/MolmoAct2-LIBERO-Dataset \
--dataset.root=/path/to/lerobot/data/allenai/MolmoAct2-LIBERO-Dataset \
--dataset.video_backend=pyav \
--dataset.image_transforms.enable=true \
--policy.path=/path/to/pretrained_model \
--policy.device=cuda \
--policy.action_mode=both \
--policy.chunk_size=10 \
--policy.n_action_steps=10 \
--policy.model_dtype=bfloat16 \
--policy.num_flow_timesteps=8 \
--policy.gradient_checkpointing=true \
--wandb.enable=true \
--wandb.entity=<wandb_entity> \
--wandb.project=<wandb_project> \
--job_name=<job_name> \
--output_dir=outputs/<job_name> \
--steps=10000 \
--batch_size=32 \
--num_workers=4 \
--log_freq=20 \
--eval_freq=-1 \
--save_checkpoint=true \
--save_freq=2000
```
### Common Practices
For fine-tuning on a comparatively small dataset, such as a single LIBERO suite
or a real-world dataset with less than 200 demonstrations, a global batch size of
16 to 32 is a good starting point. In these settings, `policy.enable_lora_vlm=true` or `policy.train_action_expert_only=true` is also a practical choice. In both
cases, we intentionally keep the action expert fully trainable, which we found
to be crucial for model performance. For larger fine-tuning datasets, larger
global batch sizes and full fine-tuning are usually preferred.
### Common Policy Options
- `policy.checkpoint_path`: original MolmoAct2 HF checkpoint to initialize from.
Use this for released MolmoAct2 weights.
- `policy.path`: LeRobot checkpoint to initialize from. Use this for checkpoints
created by LeRobot training.
- `policy.action_mode`: training target, one of `continuous`, `discrete`, or
`both`. `both` trains the flow-matching action expert and the discrete
action-token loss.
- `policy.train_action_expert_only`: trains only parameters whose names contain
`action_expert`. It requires `policy.action_mode=continuous`.
- `policy.enable_lora_vlm`: enables LoRA on VLM linear layers. Use
`policy.enable_lora_action_expert=true` only if LoRA should also cover action
expert linear layers. When `policy.enable_lora_action_expert=false`, the
action expert base weights remain fully trainable while the VLM is trained
through LoRA adapters. When `policy.enable_lora_action_expert=true`, the
action expert is also adapter-tuned instead of fully fine-tuned.
- `policy.enable_knowledge_insulation`: when `true`, detaches action-expert
context K/V states before the action loss. The default is `false`.
- `policy.chunk_size`: action horizon used by the policy. For LIBERO we use
`10`. This LeRobot port overrides the loaded checkpoint's
`max_action_horizon` with this value.
- `policy.n_action_steps`: number of actions consumed from each predicted
chunk before querying the policy again. For LIBERO, set it to `chunk_size`.
- `policy.setup_type`: text inserted into the prompt to describe the robot and
scene, e.g. `single franka robotic arm in libero`. More examples are listed
in the `metadata_by_tag` entries of
[`norm_stats.json`](https://huggingface.co/allenai/MolmoAct2/blob/main/norm_stats.json).
- `policy.control_mode`: text inserted into the prompt to describe the action
space, e.g. `delta end-effector pose` or `absolute joint pose`.
- `policy.image_keys`: ordered LeRobot image observation keys passed to the
processor.
- `policy.model_dtype`: checkpoint/forward dtype, one of `float32`,
`bfloat16`, or `float16`. Use `bfloat16` for normal training.
- `policy.num_flow_timesteps`: number of flow-matching timesteps sampled per
example during training. We use `8` for fine-tuning.
- `policy.num_inference_steps`: optional override for continuous action
generation steps at inference time.
- `policy.gradient_checkpointing`: enables checkpointing in the VLM/action path
to reduce activation memory.
- `policy.freeze_embedding`: freezes input embeddings. The default is `true`.
- `policy.normalize_gripper`: controls whether gripper dimensions are included
in state/action quantile normalization. The default is `false`.
- `policy.normalize_language`: normalizes task strings before prompt
construction. The default is `true`.
- `policy.mask_action_dim_padding`: masks padded dimensions in the flow loss.
Released checkpoints use `policy.expected_max_action_dim=32`.
- `policy.max_sequence_length`: optional manual sequence cap. Leave unset to
infer it from images, state dimension, action dimension, action horizon, and
discrete-action mode.
### Learning Rates
MolmoAct2 uses parameter-group learning rates to match the original MolmoAct2
fine-tuning experiments.
- Full fine-tuning uses `policy.optimizer_lr=1e-5` for the VLM,
`policy.optimizer_vit_lr=5e-6` for the vision tower,
`policy.optimizer_connector_lr=5e-6` for image connector layers, and
`policy.optimizer_action_expert_lr=5e-5` for the action expert.
- LoRA VLM fine-tuning sets the VLM, vision, and connector LoRA parameter
groups to `5e-5` when `policy.enable_lora_vlm=true`. By default,
`policy.enable_lora_action_expert=false`, so the action expert is still fully
fine-tuned with `policy.optimizer_action_expert_lr`. If
`policy.enable_lora_action_expert=true`, the action expert is trained through
LoRA adapters instead.
- Action-expert-only fine-tuning trains only the action expert and uses
`policy.optimizer_action_expert_lr=5e-5`.
You can override the full fine-tuning and action-expert learning rates with
`policy.optimizer_lr`, `policy.optimizer_vit_lr`,
`policy.optimizer_connector_lr`, and `policy.optimizer_action_expert_lr`.
Scheduler settings can be changed with `policy.scheduler_warmup_steps`,
`policy.scheduler_decay_steps`, and `policy.scheduler_decay_lr`.
### Dataset Quantile Statistics
MolmoAct2 defaults to quantile normalization for state and action features. If
your dataset has not been converted with quantile statistics, you can add them
with:
```bash
python src/lerobot/scripts/augment_dataset_quantile_stats.py \
--repo-id=your_dataset
```
Alternatively, train MolmoAct2 with mean/std normalization:
```bash
--policy.normalization_mapping='{"ACTION": "MEAN_STD", "STATE": "MEAN_STD", "VISUAL": "IDENTITY"}'
```
## Evaluation
Evaluation also supports both LeRobot-saved checkpoints and original MolmoAct2
HF checkpoints. For LIBERO replication, keep the EGL rendering environment
fixed and use `policy.per_episode_seed=true`.
**Important:** We found that `num_steps_wait=10` does not reliably let the
LIBERO scene stabilize and can degrade measured success. All LIBERO evaluation
results reported here use `num_steps_wait=50`.
### Evaluation With LeRobot MolmoAct2 Weight
Use `policy.path` for a checkpoint saved by LeRobot. The saved processor and
normalization statistics are restored together with the model.
```bash
export MUJOCO_GL=egl
export PYOPENGL_PLATFORM=egl
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
lerobot-eval \
--policy.path=allenai/MolmoAct2-LIBERO-LeRobot \
--policy.inference_action_mode=continuous \
--policy.model_dtype=bfloat16 \
--policy.use_amp=true \
--policy.enable_inference_cuda_graph=true \
--policy.device=cuda \
--policy.per_episode_seed=true \
--policy.eval_seed=1000 \
--env.type=libero \
--env.task=libero_10,libero_goal,libero_object,libero_spatial \
--env.camera_name_mapping='{"agentview_image":"image","robot0_eye_in_hand_image":"wrist_image"}' \
--eval.batch_size=1 \
--eval.n_episodes=50 \
--seed=1000
```
### Evaluation With Original MolmoAct2 Weight
You can evaluate a released Hugging Face checkpoint directly without first
converting it to a LeRobot checkpoint. In this case, set
`policy.checkpoint_path` to the HF model repo and provide `policy.norm_tag`.
For LIBERO, `policy.norm_tag=libero` loads the LIBERO action/state
normalization statistics, action horizon, prompt metadata, and image-key order
from the checkpoint's `norm_stats.json`.
To fully replicate the MolmoAct2 paper results with released Hugging Face
checkpoints, we recommend using the v0.5.1-pinned
[`allenai/lerobot` `molmoact2-hf-inference`](https://github.com/allenai/lerobot/tree/molmoact2-hf-inference)
branch. That branch matches the original evaluation settings used for the
reported numbers.
```bash
export MUJOCO_GL=egl
export PYOPENGL_PLATFORM=egl
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
lerobot-eval \
--policy.type=molmoact2 \
--policy.checkpoint_path=allenai/MolmoAct2-LIBERO \
--policy.norm_tag=libero \
--policy.inference_action_mode=continuous \
--policy.model_dtype=float32 \
--policy.use_amp=false \
--policy.enable_inference_cuda_graph=true \
--policy.device=cuda \
--policy.per_episode_seed=true \
--policy.eval_seed=1000 \
--env.type=libero \
--env.task=libero_goal \
--env.camera_name_mapping='{"agentview_image":"image","robot0_eye_in_hand_image":"wrist_image"}' \
--eval.batch_size=1 \
--eval.n_episodes=50 \
--seed=1000
```
Use `--env.task=libero_10,libero_goal,libero_object,libero_spatial` to run the
full LIBERO suite. The same command works for other released MolmoAct2
checkpoints as long as the requested `policy.norm_tag` exists in that
checkpoint's `norm_stats.json`.
### Common Evaluation Options
- `policy.inference_action_mode`: required for rollout. Use `continuous` for
flow-matching inference or `discrete` for action-token inference. It must be
compatible with the training-time `policy.action_mode` saved in the
checkpoint.
- `policy.path`: LeRobot checkpoint path or Hub repo. Use this for checkpoints
saved by LeRobot.
- `policy.checkpoint_path`: original MolmoAct2 HF checkpoint path or Hub repo.
Use this with `policy.type=molmoact2` and `policy.norm_tag`.
- `policy.norm_tag`: selects normalization statistics, prompt metadata,
image-key order, and action horizon from the original checkpoint's
`norm_stats.json`. It is required for direct original-HF checkpoint
evaluation.
- `policy.model_dtype`: model load/forward dtype. Use `bfloat16` for normal
GPU evaluation. Use `float32` only when you explicitly want fp32 inference.
- `policy.use_amp`: runs the policy forward under autocast during eval. For
`model_dtype=bfloat16`, keep this enabled.
- `policy.enable_inference_cuda_graph`: enables the MolmoAct2 inference CUDA
graph path for faster repeated continuous-action rollout.
- `policy.per_episode_seed` and `policy.eval_seed`: make stochastic continuous
action generation deterministic per episode for replication.
- `env.task`: comma-separated LIBERO suites or a single suite. Use
`libero_10,libero_goal,libero_object,libero_spatial` for the full benchmark.
- `env.camera_name_mapping`: maps LIBERO camera names to the image keys expected
by the policy processor.
## Performance Results
### LIBERO Benchmark Results
MolmoAct2 has demonstrated strong performance on the LIBERO benchmark suite. To
compare and test its LeRobot implementation, we fine-tuned
[`allenai/MolmoAct2-LIBERO`](https://huggingface.co/allenai/MolmoAct2-LIBERO)
for an additional 10k steps on the LIBERO dataset with per-GPU batch size 32 on
8 H100 GPUs, then compared the results to the original MolmoAct2 reference
results.
The LeRobot fine-tuned checkpoint reported here is available at
[`allenai/MolmoAct2-LIBERO-LeRobot`](https://huggingface.co/allenai/MolmoAct2-LIBERO-LeRobot)
and was trained on
[`allenai/MolmoAct2-LIBERO-Dataset`](https://huggingface.co/datasets/allenai/MolmoAct2-LIBERO-Dataset).
| Benchmark | LeRobot Implementation | MolmoAct2 Original |
| -------------- | ---------------------: | -----------------: |
| LIBERO Spatial | 98.4% | 97.8% |
| LIBERO Object | 100.0% | 100.0% |
| LIBERO Goal | 98.0% | 97.8% |
| LIBERO 10 | 96.6% | 93.2% |
| Average | 98.25% | 97.20% |
These results demonstrate MolmoAct2's strong performance across diverse robotic
manipulation tasks. To reproduce them, follow the instructions in the LIBERO
evaluation section.
## Differences From the Original Implementation
This LeRobot port is intended to match MolmoAct2 behavior while using LeRobot's
dataset, training, evaluation, checkpoint, and logging infrastructure. The main
differences from the original training repository are:
- The original paper training stack loads the model in fp32 and trains under
mixed precision. This LeRobot port usually loads the checkpoint directly in
`policy.model_dtype=bfloat16` for lower memory use.
- The original repository uses its own FSDP/model-parallel training path. The
LeRobot port uses the standard LeRobot/Accelerate training path and has not
been tested for multi-node training.
- The original repository supports sequence packing. The LeRobot port trains on
one LeRobot sample per item and pads to an inferred fixed sequence budget.
- The LeRobot port follows LeRobot's optimizer, scheduler, checkpoint saving,
dataset transforms, image augmentation, and Weights & Biases logging
conventions.
- The original training path supports mixed action horizons by padding to
`max_action_horizon` and masking padded horizon slots in the action expert
self-attention. This is useful when training across datasets with different
control frequencies. The LeRobot port currently targets single-dataset
fine-tuning, so `policy.chunk_size` overrides the checkpoint
`max_action_horizon` and horizon masking is not implemented yet. Support for
this mixed-horizon path is planned.
## Citation
```bibtex
@misc{fang2026molmoact2actionreasoningmodels,
title={MolmoAct2: Action Reasoning Models for Real-world Deployment},
author={Haoquan Fang and Jiafei Duan and Donovan Clay and Sam Wang and Shuo Liu and Weikai Huang and Xiang Fan and Wei-Chuan Tsai and Shirui Chen and Yi Ru Wang and Shanli Xing and Jaemin Cho and Jae Sung Park and Ainaz Eftekhar and Peter Sushko and Karen Farley and Angad Wadhwa and Cole Harrison and Winson Han and Ying-Chun Lee and Eli VanderBilt and Rose Hendrix and Suveen Ellawela and Lucas Ngoo and Joyce Chai and Zhongzheng Ren and Ali Farhadi and Dieter Fox and Ranjay Krishna},
year={2026},
eprint={2605.02881},
archivePrefix={arXiv},
primaryClass={cs.RO},
url={https://arxiv.org/abs/2605.02881},
}
```
## License
This model is licensed under Apache 2.0. It is intended for research and
educational use in accordance with
[Ai2's Responsible Use Guidelines](https://allenai.org/responsible-use),
consistent with [allenai/molmoact2](https://github.com/allenai/molmoact2).
+2 -4
View File
@@ -28,15 +28,13 @@ lerobot-train \
--steps=100000 \
--batch_size=32 \
--peft.method_type=LORA \
--peft.r=64 \
--peft.lora_alpha=64
--peft.r=64
```
Note the `--peft.method_type` parameter that let's you select which PEFT method to use. Here we use
[LoRA](https://huggingface.co/docs/peft/main/en/package_reference/lora) (Low-Rank Adapter) which is probably the most
popular fine-tuning method to date. Low-rank adaption means that we only fine-tune a matrix with comparably low rank
instead of the full weight matrix. This rank can be specified using the `--peft.r` parameter, and the LoRA scaling factor with
`--peft.lora_alpha` (where `scaling = lora_alpha / r`). The higher the rank
instead of the full weight matrix. This rank can be specified using the `--peft.r` parameter. The higher the rank
the closer you get to full fine-tuning
There are more complex methods that have more parameters. These are not yet supported, feel free to raise an issue
+1 -1
View File
@@ -91,7 +91,7 @@ lerobot-train \
If your dataset is not converted with `quantiles`, you can convert it with the following command:
```bash
python src/lerobot/scripts/augment_dataset_quantile_stats.py \
python src/lerobot/datasets/v30/augment_dataset_quantile_stats.py \
--repo-id=your_dataset \
```
-39
View File
@@ -1,39 +0,0 @@
# MolmoAct2
This repository contains the LeRobot policy implementation of
[MolmoAct2](https://allenai.org/blog/molmoact2), ported into LeRobot for
training, evaluation, checkpointing, and dataset compatibility.
This implementation currently supports training and evaluation for the regular
MolmoAct2 model. MolmoAct2-Think, which supports adaptive depth reasoning, is
not included in this LeRobot policy yet and is coming soon.
For the original MolmoAct2 training code used for the experiments reported in
the paper, see [allenai/molmoact2](https://github.com/allenai/molmoact2).
## LIBERO Evaluation
Important: we found that `num_steps_wait=10` does not reliably let the LIBERO
scene stabilize and can degrade measured success. All LIBERO evaluation results
reported for this LeRobot implementation use `num_steps_wait=50`.
## Citation
```bibtex
@misc{fang2026molmoact2actionreasoningmodels,
title={MolmoAct2: Action Reasoning Models for Real-world Deployment},
author={Haoquan Fang and Jiafei Duan and Donovan Clay and Sam Wang and Shuo Liu and Weikai Huang and Xiang Fan and Wei-Chuan Tsai and Shirui Chen and Yi Ru Wang and Shanli Xing and Jaemin Cho and Jae Sung Park and Ainaz Eftekhar and Peter Sushko and Karen Farley and Angad Wadhwa and Cole Harrison and Winson Han and Ying-Chun Lee and Eli VanderBilt and Rose Hendrix and Suveen Ellawela and Lucas Ngoo and Joyce Chai and Zhongzheng Ren and Ali Farhadi and Dieter Fox and Ranjay Krishna},
year={2026},
eprint={2605.02881},
archivePrefix={arXiv},
primaryClass={cs.RO},
url={https://arxiv.org/abs/2605.02881},
}
```
## License
This model is licensed under Apache 2.0. It is intended for research and
educational use in accordance with
[Ai2's Responsible Use Guidelines](https://allenai.org/responsible-use),
consistent with [allenai/molmoact2](https://github.com/allenai/molmoact2).
-39
View File
@@ -1,39 +0,0 @@
# VLA-JEPA
This repository contains the LeRobot port of **VLA-JEPA**, a Vision-Language-Action model that combines a Qwen3-VL language backbone with a self-supervised video world model (V-JEPA2) and a flow-matching DiT action head.
Converted from [ginwind/VLA-JEPA](https://huggingface.co/ginwind/VLA-JEPA).
---
## Architecture Overview
| Component | Module | Role |
| ----------------------- | --------------------------------- | ------------------------------------------------------- |
| **Qwen3-VL backbone** | `Qwen3VLInterface` | Fuses images + language instruction into context tokens |
| **DiT-B action head** | `VLAJEPAActionHead` | Flow-matching diffusion over the action chunk |
| **V-JEPA2 world model** | `ActionConditionedVideoPredictor` | Self-supervised video prediction loss (training only) |
At inference time only the Qwen backbone and action head are used; the world model is not needed.
---
## Citation
```bibtex
@misc{sun2026vlajepaenhancingvisionlanguageactionmodel,
title = {VLA-JEPA: Enhancing Vision-Language-Action Model with Latent World Model},
author = {Jingwen Sun and Wenyao Zhang and Zekun Qi and Shaojie Ren and Zezhi Liu and Hanxin Zhu and Guangzhong Sun and Xin Jin and Zhibo Chen},
year = {2026},
eprint = {2602.10098},
archivePrefix = {arXiv},
primaryClass = {cs.RO},
url = {https://arxiv.org/abs/2602.10098},
}
```
---
## License
Weights are distributed under the license terms of the original [ginwind/VLA-JEPA](https://huggingface.co/ginwind/VLA-JEPA) repository (**Apache 2.0 License**). The LeRobot integration code follows the **Apache 2.0 License**.
+1 -1
View File
@@ -300,7 +300,7 @@ This replaces the old episode-per-file structure with efficient, optimally-sized
If you have existing datasets in v2.1 format, use the migration tool:
```bash
python src/lerobot/scripts/convert_dataset_v21_to_v30.py \
python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \
--repo-id your_id/existing_dataset
```
+2 -2
View File
@@ -161,7 +161,7 @@ lerobot-record \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--display_data=true
```
@@ -203,7 +203,7 @@ lerobot-record \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.camera_encoder.vcodec=auto \
# --dataset.vcodec=auto \
--display_data=true
```
-186
View File
@@ -1,186 +0,0 @@
# reBot B601-DM
[reBot B601-DM](https://wiki.seeedstudio.com/rebot_arm_b601_dm_lerobot/) is an open-source, low-cost robot arm from Seeed Studio for embodied-AI and imitation learning. It comes as a **follower** arm (the `B601-DM`, a 6-DOF arm plus gripper driven by Damiao CAN motors) and a **leader** arm (the `StarArm102` / `reBot Arm 102`, driven by FashionStar UART smart servos) used to teleoperate it.
This page covers **calibration** and **teleoperation** for both single-arm and bimanual (dual-arm) setups.
<div style="display: flex; align-items: center; gap: 10px;">
<img
src="https://files.seeedstudio.com/wiki/robotics/projects/lerobot/b601dm_zeroposition.jpg"
alt="reBot B601-DM follower arm at its zero position"
width="48%"
/>
<img
src="https://files.seeedstudio.com/wiki/robotics/projects/lerobot/102_zeroposition.jpg"
alt="reBot Arm 102 leader arm at its zero position"
width="48%"
/>
</div>
_Left: the B601-DM follower at its zero position. Right: the reBot Arm 102 leader at its zero position. Images courtesy of [Seeed Studio](https://wiki.seeedstudio.com/rebot_arm_b601_dm_lerobot/)._
## Install LeRobot 🤗
Follow our [Installation Guide](./installation), then install the reBot support:
```bash
pip install -e ".[rebot]"
```
This pulls in `motorbridge` (CAN motor control for the B601-DM follower) and `motorbridge-smart-servo` (FashionStar UART servos for the reBot Arm 102 leader).
## Registered device types
| Type | Kind |
| ------------------------ | -------------------------------------------- |
| `rebot_b601_follower` | single-arm B601-DM follower robot |
| `bi_rebot_b601_follower` | bimanual (dual-arm) follower robot |
| `rebot_102_leader` | single-arm reBot Arm 102 leader teleoperator |
| `bi_rebot_102_leader` | bimanual (dual-arm) leader teleoperator |
The bimanual types compose two single-arm instances and namespace each arm's
observation/action keys with a `left_` / `right_` prefix. Per-arm settings are
passed through nested `left_arm_config.*` / `right_arm_config.*` arguments.
## Find the USB ports
For each device, find the USB port associated with its motor bus using:
```bash
lerobot-find-port
```
<Tip warning={true}>
On Linux, remove `brltty` (`sudo apt remove brltty`) so it does not hold the
leader's USB serial port. You may also need to grant access to the serial
devices: `sudo chmod 666 /dev/ttyACM* /dev/ttyUSB*`.
</Tip>
## Calibration
Neither arm stores a persistent hardware calibration: every time it connects, the motors are re-zeroed against the pose the arm is physically holding. Calibration simply records that zero pose. When prompted, **manually move the arm to its zero position** (the default sit-down pose shown above, gripper fully closed) and press <kbd>ENTER</kbd>.
### Follower (B601-DM)
<hfoptions id="calibrate-follower">
<hfoption id="Single arm">
```bash
lerobot-calibrate \
--robot.type=rebot_b601_follower \
--robot.port=/dev/ttyACM0 \
--robot.id=follower \
--robot.can_adapter=damiao
```
</hfoption>
<hfoption id="Dual arm">
Connect the bimanual follower; calibration runs for the left arm, then the right arm.
```bash
lerobot-calibrate \
--robot.type=bi_rebot_b601_follower \
--robot.id=bi_follower \
--robot.left_arm_config.port=/dev/ttyACM0 \
--robot.left_arm_config.can_adapter=damiao \
--robot.right_arm_config.port=/dev/ttyACM1 \
--robot.right_arm_config.can_adapter=damiao
```
Per-arm calibration files are saved with `_left` / `_right` suffixes on the id.
</hfoption>
</hfoptions>
### Leader (reBot Arm 102)
<hfoptions id="calibrate-leader">
<hfoption id="Single arm">
```bash
lerobot-calibrate \
--teleop.type=rebot_102_leader \
--teleop.port=/dev/ttyUSB0 \
--teleop.id=leader
```
</hfoption>
<hfoption id="Dual arm">
```bash
lerobot-calibrate \
--teleop.type=bi_rebot_102_leader \
--teleop.id=bi_leader \
--teleop.left_arm_config.port=/dev/ttyUSB0 \
--teleop.right_arm_config.port=/dev/ttyUSB1
```
</hfoption>
</hfoptions>
## Teleoperation
Once both arms are calibrated, drive the follower with the leader. The follower talks to its CAN bus through a Damiao serial bridge (`can_adapter=damiao`, the default) or a SocketCAN adapter (`can_adapter=socketcan`). See the [OpenArm page](./openarm) for more details on the SocketCAN adapter configuration.
<hfoptions id="teleoperate">
<hfoption id="Single arm">
```bash
lerobot-teleoperate \
--robot.type=rebot_b601_follower \
--robot.port=/dev/ttyACM0 \
--robot.id=follower \
--robot.can_adapter=damiao \
--teleop.type=rebot_102_leader \
--teleop.port=/dev/ttyUSB0 \
--teleop.id=leader
```
</hfoption>
<hfoption id="Dual arm">
The bimanual leader and follower reuse the single-arm classes; each arm is
configured through nested `left_arm_config.*` / `right_arm_config.*` arguments,
so a bimanual reBot Arm 102 leader drives a bimanual B601-DM follower.
```bash
lerobot-teleoperate \
--robot.type=bi_rebot_b601_follower \
--robot.id=bi_follower \
--robot.left_arm_config.port=/dev/ttyACM0 \
--robot.left_arm_config.can_adapter=damiao \
--robot.right_arm_config.port=/dev/ttyACM1 \
--robot.right_arm_config.can_adapter=damiao \
--teleop.type=bi_rebot_102_leader \
--teleop.id=bi_leader \
--teleop.left_arm_config.port=/dev/ttyUSB0 \
--teleop.right_arm_config.port=/dev/ttyUSB1
```
</hfoption>
</hfoptions>
<Tip>
The leader and follower share the same joint names (`shoulder_pan,
shoulder_lift, elbow_flex, wrist_flex, wrist_yaw, wrist_roll, gripper`), so
leader actions map directly onto the follower.
</Tip>
If the motion of a joint is reversed, flip its sign in the leader's `joint_directions` (the gripper also carries a scale to widen its range to the follower):
```bash
lerobot-teleoperate \
--robot.type=rebot_b601_follower \
--robot.port=/dev/ttyACM0 \
--robot.can_adapter=damiao \
--teleop.type=rebot_102_leader \
--teleop.port=/dev/ttyUSB0 \
--teleop.joint_directions='{"shoulder_pan":-1,"shoulder_lift":-1,"elbow_flex":1,"wrist_flex":1,"wrist_yaw":1,"wrist_roll":-1,"gripper":-6}'
```
## Recording datasets
Swap `lerobot-teleoperate` for `lerobot-record` (with the same `--robot.*` / `--teleop.*` arguments, plus `--dataset.*`) to record demonstrations for training. See [Imitation Learning for Robots](./il_robots) for the full workflow.
For hardware assembly and wiring, see the [Seeed Studio reBot wiki](https://wiki.seeedstudio.com/rebot_arm_b601_dm_lerobot/).
-250
View File
@@ -1,250 +0,0 @@
# Remote Inference (lerobot-policy-server)
Remote inference decouples GPU policy inference from robot control. A `lerobot-policy-server` process runs the policy on a GPU machine; the robot runs `lerobot-rollout --inference.type=remote` as a **weightless edge client** — no policy weights, no GPU, no policy processors on the robot. One GPU server can serve several robots at once, and the remote backend works with every rollout strategy (`base`, `sentry`, `highlight`, `dagger`, `episodic`).
Use remote inference when:
- The policy is too large or too slow for the machine attached to the robot (e.g. Pi0/Pi0.5 on a Raspberry Pi or laptop edge).
- You want one GPU to serve a fleet of robots running the same policy.
- You want to update or restart the inference side without touching the robots.
<Tip>
Remote inference requires the `async` extra on **both** sides: `pip install 'lerobot[async]'` (installs `eclipse-zenoh` and `msgpack`). The server additionally needs the extras of the policy it serves (e.g. `lerobot[pi]`, `lerobot[smolvla]`).
</Tip>
## Architecture
```
robot (edge, weightless) GPU machine
┌───────────────────────────┐ ┌────────────────────────────┐
│ lerobot-rollout │ │ lerobot-policy-server │
│ --inference.type=remote │ zenoh │ one process = one │
│ │ router │ (model, revision, GPU) │
│ control loop @ fps │ ┌────────┐ │ │
│ └─ pops local action ◄──┼───┤ zenohd ├─────┼─► inference worker thread │
│ buffer (chunks) │ └────────┘ │ (round-robin over │
│ │ observations ► │ client sessions) │
│ network worker thread ───┼──► ◄ action │ │
│ (publishes obs, merges │ chunks │ stateless per request │
│ chunks into buffer) │ │ │
└───────────────────────────┘ └────────────────────────────┘
```
The client keeps a local **action buffer** filled with chunks of future actions, so the control loop never blocks on the network: short network blips are absorbed by the buffer and the robot keeps moving. The client self-clocks — it requests a new chunk whenever the buffer holds less than `--inference.buffer_time_s` seconds of playback.
The server is **stateless per request**: clients ship their RTC prefixes and a delay hint with every observation, so a server crash or restart loses zero control state and reconnects are trivial. In production both robots and servers _dial out_ to a `zenohd` router (NAT-friendly: nothing on the robot network needs an open inbound port).
## Quickstart on a LAN (peer mode, no router)
For a quick test on one network you can skip the router: the server listens directly and the robot connects to it.
On the GPU machine:
```bash
lerobot-policy-server \
--model.repo_or_path=${HF_USER}/my_pi0_policy \
--default_task="pick up the cube" \
--zenoh.mode=peer \
--zenoh.listen_endpoints='["tcp/0.0.0.0:7447"]'
```
Wait for `Policy server up: ...` (the model is downloaded, loaded, and warmed up first).
On the robot machine (replace `192.168.1.42` with the GPU machine's IP):
```bash
lerobot-rollout \
--strategy.type=base \
--policy.path=${HF_USER}/my_pi0_policy \
--inference.type=remote \
--inference.zenoh_mode=peer \
--inference.connect_endpoint=tcp/192.168.1.42:7447 \
--robot.type=so100_follower \
--robot.port=/dev/ttyACM0 \
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
--task="pick up the cube" \
--duration=60
```
`--policy.path` on the client resolves to a config-only download (no weights): it is used for pre-flight validation and action ordering, and doubles as the default service address. The client's `--policy.path` and `--task` must match the server's `--model.repo_or_path` and `--default_task` — that pair is the namespace the service is published under (see [Troubleshooting](#troubleshooting)).
## Production deployment (router)
In production, run a [zenoh router](https://zenoh.io/docs/getting-started/installation/) (`zenohd`) somewhere both sides can reach, and have robots and servers dial out to it:
```bash
zenohd # listens on tcp/0.0.0.0:7447 by default
```
Configure the server with a YAML manifest:
```yaml
# server.yaml
model:
repo_or_path: lerobot/pi0_towels
revision: main
dtype: bfloat16 # optional cast after load
device: cuda
default_task: "fold the towel"
serving_mode: auto # shared for verified chunk-stateless policies, exclusive otherwise
max_sessions: 5
warmup_inferences: 2
trained_fps: 30.0
rtc:
enabled: true
execution_horizon: 10
max_guidance_weight: 10.0
health_port: 9100 # /healthz + /metrics; 0 disables
zenoh:
mode: client
connect_endpoints: ["tcp/router.gpu-cluster.internal:7447"]
```
```bash
lerobot-policy-server --manifest server.yaml
```
Everything in the manifest can also be set directly on the CLI (`--model.repo_or_path=...`, `--max_sessions=...`, etc.). One process serves exactly one `(model, revision, dtype, device)` — to serve two models, or one model on two GPUs, run two processes. Dynamic model loading is deliberately unsupported: pre-warmed processes keep capacity planning honest.
On the robot, only the endpoint changes (the default `--inference.zenoh_mode=client` is already router mode):
```bash
lerobot-rollout \
--strategy.type=base \
--policy.path=lerobot/pi0_towels \
--inference.type=remote \
--inference.connect_endpoint=tcp/router.gpu-cluster.internal:7447 \
--robot.type=so100_follower \
--robot.port=/dev/ttyACM0 \
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
--task="fold the towel" \
--duration=600
```
### TLS / mTLS
For traffic that leaves a trusted network, terminate TLS at the router and give both sides client certificates (all three PEM paths are required together):
```yaml
# server.yaml (zenoh section)
zenoh:
mode: client
connect_endpoints: ["tls/router.gpu-cluster.internal:7447"]
tls_root_ca_certificate: /etc/lerobot/ca.pem
tls_connect_certificate: /etc/lerobot/server.pem
tls_connect_private_key: /etc/lerobot/server.key
```
On the robot the equivalent flags are `--inference.tls_ca`, `--inference.tls_cert`, and `--inference.tls_key`, with `--inference.connect_endpoint=tls/...`.
<Tip>
Multicast scouting is always disabled: discovery is configuration, not protocol magic. If nothing connects, check the endpoints — there is no fallback discovery mechanism.
</Tip>
## RTC over the network
The remote engine reuses the [Real-Time Chunking](./rtc) machinery: the client keeps the chunk leftover and latency tracking locally and ships an action prefix plus a delay hint with every observation; the server runs prefix-conditioned chunk generation. This gives the same smooth chunk-to-chunk transitions as local RTC, with network latency folded into the delay computation.
RTC is enabled by default on both sides (`rtc.enabled: true`). Tune it from the client:
```bash
lerobot-rollout \
... \
--inference.type=remote \
--inference.rtc.execution_horizon=10 \
--inference.rtc.max_guidance_weight=10.0
```
If the server or its policy does not support RTC (only `pi0`, `pi05`, and `smolvla` are RTC-capable, and the server manifest must have `rtc.enabled: true`), the session is **downgraded to plain chunk-append** and the client logs:
```
RTC downgraded to chunk-append (server does not support RTC)
```
The robot still runs — chunks are simply appended to the buffer without prefix blending, which can produce visible seams between chunks on slow policies.
## Fail-safe behavior
The client runs a fail-safe state machine (`CONNECTING → STREAMING → DEGRADED → STALLED → RECONNECTING → DEAD`). A bad initial deployment fails fast: `lerobot-rollout` aborts before the robot moves if the handshake or validation fails. Once streaming, faults degrade in stages:
| Condition | Behavior |
| -------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
| Short network blip / late chunk | The robot rides its action buffer; state goes `DEGRADED` after `--inference.degraded_after_s` (default 1.0 s) without a fresh chunk |
| Buffered actions older than `max_action_age_s` | Stale actions are dropped (never executed); default `--inference.max_action_age_s=3.0` |
| Buffer runs dry (`STALLED`) | Fallback per `--inference.fallback`: `hold` (default — robot holds its last commanded position), `repeat_last`, or `zero` |
| Server liveliness lost / repeated request timeouts | `RECONNECTING`: re-handshake with exponential backoff (`reconnect_initial_backoff_s=0.5` doubling up to `reconnect_max_backoff_s=10.0`) |
| Reconnected server runs a different model/revision | Hard refusal (`DEAD`) — the client never executes wrong-model chunks |
| Offline longer than `max_offline_s` (default 60 s) | `DEAD`: the engine signals the rollout's shutdown event for a clean stop |
<Tip warning={true}>
`--inference.fallback=zero` is required for velocity-controlled robots: for them "send nothing" means "keep the last velocity", so an explicit zero command is the only safe stop. For position-controlled arms the default `hold` is safe.
</Tip>
Server restarts are equally graceful: on SIGTERM the server drops its liveliness token first (clients ride their buffers through the drain), finishes the in-flight inference, and exits. Clients reconnect when the replacement comes up.
## Serving multiple robots
`max_sessions` caps concurrent clients per server process. A single inference worker thread serializes GPU access and round-robins over sessions with a pending observation; per-client newest-wins mailboxes mean overload degrades into longer cycle times (larger but correct client-side delays), never into queue buildup.
A rough capacity estimate, keeping ~20% headroom:
```
N_robots ≈ 0.8 / (rate × inference_time)
```
where `rate` is each robot's chunk-request rate in Hz (how often the client's buffer dips below `buffer_time_s`) and `inference_time` is the server's seconds per chunk. For example, at 100 ms per chunk and ~2 chunk requests per second per robot: `N ≈ 0.8 / (2 × 0.1) = 4` robots.
The actual serving mode is classified per policy family, never inferred:
- **shared** — verified chunk-stateless policies (`act`, `pi0`, `pi05`, and `smolvla` with `n_obs_steps=1`) serve up to `max_sessions` clients from one policy instance.
- **exclusive** — stateful families (diffusion-family policies, `smolvla` with observation history, and any unverified policy) are forced to `max_sessions=1`. Run one server process per robot for these.
`serving_mode: auto` (the default) resolves this automatically; you may force `exclusive`, but `shared` can never override a stateful classification.
## Observability
With `health_port` set (default 9100), the server exposes:
- `GET /healthz` — `200 ok` while the inference worker is alive, `503` otherwise. Wire this to your orchestrator's liveness probe.
- `GET /metrics` — Prometheus text format: `lerobot_policy_server_requests_total`, `errors_total`, `superseded_total`, `dropped_unknown_client_total`, `sessions_opened_total`, `sessions_closed_total`, `active_sessions`, `server_load`.
Every inference request also emits one structured audit line on the `lerobot.policy_server.audit` logger:
```json
{
"session_id": "9f2c...",
"client_uuid": "robot-07",
"seq_id": 412,
"episode_id": 3,
"queue_wait_ms": 1.8,
"inference_ms": 93.2,
"superseded": 0,
"outcome": "ok"
}
```
`(session_id, seq_id)` correlates a server-side audit line with the client's request. Set a stable `--inference.client_uuid` per robot (instead of the default fresh UUID per run) for fleet-wide log correlation, and use `--inference.tags` to forward free-form labels in the handshake.
## Troubleshooting
**`No policy server answered status query at '@lerobot/...'`**
The client found no server under the key it dialed. Either the endpoint is wrong (check `--inference.connect_endpoint`, the router, and firewalls), or the **service namespace** does not match. The namespace is the `(model_id, revision, task)` triple: on the client it comes from `--inference.service_model_id` (default: `--policy.path`), `--inference.service_revision` (default: `main`), and `--inference.service_task` (default: the rollout `--task`); on the server from `model.repo_or_path`, `model.revision`, and `service_name` (default: a slug of `default_task`). A robot task string that differs from the server's `default_task` is the most common cause — fix the task, or pin the namespace explicitly with `--inference.service_task` on the client / `service_name` in the manifest.
**`Action name/order mismatch between server policy and this robot`**
The hard sync-safety contract: chunk columns map to motors **by order**, so the robot's ordered action keys must exactly equal the policy's `action_feature_names`. This fires when the robot type, motor naming, or rename map differs from the training setup. Use the same robot type (and rename map) the policy was trained with.
**`RTC requested but this server/policy does not support it — downgrading to chunk-append`**
Informational, not fatal. Enable RTC in the server manifest (`rtc.enabled: true`) and make sure the policy family is RTC-capable (`pi0`, `pi05`, `smolvla`). Otherwise, expect chunk-append behavior (see [RTC over the network](#rtc-over-the-network)).
**`server full: N/N sessions active`**
The session-open was rejected at capacity. Raise `max_sessions` (shared mode only), or point the robot at another server replica — the rejection includes the current load so orchestration can retry elsewhere.
-185
View File
@@ -1,185 +0,0 @@
# ROBOMETER
ROBOMETER is a **general-purpose video-language robotic reward model**. It predicts dense, frame-level task progress and frame-level success from a trajectory video and a task description.
**Paper**: [ROBOMETER: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons](https://arxiv.org/abs/2603.02115)
**Project**: [robometer.github.io](https://robometer.github.io/)
**Original code**: [github.com/robometer/robometer](https://github.com/robometer/robometer)
**Checkpoint**: [lerobot/Robometer-4B](https://huggingface.co/lerobot/Robometer-4B)
## Overview
ROBOMETER builds on `Qwen/Qwen3-VL-4B-Instruct` and adds three lightweight prediction heads:
- **Progress head**: predicts per-frame task progress in `[0, 1]`.
- **Success head**: predicts per-frame task success probability.
- **Preference head**: predicts which of two trajectories better completes the task during training.
The paper trains ROBOMETER with a composite objective:
```text
L = L_pref + L_prog + L_succ
```
The LeRobot integration is currently **inference-only**. It preserves the preference head so that the published `Robometer-4B` checkpoint loads without remapping, but `compute_reward()` queries the progress or success head only.
## What the LeRobot Integration Covers
- Standard `reward_model.type=robometer` configuration through LeRobot.
- Qwen3-VL image and text preprocessing through `RobometerEncoderProcessorStep`.
- LeRobot reward-model save/load APIs through `PreTrainedRewardModel`.
- Dense, frame-level progress and success predictions internally.
- A scalar reward through `compute_reward()` for downstream LeRobot reward-model usage.
This page focuses on using the published ROBOMETER checkpoint as a zero-shot reward model. Training ROBOMETER from scratch is outside the current LeRobot integration.
## Installation Requirements
1. Install LeRobot by following the [Installation Guide](./installation).
2. Install the ROBOMETER dependencies:
```bash
pip install -e ".[robometer]"
```
If you use `uv` directly from a source checkout:
```bash
uv sync --extra robometer
```
ROBOMETER uses a Qwen3-VL-4B backbone, so GPU inference is strongly recommended.
## Model Inputs and Outputs
ROBOMETER expects:
- A trajectory video or sequence of frames.
- A natural-language task description.
In LeRobot datasets, the preprocessor reads:
| Config field | Default | Meaning |
| ------------------------- | ------------------------ | ----------------------------------------------------- |
| `reward_model.image_key` | `observation.images.top` | Camera/video observation used by ROBOMETER |
| `reward_model.task_key` | `task` | Key in complementary data that stores the task string |
| `reward_model.max_frames` | `8` | Maximum number of frames passed to ROBOMETER |
The model predicts per-frame progress and success internally. The LeRobot reward API returns a scalar per sample:
- `reward_output="progress"` (default): return the last-frame progress, clamped to `[0, 1]`.
- `reward_output="success"`: return `1.0` if the last-frame success probability is above `success_threshold`, otherwise `0.0`.
## Usage
### Load the Reward Model Directly
```python
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
cfg = RobometerConfig(
pretrained_path="lerobot/Robometer-4B",
device="cuda",
reward_output="progress",
)
reward_model = RobometerRewardModel.from_pretrained(cfg.pretrained_path, config=cfg)
```
### Encode Frames and Compute a Reward
For a direct Python call, provide frames as `uint8` arrays with shape `(T, H, W, C)` and a task string:
```python
from lerobot.rewards.robometer.modeling_robometer import ROBOMETER_FEATURE_PREFIX
from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
# frames: np.ndarray, shape (T, H, W, C), dtype uint8
# task: str
encoder = RobometerEncoderProcessorStep(
base_model_id=cfg.base_model_id,
use_multi_image=cfg.use_multi_image,
use_per_frame_progress_token=cfg.use_per_frame_progress_token,
max_frames=cfg.max_frames,
)
encoded = encoder.encode_samples([(frames, task)])
batch = {f"{ROBOMETER_FEATURE_PREFIX}{key}": value for key, value in encoded.items()}
reward = reward_model.compute_reward(batch)
```
`reward` is a tensor of shape `(batch_size,)`.
### Use the Reward Factory
You can also instantiate ROBOMETER through the reward factory:
```python
from lerobot.rewards import make_reward_model, make_reward_model_config, make_reward_pre_post_processors
cfg = make_reward_model_config(
"robometer",
pretrained_path="lerobot/Robometer-4B",
device="cuda",
image_key="observation.images.top",
)
reward_model = make_reward_model(cfg)
preprocessor, postprocessor = make_reward_pre_post_processors(cfg)
```
The preprocessor writes Qwen-VL tensors under the `observation.robometer.*` namespace, and `compute_reward()` reads those encoded tensors.
## Configuration Notes
### Backbone and Vocabulary
The published checkpoint uses a Qwen3-VL-4B backbone. ROBOMETER adds five special tokens to the tokenizer in a fixed order:
```text
<|split_token|>
<|reward_token|>
<|pref_token|>
<|sim_token|>
<|prog_token|>
```
`<|prog_token|>` is inserted after each frame and is the hidden-state position used for per-frame progress and success prediction. `<|split_token|>` and `<|pref_token|>` are used by the paper's pairwise trajectory preference objective. `<|reward_token|>` and `<|sim_token|>` are preserved for checkpoint compatibility.
The LeRobot config stores a serialized `vlm_config` with the post-resize vocabulary so the model can reload from `config.json` without downloading the base Qwen weights first. For `Qwen/Qwen3-VL-4B-Instruct`, the tokenizer length is `151669`, and the five ROBOMETER tokens produce the checkpoint vocabulary size `151674`.
### Progress Prediction
In the published checkpoint, progress is discrete. The progress head outputs logits over `progress_discrete_bins=10` uniformly spaced bin centers in `[0, 1]`. LeRobot converts these logits into a continuous value by applying a softmax and taking the expectation over bin centers, matching the upstream ROBOMETER implementation.
### Success Prediction
The success head outputs raw logits per frame. LeRobot converts them to probabilities with `sigmoid`. When `reward_output="success"`, `compute_reward()` thresholds the last-frame success probability using `success_threshold`.
## Limitations
- The current LeRobot integration is inference-only; it does not implement ROBOMETER training or preference-pair training.
- `compute_reward()` returns a scalar per sample for the LeRobot reward-model API, even though ROBOMETER predicts per-frame progress and success internally.
- ROBOMETER is video-language based; it does not use privileged robot state such as contact forces or object poses.
## References
- [ROBOMETER project](https://robometer.github.io/)
- [ROBOMETER paper](https://arxiv.org/abs/2603.02115)
- [Original ROBOMETER code](https://github.com/robometer/robometer)
- [Published ROBOMETER-4B checkpoint](https://huggingface.co/lerobot/Robometer-4B)
- [Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)
## Citation
```bibtex
@inproceedings{liang2026robometer,
title = {Robometer: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons},
author={Anthony Liang and Yigit Korkmaz and Jiahui Zhang and Minyoung Hwang and Abrar Anwar and Sidhant Kaushik and Aditya Shah and Alex S. Huang and Luke Zettlemoyer and Dieter Fox and Yu Xiang and Anqi Li and Andreea Bobu and Abhishek Gupta and Stephen Tu and Erdem Biyik and Jesse Zhang},
year={2026},
booktitle={Robotics: Science and Systems 2026},
}
```
## License
This LeRobot integration follows the **Apache 2.0 License** used by LeRobot. Check the upstream ROBOMETER code and model pages for the licenses of the original implementation and released checkpoints.
+9 -9
View File
@@ -151,18 +151,18 @@ lerobot-rollout \
--device=cuda
```
## How It Relates to Remote Inference
## How It Differs from the Async Inference in LeRobot
Both RTC and [remote inference](./remote_inference) improve real-time robot control, but they solve different problems.
Both RTC and [async inference](./async) improve real-time robot control, but they solve different problems.
| Aspect | Remote Inference | RTC |
| ------------- | ------------------------------------------------------------------------ | --------------------------------------------------- |
| **Problem** | The policy is too large (or too slow) for the edge machine | Discontinuities between action chunks |
| **Solution** | Run inference on a GPU server; the robot executes buffered action chunks | Guide new chunks to continue smoothly from previous |
| **Benefit** | Weightless edge clients, one GPU serves many robots | Smooth transitions, natural motion |
| **Best Used** | Large models with high inference latency, robot fleets | Flow-matching based policies |
| Aspect | Async Inference | RTC |
| ------------- | -------------------------------------------------------------------------- | --------------------------------------------------- |
| **Problem** | Idle frames while waiting for inference | Discontinuities between action chunks |
| **Solution** | Decouple prediction from execution | Guide new chunks to continue smoothly from previous |
| **Benefit** | No waiting, continuous action | Smooth transitions, natural motion |
| **Best Used** | Async inference is best used with large models with high inference latency | Flow-matching based policies |
**Use both together** (`--inference.type=remote` with `--inference.rtc.execution_horizon=...`) for maximum smoothness and reactivity: the remote engine reuses RTC's chunk-merging machinery client-side while the server runs prefix-conditioned chunk generation.
**Use both together** for maximum smoothness and reactivity!
## Advanced: Debug Tracking
+28 -29
View File
@@ -46,7 +46,7 @@ This ensures identical task states map to consistent progress values, even acros
## Inputs and Targets (What the new code expects)
SARM is trained through its processor (`src/lerobot/rewards/sarm/processor_sarm.py`), which:
SARM is trained through its processor (`src/lerobot/policies/sarm/processor_sarm.py`), which:
- **Encodes** images and task text with CLIP (ViT-B/32) into `video_features` and `text_features`
- **Pads/truncates** robot state into `state_features` (up to `max_state_dim`)
@@ -347,7 +347,7 @@ Use `compute_rabc_weights.py` with `--visualize-only` to visualize model predict
<hfoption id="single_stage">
```bash
python -m lerobot.rewards.sarm.compute_rabc_weights \
python src/lerobot/policies/sarm/compute_rabc_weights.py \
--dataset-repo-id your-username/your-dataset \
--reward-model-path your-username/sarm-model \
--visualize-only \
@@ -360,7 +360,7 @@ python -m lerobot.rewards.sarm.compute_rabc_weights \
<hfoption id="dense_only">
```bash
python -m lerobot.rewards.sarm.compute_rabc_weights \
python src/lerobot/policies/sarm/compute_rabc_weights.py \
--dataset-repo-id your-username/your-dataset \
--reward-model-path your-username/sarm-model \
--visualize-only \
@@ -373,7 +373,7 @@ python -m lerobot.rewards.sarm.compute_rabc_weights \
<hfoption id="dual">
```bash
python -m lerobot.rewards.sarm.compute_rabc_weights \
python src/lerobot/policies/sarm/compute_rabc_weights.py \
--dataset-repo-id your-username/your-dataset \
--reward-model-path your-username/sarm-model \
--visualize-only \
@@ -429,7 +429,7 @@ The weighting follows **Equations 8-9** from the paper:
First, run the SARM model on all frames in your dataset to compute progress values:
```bash
python -m lerobot.rewards.sarm.compute_rabc_weights \
python src/lerobot/policies/sarm/compute_rabc_weights.py \
--dataset-repo-id your-username/your-dataset \
--reward-model-path your-username/sarm-model \
--head-mode sparse \
@@ -465,15 +465,15 @@ This script:
### Step 5b: Train Policy with RA-BC
Once you have the progress file, train your policy with RA-BC weighting. The progress file is auto-detected from the dataset path (`sarm_progress.parquet`) if not explicitly provided. Currently PI0, PI0.5 and SmolVLA are supported with RA-BC:
Once you have the progress file, train your policy with RA-BC weighting. The progress file is auto-detected from the dataset path (`sarm_progress.parquet`). Currently PI0, PI0.5 and SmolVLA are supported with RA-BC:
```bash
lerobot-train \
--dataset.repo_id=your-username/your-dataset \
--policy.type=pi0 \
--sample_weighting.type=rabc \
--sample_weighting.head_mode=sparse \
--sample_weighting.kappa=0.01 \
--use_rabc=true \
--rabc_head_mode=sparse \
--rabc_kappa=0.01 \
--output_dir=outputs/train/policy_rabc \
--batch_size=32 \
--steps=40000
@@ -488,13 +488,12 @@ The training script automatically:
**RA-BC Arguments:**
| Argument | Description | Default |
| ---------------------------------- | ------------------------------------------------------ | ----------------------- |
| `--sample_weighting.type` | Weighting strategy type (`rabc` or `uniform`) | `rabc` |
| `--sample_weighting.progress_path` | Path to progress parquet file | `sarm_progress.parquet` |
| `--sample_weighting.head_mode` | Which SARM head's progress to use: `sparse` or `dense` | `sparse` |
| `--sample_weighting.kappa` | Threshold κ for high-quality samples | `0.01` |
| `--sample_weighting.epsilon` | Small constant for numerical stability | `1e-6` |
| Argument | Description | Default |
| ---------------------- | ---------------------------------------------------------- | ---------------------------------- |
| `--use_rabc` | Enable RA-BC sample weighting | `false` |
| `--rabc_progress_path` | Path to progress parquet file (auto-detected from dataset) | `sarm_progress.parquet` in dataset |
| `--rabc_head_mode` | Which SARM head's progress to use: `sparse` or `dense` | `sparse` |
| `--rabc_kappa` | Threshold κ for high-quality samples | `0.01` |
### Tuning RA-BC Kappa
@@ -512,30 +511,30 @@ The `kappa` parameter is the threshold that determines which samples get full we
Monitor these WandB metrics during training:
| Metric | Healthy Range | Problem Indicator |
| ----------------------------- | ------------- | ------------------------- |
| `sample_weight_mean_weight` | 0.3 - 0.8 | ≈ 1.0 means kappa too low |
| `sample_weighting/delta_mean` | > 0 | Should be positive |
| `sample_weighting/delta_std` | > 0 | Variance in data quality |
| Metric | Healthy Range | Problem Indicator |
| ------------------ | ------------- | ------------------------- |
| `rabc_mean_weight` | 0.3 - 0.8 | ≈ 1.0 means kappa too low |
| `rabc_delta_mean` | > 0 | Should be positive |
| `rabc_delta_std` | > 0 | Variance in data quality |
**If `sample_weight_mean_weight ≈ 1.0`:** Your kappa is too low. Most samples have `delta > kappa` and bypass the soft-weighting entirely. RA-BC becomes equivalent to vanilla BC.
**If `rabc_mean_weight ≈ 1.0`:** Your kappa is too low. Most samples have `delta > kappa` and bypass the soft-weighting entirely. RA-BC becomes equivalent to vanilla BC.
**Setting kappa based on your data:**
The default `kappa=0.01` was tuned for the paper's T-shirt folding task (~90s episodes at 30fps). For your dataset, check the logged `sample_weighting/delta_mean` and `sample_weighting/delta_std`:
The default `kappa=0.01` was tuned for the paper's T-shirt folding task (~90s episodes at 30fps). For your dataset, check the logged `rabc_delta_mean` and `rabc_delta_std`:
```
# If delta_mean ≈ 0.03 and delta_std ≈ 0.02:
# Most deltas fall in range [0.01, 0.05]
# Option 1: Set kappa = delta_mean (medium selectivity)
--sample_weighting.kappa=0.03
--rabc_kappa=0.03
# Option 2: Set kappa = delta_mean + delta_std (high selectivity)
--sample_weighting.kappa=0.05
--rabc_kappa=0.05
# Option 3: Set kappa = delta_mean + 2*delta_std (very selective)
--sample_weighting.kappa=0.07
--rabc_kappa=0.07
```
**When RA-BC may not help:**
@@ -551,8 +550,8 @@ accelerate launch \
src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your-username/your-dataset \
--policy.type=pi0 \
--sample_weighting.type=rabc \
--sample_weighting.kappa=0.01 \
--use_rabc=true \
--rabc_kappa=0.01 \
--output_dir=outputs/train/policy_rabc \
--batch_size=32 \
--steps=40000
@@ -577,7 +576,7 @@ accelerate launch \
### RA-BC
1. **Train SARM first**: RA-BC quality depends entirely on SARM quality
2. **Monitor `sample_weight_mean_weight`**: If it's ≈ 1.0, increase kappa (see [Tuning RA-BC Kappa](#tuning-ra-bc-kappa))
2. **Monitor `rabc_mean_weight`**: If it's ≈ 1.0, increase kappa (see [Tuning RA-BC Kappa](#tuning-ra-bc-kappa))
---
+8 -8
View File
@@ -97,22 +97,22 @@ Similarly for when recording an episode, it is recommended that you are logged i
Once you are logged in, you can run inference in your setup by doing:
```bash
lerobot-rollout \
--strategy.type=base \
lerobot-record \
--robot.type=so101_follower \
--robot.port=/dev/ttyACM0 \ # <- Use your port
--robot.id=my_blue_follower_arm \ # <- Use your robot id
--robot.cameras="{ front: {type: opencv, index_or_path: 8, width: 640, height: 480, fps: 30}}" \ # <- Use your cameras
--task="Grasp a lego block and put it in the bin." \ # <- Use the same task description you used in your dataset recording
# <- RTC optional, use when running on low power hardware \
# --inference.type=rtc \
# --inference.rtc.execution_horizon=10 \
# --inference.rtc.max_guidance_weight=10.0 \
--dataset.single_task="Grasp a lego block and put it in the bin." \ # <- Use the same task description you used in your dataset recording
--dataset.repo_id=${HF_USER}/eval_DATASET_NAME_test \ # <- This will be the dataset name on HF Hub
--dataset.episode_time_s=50 \
--dataset.num_episodes=10 \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
# <- Teleop optional if you want to teleoperate in between episodes \
# --teleop.type=so100_leader \
# --teleop.port=/dev/ttyACM0 \
# --teleop.id=my_red_leader_arm \
# --display_data=true #optional use if you want to see the camera stream \
--policy.path=HF_USER/FINETUNE_MODEL_NAME # <- Use your fine-tuned model
```
+22 -22
View File
@@ -17,9 +17,9 @@ This makes `save_episode()` near-instant (the video is already encoded by the ti
| Parameter | CLI Flag | Type | Default | Description |
| ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- |
| `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture |
| `vcodec` | `--dataset.camera_encoder.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder |
| `vcodec` | `--dataset.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder |
| `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide |
| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `30` | Max buffered frames per camera (~1s at 30fps). Consumes RAM |
| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM |
## 3. Performance Considerations
@@ -48,7 +48,7 @@ This parameter controls how many threads each encoder instance uses internally:
### Backpressure and Frame Dropping
Each camera has a bounded queue (`encoder_queue_maxsize`, default 30 frames). When the encoder can't keep up:
Each camera has a bounded queue (`encoder_queue_maxsize`, default 60 frames). When the encoder can't keep up:
1. The queue fills up (consuming RAM)
2. New frames are **dropped** (not blocked) — the capture loop continues uninterrupted
@@ -82,15 +82,15 @@ Use HW encoding when:
### Available HW Encoders
| Encoder | Platform | Hardware | CLI Value |
| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | --------------------------------------------------- |
| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=h264_videotoolbox` |
| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=hevc_videotoolbox` |
| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=h264_nvenc` |
| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=hevc_nvenc` |
| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.camera_encoder.vcodec=h264_vaapi` |
| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.camera_encoder.vcodec=h264_qsv` |
| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.camera_encoder.vcodec=auto` |
| Encoder | Platform | Hardware | CLI Value |
| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ |
| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=h264_videotoolbox` |
| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=hevc_videotoolbox` |
| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=h264_nvenc` |
| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=hevc_nvenc` |
| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.vcodec=h264_vaapi` |
| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.vcodec=h264_qsv` |
| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto` |
> [!NOTE]
> In order to use the HW accelerated encoders you might need to upgrade your GPU drivers.
@@ -100,15 +100,15 @@ Use HW encoding when:
## 5. Troubleshooting
| Symptom | Likely Cause | Fix |
| ------------------------------------------------------------------ | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.camera_encoder.vcodec=auto`) |
| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.camera_encoder.vcodec=auto`). |
| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding |
| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows |
| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` |
| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.camera_encoder.vcodec=auto` |
| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. |
| Symptom | Likely Cause | Fix |
| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) |
| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`). |
| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding |
| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows |
| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` |
| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.vcodec=auto` |
| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. |
## 6. Recommended Configurations
@@ -146,7 +146,7 @@ On very constrained systems, streaming encoding may compete too heavily with the
# 2camsx 640x480x3 @30fps: Requires some tuning.
# Use H.264, disable streaming, consider batching encoding
lerobot-record --dataset.camera_encoder.vcodec=h264 --dataset.streaming_encoding=false ...
lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ...
```
## 7. Closing note
-210
View File
@@ -1,210 +0,0 @@
# Tools
LeRobot v3.1 supports **tool calls** in policies — assistant messages can
emit structured invocations like `say(text="OK, starting now")` that the
runtime dispatches to a real implementation (TTS, controller, logger, …).
This page covers:
1. Where the tool catalog lives.
2. How the annotation pipeline produces tool-call atoms.
3. How to add your own tool.
## Where tools are declared
Two layers.
**The catalog** — a list of OpenAI-style function schemas — lives at
`meta/info.json["tools"]` on each dataset. Example:
```json
{
"features": { "...": "..." },
"tools": [
{
"type": "function",
"function": {
"name": "say",
"description": "Speak a short utterance to the user via the TTS executor.",
"parameters": {
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "The verbatim text to speak."
}
},
"required": ["text"]
}
}
}
]
}
```
Read it via the dataset metadata accessor:
```python
from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
meta = LeRobotDatasetMetadata(repo_id="pepijn/super_poulain_final_annotations")
tools = meta.tools # list[dict] — OpenAI tool schemas
```
If the dataset's `info.json` doesn't declare any tools, `meta.tools`
returns `DEFAULT_TOOLS` from `lerobot.datasets.language` — currently a
single-entry list with the canonical `say` schema. So unannotated
datasets and chat-template consumers keep working without any
configuration:
```python
prompt_str = tokenizer.apply_chat_template(
sample["messages"],
tools=meta.tools, # works either way
add_generation_prompt=False,
tokenize=False,
)
```
**The implementations** — runnable Python — will live under
`src/lerobot/tools/`, one file per tool. The runtime dispatcher and
the canonical `say` implementation (wrapping Kyutai's pocket-tts) are
not part of the catalog layer described here; today this layer ships
only the schema storage and the `DEFAULT_TOOLS` fallback constant.
## Per-row tool _invocations_
The catalog above describes _what can be called_. The actual _call_ — the
function name plus the argument values — is stored per-row, on the
assistant atoms in `language_events`:
```python
{
"role": "assistant",
"content": null,
"style": null,
"timestamp": 12.4,
"camera": null,
"tool_calls": [
{ "type": "function",
"function": { "name": "say", "arguments": { "text": "On it." } } }
]
}
```
Recipes splice these into rendered messages via `tool_calls_from`:
```yaml
user_interjection_response:
bindings:
speech: "emitted_at(t, role=assistant, tool_name=say)"
messages:
- { role: user, content: "${task}", stream: high_level }
- {
role: assistant,
content: "${current_plan}",
stream: high_level,
target: true,
tool_calls_from: speech,
}
```
The model's training target is one assistant turn that carries both the
plan text _and_ the `say` tool call. At inference, the runtime parses
the generated text back into structured `tool_calls` and dispatches to
the matching implementation.
## How to add your own tool
> **Note:** Steps 2 and 3 below describe the runtime layer
> (`src/lerobot/tools/`, the `Tool` protocol, `TOOL_REGISTRY`,
> `get_tools(meta)`) which is not part of the catalog layer shipped
> today — those modules don't yet exist in the tree. Step 1 alone is
> enough to make the tool visible to the chat template via
> `meta.tools` so the model can learn to _generate_ the call;
> executing the call at inference requires the runtime layer.
Three steps. Concrete example: a `record_observation` tool the policy
can call to capture an extra observation outside the regular control
loop.
### Step 1 — declare the schema
Add an entry under `meta/info.json["tools"]`. Either edit the file
directly on disk _before_ running the annotation pipeline (it'll be
preserved) or hand it to `lerobot-annotate` via a config flag.
```json
{
"tools": [
{ "type": "function", "function": { "name": "say", "...": "..." } },
{
"type": "function",
"function": {
"name": "record_observation",
"description": "Capture a high-resolution still image for the user.",
"parameters": {
"type": "object",
"properties": {
"label": {
"type": "string",
"description": "Short label for the saved image."
}
},
"required": ["label"]
}
}
}
]
}
```
The schema follows OpenAI's function-calling convention exactly, so the
chat template can render it natively.
### Step 2 — implement the call
Create `src/lerobot/tools/record_observation.py`:
```python
from .base import Tool
from typing import Any
RECORD_OBSERVATION_SCHEMA: dict[str, Any] = { "...": "..." } # mirrors the JSON above
class RecordObservationTool:
name = "record_observation"
schema = RECORD_OBSERVATION_SCHEMA
def __init__(self, schema: dict | None = None, output_dir: str = "."):
self.output_dir = output_dir
def call(self, arguments: dict) -> str:
label = arguments["label"]
# ... save the latest camera frame to <output_dir>/<label>.png ...
return f"saved {label}.png"
```
One file per tool keeps dependencies isolated — `record_observation`
might pull `pillow`, while `say` pulls `pocket-tts`. Users installing
only the tools they need avoid heavy transitive deps.
### Step 3 — register it
Add to `src/lerobot/tools/registry.py`:
```python
from .record_observation import RecordObservationTool
TOOL_REGISTRY["record_observation"] = RecordObservationTool
```
That's it. At runtime `get_tools(meta)` looks up each schema in
`meta.tools`, instantiates the matching registered class, and returns
a name → instance dict the dispatcher can route into.
If you want to use a tool _without_ writing an implementation (e.g. for
training-time chat-template formatting only), step 1 alone is enough —
the model still learns to _generate_ the call. Steps 2 and 3 are only
needed to actually _execute_ it at inference.
-177
View File
@@ -1,177 +0,0 @@
# TOPReward
TOPReward is a **zero-shot reward model** that extracts token log-probabilities from an off-the-shelf vision-language model (VLM) as a robotic reward signal. Given a video trajectory and a task instruction, it returns the VLM's log-likelihood that the instruction is true — no fine-tuning required.
**Paper**: [TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics](https://arxiv.org/abs/2602.19313)
**Project**: [topreward.github.io](https://topreward.github.io/webpage/)
**Original code**: [github.com/TOPReward/TOPReward](https://github.com/TOPReward/TOPReward)
**Default backbone**: [Qwen/Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)
## Overview
TOPReward asks a generic VLM how likely a task instruction is, **conditioned on the video** of a robot trying to complete that task. Concretely, given:
- A trajectory video (a sequence of frames).
- A task instruction (e.g. _"open the drawer"_).
it builds a chat prompt of the form
```text
<video>
"The above video shows a robot manipulation trajectory that completes the
following task: <instruction> Decide whether the above statement is True
or not. The answer is: True"
```
forwards it through the VLM, label-masks everything except the very last token, and reads back the log-probability of that token — by default the literal `"True"` that closes the suffix template. The resulting `log P("True" | video + prompt + instruction)` is the reward.
Because the method only depends on a frozen VLM, TOPReward is **zero-shot**: there are no fine-tuned weights to host. The "model" in LeRobot is a small wrapper around `transformers`' `Qwen3VLForConditionalGeneration` plus the label-masking logic. The processor owns the tokeniser and builds the full chat prompt (EO-1/Robometer pattern).
## What the LeRobot integration covers
- Standard `reward_model.type=topreward` configuration through LeRobot.
- VLM loading via the `transformers` `Qwen3VLForConditionalGeneration` API.
- Prompt assembly + tokenisation in the processor (matching upstream `QwenClient.compute_instruction_reward`).
- `compute_reward()` returns one scalar log-prob per sample.
- LeRobot reward-model save/load — `save_pretrained` writes only `config.json` (the VLM is identified by `vlm_name`).
- An offline labeling script that writes a `topreward_progress.parquet` (SARM-compatible schema) for RA-BC and overlay.
The current LeRobot port supports the **Qwen3-VL client only**. Other upstream clients (Gemini, OpenAI, Gemma, Molmo) can be added as follow-up extras.
## Installation Requirements
1. Install LeRobot following the [Installation Guide](./installation).
2. Install the TOPReward optional extra:
```bash
pip install -e ".[topreward]"
```
or, with `uv` from a source checkout:
```bash
uv sync --extra topreward
```
This pulls in `transformers`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended.
## Model Inputs and Outputs
TOPReward expects:
- A trajectory video or sequence of frames.
- A natural-language task description.
In LeRobot datasets the preprocessor reads:
| Config field | Default | Meaning |
| ------------------------- | --------------------------- | --------------------------------------------- |
| `reward_model.image_key` | `observation.images.top` | Camera observation used by TOPReward |
| `reward_model.task_key` | `task` | Key in complementary data for the task string |
| `reward_model.max_frames` | `16` | Cap on frames per sample |
| `reward_model.fps` | `2.0` | Metadata passed to the Qwen video processor |
| `reward_model.vlm_name` | `Qwen/Qwen3-VL-8B-Instruct` | Hugging Face Hub id of the underlying VLM |
The model returns:
- `compute_reward(batch)`: one log-probability per sample. Higher = better task-video alignment. When `success_threshold` is finite, returns the binary thresholded value instead.
## Usage
### Load the reward model directly
```python
from lerobot.rewards.topreward import TOPRewardConfig, TOPRewardModel
cfg = TOPRewardConfig(
vlm_name="Qwen/Qwen3-VL-8B-Instruct",
device="cuda",
)
reward_model = TOPRewardModel(cfg)
```
### Use the reward factory
```python
from lerobot.rewards import make_reward_model, make_reward_model_config, make_reward_pre_post_processors
cfg = make_reward_model_config(
"topreward",
vlm_name="Qwen/Qwen3-VL-8B-Instruct",
device="cuda",
image_key="observation.images.top",
)
reward_model = make_reward_model(cfg)
preprocessor, postprocessor = make_reward_pre_post_processors(cfg)
```
The preprocessor tokenises the full prompt (video + prefix + instruction suffix), writes Qwen-VL tensors + `prompt_length` under `observation.topreward.*`. The model reads those tensors, label-masks based on `prompt_length`, and extracts the log-prob reward.
### Offline dataset labeling
Write a `topreward_progress.parquet` for RA-BC training and overlay videos:
```bash
# Sparse-dense (15 anchors per episode, matches upstream)
uv run python -m lerobot.rewards.topreward.compute_rabc_weights \
--dataset-repo-id lerobot/libero_10_image \
--num-samples 15 \
--device cuda
```
Then render the progress overlay for any episode:
```bash
uv run examples/dataset/create_progress_videos.py \
--repo-id lerobot/libero_10_image \
--episode 0 \
--progress-file topreward_progress.parquet \
--gif
```
## Configuration Notes
### Prompt knobs
The default prompt mirrors the upstream paper:
```text
prompt_prefix = "The above video shows a robot manipulation trajectory that completes the following task: "
prompt_suffix_template = "{instruction} Decide whether the above statement is True or not. The answer is: True"
```
Both are exposed on `TOPRewardConfig` for ablation. The suffix template **must** contain `{instruction}`.
### Chat template
`add_chat_template=True` wraps the full prompt (including instruction) with the tokenizer's chat template before tokenisation. Default is `False`, matching the upstream paper's main experiments.
## Limitations
- The current LeRobot port is **inference-only and zero-shot**; `forward()` is not overridden and `is_trainable` returns `False`.
- Only the **Qwen3-VL family** is supported; other upstream clients are out of scope.
- TOPReward inherits the underlying VLM's biases.
## References
- [TOPReward project page](https://topreward.github.io/webpage/)
- [TOPReward paper](https://arxiv.org/abs/2602.19313)
- [Original TOPReward code](https://github.com/TOPReward/TOPReward)
- [Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)
## Citation
```bibtex
@article{chen2026topreward,
title={TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics},
author={Chen, Shirui and Harrison, Cole and Lee, Ying-Chun and Yang, Angela Jin and
Ren, Zhongzheng and Ratliff, Lillian J and Duan, Jiafei and Fox, Dieter and
Krishna, Ranjay},
journal={arXiv preprint arXiv:2602.19313},
year={2026}
}
```
## License
The original TOPReward codebase is MIT-licensed. The LeRobot port follows the LeRobot Apache 2.0 license; the wrapped Qwen3-VL weights are subject to the original Qwen license.
+9 -5
View File
@@ -117,10 +117,10 @@ lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
--operation.type convert_image_to_video \
--operation.output_dir outputs/pusht_video \
--operation.camera_encoder.vcodec libsvtav1 \
--operation.camera_encoder.pix_fmt yuv420p \
--operation.camera_encoder.g 2 \
--operation.camera_encoder.crf 30
--operation.vcodec libsvtav1 \
--operation.pix_fmt yuv420p \
--operation.g 2 \
--operation.crf 30
# Convert only specific episodes
lerobot-edit-dataset \
@@ -147,7 +147,11 @@ lerobot-edit-dataset \
**Parameters:**
- `output_dir`: Custom output directory (optional - by default uses `new_repo_id` or `{repo_id}_video`)
- `camera_encoder`: Video encoder settings — all sub-fields accessible via `--operation.camera_encoder.<field>. See [Video Encoding Parameters](./video_encoding_parameters) for more details.
- `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`)
- `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`)
- `g`: Group of pictures (GOP) size - lower values give better quality but larger files (default: 2)
- `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30)
- `fast_decode`: Fast decode tuning option (default: 0)
- `episode_indices`: List of specific episodes to convert (default: all episodes)
- `num_workers`: Number of parallel workers for processing (default: 4)
-117
View File
@@ -1,117 +0,0 @@
# Video encoding parameters
When video storage is enabled, LeRobot stores each camera stream as an **MP4** file instead of saving one image file per timestep. Video encoding compresses across time, which usually cuts dataset size and I/O compared to a pile of PNG, while keeping MP4 — a format every player and loader understands.
Encoding frames into an MP4 is a full FFmpeg pipeline: choice of encoder, pixel format, GOP/keyframes, quality vs. speed, and optional extra encoder flags. Most of these knobs are user-tunable through `camera_encoder`, a nested `VideoEncoderConfig` (`lerobot.configs.video.VideoEncoderConfig`) passed through PyAV.
You can set these parameters from the CLI with `--dataset.camera_encoder.<field>` (e.g. with `lerobot-record` or `lerobot-rollout`). The same block applies to every camera video stream in that run.
<Tip>
Video storage must be on for `camera_encoder` to have any effect —
`use_videos=True` in Python APIs, or `--dataset.video=true` on the CLI (the
recording default). With video off, inputs stay as images and `camera_encoder`
is ignored.
</Tip>
For details on **when** frames are written vs. encoded (streaming vs. post-episode), queues, and other top-level `--dataset.*` switches, see [Streaming Video Encoding](./streaming_video_encoding). For an encoding-parameter comparison and experiments, see the [video-benchmark Space](https://huggingface.co/spaces/lerobot/video-benchmark).
---
## Example
```bash
lerobot-record \
--robot.type=so100_follower \
--robot.port=/dev/tty.usbmodem58760431541 \
--robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
--robot.id=black \
--teleop.type=so100_leader \
--teleop.port=/dev/tty.usbmodem58760431551 \
--teleop.id=blue \
--dataset.repo_id=<my_username>/<my_dataset_name> \
--dataset.num_episodes=2 \
--dataset.single_task="Grab the cube" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
--dataset.camera_encoder.vcodec=h264 \
--dataset.camera_encoder.preset=fast \
--dataset.camera_encoder.extra_options={"tune": "film", "profile:v": "high", "bf": 2} \
--display_data=true
```
---
## Tuning parameters
<Tip warning={true}>
The defaults are tuned to balance **compression ratio**, **visual quality**, and **decoding/seek speed** for typical robotics datasets. Changing them can affect both recording (CPU load, frame drops) and training (decoding throughput, image quality).
Only override these parameters if you have a specific reason to, and measure the impact on your pipeline before relying on the new settings.
</Tip>
All flags below are prefixed with `--dataset.camera_encoder.` on the CLI.
| Parameter | Type | Default | Description |
| --------------- | ---------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vcodec` | `str` | `"libsvtav1"` | Video codec name. `"auto"` picks the first available hardware encoder from a fixed preference list, falling back to `libsvtav1`. |
| `pix_fmt` | `str` | `"yuv420p"` | Output pixel format. Must be supported by the chosen codec in your FFmpeg build. |
| `g` | `int` | `2` | GOP size — a keyframe every `g` frames. Emitted as FFmpeg option `g`. |
| `crf` | `int` or `float` | `30` | Abstract quality value, mapped per codec (see the [mapping](#mapping-videoencoderconfig--ffmpeg-options) below). Lower → higher quality / larger output where the mapping is monotone. |
| `preset` | `int` or `str` | `12` \* | Encoder speed preset; meaning depends on the codec. <br/>\* When unset and `vcodec=libsvtav1`, LeRobot defaults to `12`. |
| `fast_decode` | `int` | `0` | `libsvtav1`: `02`, passed via `svtav1-params`. <br/>`h264` / `hevc` (software): if `>0`, sets `tune=fastdecode`. <br/>Other codecs: usually unused. |
| `video_backend` | `str` | `"pyav"` | Only `"pyav"` is currently implemented for video encoding. |
| `extra_options` | `dict` | `{}` | Extra FFmpeg or codec specific options merged after the structured fields above. Cannot override keys already set by those fields. |
---
## Persistence in dataset metadata
After the first episode of a video stream is encoded, the encoder configuration is **persisted into the dataset metadata** (`meta/info.json`) under each video feature, alongside the values probed from the file itself. For a video feature `observation.images.<camera>`, the layout in `info.json` is:
```json
{
"features": {
"observation.images.laptop": {
"dtype": "video",
"shape": [480, 640, 3],
"info": {
"video.height": 480,
"video.width": 640,
"video.codec": "h264",
"video.pix_fmt": "yuv420p",
"video.fps": 30,
"video.channels": 3,
"video.is_depth_map": false,
"video.g": 2,
"video.crf": 30,
"video.preset": "fast",
"video.fast_decode": 0,
"video.video_backend": "pyav",
"video.extra_options": { "tune": "film", "profile:v": "high", "bf": 2 }
}
}
}
}
```
Two sources contribute to the `info` block:
- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `video.is_depth_map`, plus `audio.*` if an audio stream is present.
- **Encoder-derived** (taken from `VideoEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`.
<Tip>
This block is populated **once**, from the **first** episode. It assumes every
episode in the dataset was encoded with the same `camera_encoder`. Changing
encoder settings partway through a recording is not supported — the
`info.json` will only reflect the parameters used for the first episode.
</Tip>
---
## Merging datasets
When aggregating datasets with `merge_datasets`, video files are concatenated as-is (no re-encoding), and encoder fields in `info.json` are merged per-key:
- **Stream-derived fields must match** across sources: `video.codec`, `video.pix_fmt`, `video.height`, `video.width`, `video.fps`. Otherwise FFmpeg's concat demuxer fails.
- **Encoder-tuning fields are merged loosely**: `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.extra_options`. If every source agrees, the value is kept; if not, it's set to `null` (or `{}` for `video.extra_options`) and a warning is logged.
-235
View File
@@ -1,235 +0,0 @@
# VLA-JEPA
This is the LeRobot port of **VLA-JEPA**, a Vision-Language-Action model that combines a Qwen3-VL language backbone with a self-supervised video world model (V-JEPA2) and a flow-matching DiT action head.
---
## Architecture Overview
VLA-JEPA has three main components:
| Component | Module | Role |
| ----------------------- | --------------------------------- | ------------------------------------------------------- |
| **Qwen3-VL backbone** | `Qwen3VLInterface` | Fuses images + language instruction into context tokens |
| **DiT-B action head** | `VLAJEPAActionHead` | Flow-matching diffusion over the action chunk |
| **V-JEPA2 world model** | `ActionConditionedVideoPredictor` | Self-supervised video prediction loss (training only) |
### Data flow
**Training:**
1. A video clip of `num_video_frames` frames is encoded by V-JEPA2 into per-frame patch tokens.
2. The Qwen3-VL backbone processes multi-view images + the task instruction and produces a sequence of context tokens that includes special action tokens (for world model conditioning) and embodied tokens.
3. The action head receives those context tokens as cross-attention keys/values and predicts a denoised action chunk via flow matching.
4. The world model predictor uses the action tokens extracted from Qwen to predict future V-JEPA2 frame embeddings; a regression loss on those predictions is added to the action loss.
**Inference:**
Only Qwen + the action head are used. The world model is not needed at inference time.
### Action head details
Available presets via `action_model_type`:
| Preset | Hidden dim | Heads | Head dim |
| ------- | ---------- | ----- | -------- |
| `DiT-B` | 768 | 12 | 64 |
| `DiT-L` | 1536 | 32 | 48 |
### World model details
The video predictor is a ViT-style transformer (`ActionConditionedVideoPredictor`) that takes:
- **Frame tokens**: V-JEPA2 patch embeddings projected to `predictor_embed_dim`
- **Action tokens**: Qwen action token embeddings projected to `predictor_embed_dim`
It uses block-causal attention so each temporal step can attend to all previous steps. The predictor's input `embed_dim` equals `num_views × video_encoder_hidden_size` (e.g. 2 views × 1024 = 2048 for the pretrained checkpoints).
---
## Pretrained Checkpoints
Three checkpoints are available directly inside the LeRobot org here: [`lerobot/VLA-JEPA`](https://huggingface.co/collections/lerobot/vla-jepa), converted from [ginwind/VLA-JEPA](https://huggingface.co/ginwind/VLA-JEPA):
| Checkpoint | Dataset | Cameras | World model | Action dim |
| ----------------------------- | ----------------- | ----------------------- | ----------- | ---------- |
| `lerobot/VLA-JEPA-LIBERO` | LIBERO-10 | 2 (agentview + wrist) | Enabled | 7 |
| `lerobot/VLA-JEPA-Pretrain` | DROID 1.0.1 | 2 (exterior left views) | Enabled | 7 |
| `lerobot/VLA-JEPA-SimplerEnv` | OXE Bridge / RT-1 | 1 (view duplicated ×2) | Enabled | 7 |
All checkpoints use `Qwen/Qwen3-VL-2B-Instruct` as the language backbone.
---
## Configuration
Key parameters in `VLAJEPAConfig`:
| Parameter | Default | Description |
| ------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `chunk_size` | 7 | Number of actions predicted per inference call |
| `n_action_steps` | 7 | Steps executed from the predicted chunk before re-planning |
| `num_video_frames` | 8 | Video clip length fed to the world model |
| `enable_world_model` | `True` | Whether to load and train the V-JEPA2 predictor |
| `world_model_loss_weight` | 0.1 | Weight of the JEPA prediction loss relative to the action loss |
| `num_inference_timesteps` | 4 | Euler integration steps for action denoising |
| `freeze_qwen` | `False` | Freeze the Qwen3-VL backbone and only train the action head |
| `reinit_modules` | `None` | Key prefixes allowed to be randomly re-initialised on load (for cross-embodiment transfer, see [Fine-tuning on a different embodiment](#fine-tuning-on-a-different-embodiment)) |
| `gripper_dim` | 6 | Index of the gripper dimension in the action vector (e.g. 6 for a 7-DoF arm with gripper as the last joint) |
| `gripper_threshold` | 0.5 | Threshold used by `pre_snap_gripper_action` and `binarize_gripper_action` to binarize the gripper dimension |
| `pre_snap_gripper_action` | `True` | Snap the gripper dim to {0, 1} before unnormalization. Set to `False` for robots without a binary gripper |
| `binarize_gripper_action` | `True` | Binarize the gripper dim to {-1, 1} after unnormalization. Set to `False` for robots without a binary gripper |
---
## Training
Number of training steps may vary based on dataset size and compute budget. The original paper pretrained for 50k on ssv2 + droid jointly, then additional 30k steps for LIBERO, but fewer steps may still yield good performance when fine-tuning from the provided pretrained checkpoints.
### Full training from scratch
```bash
lerobot-train \
policy.type=vla_jepa \
policy.repo_id=your_org/your_repo \
dataset.repo_id=your_org/your_dataset
```
### Fine-tuning from a pretrained checkpoint
```bash
lerobot-train \
--policy.path=lerobot/VLA-JEPA-Pretrain \
--policy.repo_id=your_org/your_repo \
--dataset.repo_id=your_org/your_dataset
```
If you want to freeze the Qwen backbone and only train the action head, set `policy.freeze_qwen=True`:
```bash
lerobot-train \
--policy.path=lerobot/VLA-JEPA-Pretrain \
--policy.repo_id=your_org/your_repo \
--policy.freeze_qwen=true \
--dataset.repo_id=your_org/your_dataset
```
### Fine-tuning on a different embodiment
When the target robot has a different action or state dimensionality than the pretrained checkpoint, the input/output projection layers of the action head will have mismatched shapes and cannot be loaded directly. `reinit_modules` lets you list the key prefixes that are allowed to mismatch — those layers are randomly re-initialised while every other weight is reused from the checkpoint. Any shape mismatch outside the listed prefixes raises an error.
The layers that depend on `action_dim` and `state_dim` are:
| Layer | Key prefix |
| ----------------------------------------- | ----------------------------------- |
| Action encoder (action_dim → inner_dim) | `model.action_model.action_encoder` |
| Action decoder (hidden_size → action_dim) | `model.action_model.action_decoder` |
| State encoder (state_dim → inner_dim) | `model.action_model.state_encoder` |
```bash
lerobot-train \
--policy.path=lerobot/VLA-JEPA-Pretrain \
--policy.repo_id=your_org/your_repo \
--policy.freeze_qwen=true \
--policy.reinit_modules='["model.action_model.action_encoder", "model.action_model.action_decoder", "model.action_model.state_encoder"]' \
--dataset.repo_id=your_org/your_dataset
```
If your robot has no proprioceptive state, omit `model.action_model.state_encoder` from the list.
### Reproducing the LIBERO results
**Training on LIBERO:**
starts the training from the Pretrain checkpoint, trains for 30k steps on the LIBERO dataset.
Original paper mentions training across 8 GPUs with a batch size of 32, meaning global batch size of 256.
```bash
lerobot-train \
--policy.path=lerobot/VLA-JEPA-Pretrain \
--policy.repo_id=your_org/your_repo \
--dataset.repo_id=HuggingFaceVLA/libero \
--steps=30000
```
**Evaluating the pretrained LIBERO-10 checkpoint:**
```bash
lerobot-eval \
--policy.path=lerobot/VLA-JEPA-LIBERO \
--env.type=libero \
--env.task=libero_spatial,libero_object,libero_goal,libero_10 \
--eval.n_episodes=10 \
--eval.batch_size=5
```
To evaluate a subset of tasks only:
```bash
lerobot-eval \
--policy.path=lerobot/VLA-JEPA-LIBERO \
--env.type=libero \
--env.task=libero_10 \
--env.task_ids='[0,1,2]' \
--eval.n_episodes=10 \
--eval.batch_size=5
```
**Expected results:**
| Suite | Episodes | Successes | Success Rate |
| -------------- | -------- | --------- | ------------ |
| libero_spatial | 100 | 93 | **95.0%** |
| libero_object | 100 | 100 | **100.0%** |
| libero_goal | 100 | 98 | **98.0%** |
| libero_10 | 100 | 96 | **93.0%** |
| **Overall** | **400** | **387** | **96.5%** |
---
## Fine-tuning on datasets with a different number of cameras
The pretrained world model predictor was trained with `embed_dim = jepa_tubelet_size × 1024` (default `jepa_tubelet_size=2`).
**Default behaviour — view padding / trimming (no action required)**
When fine-tuning from `VLA-JEPA-Pretrain` the model automatically adjusts the number of views fed to the world model to match `jepa_tubelet_size`:
- **Single-view datasets (e.g. BridgeV2):** the single-view latent is duplicated to produce a two-view world-model input, preserving the JEPA self-supervised signal without any weight mismatch.
- **>2-view datasets (e.g. DROID with 3 views):** all views are passed to the Qwen backbone (for richer context), but only the first `jepa_tubelet_size` views (one wrist + one third-person, following the configured view order) are used for the world model.
**Option 1 — Disable the world model**
Set `enable_world_model=False` to skip the JEPA loss entirely. Only the Qwen backbone and action head are loaded and trained. This is sufficient for good action performance.
```bash
lerobot-train \
--policy.path=lerobot/VLA-JEPA-Pretrain \
--policy.enable_world_model=false \
--policy.repo_id=your_org/your_repo \
--dataset.repo_id=your_org/single_camera_dataset
```
**Option 2 — Reinitialize the predictor input projection**
If you want to change `jepa_tubelet_size` to a value other than 2, load the checkpoint with `strict=False` and reinitialize `model.video_predictor.predictor_embed` for the new `embed_dim`. All other predictor block weights (attention, MLP, norm, output projection) are camera-count-agnostic and can be reused from the pretrained checkpoint.
---
## Citation
```bibtex
@misc{sun2026vlajepaenhancingvisionlanguageactionmodel,
title = {VLA-JEPA: Enhancing Vision-Language-Action Model with Latent World Model},
author = {Jingwen Sun and Wenyao Zhang and Zekun Qi and Shaojie Ren and Zezhi Liu and Hanxin Zhu and Guangzhong Sun and Xin Jin and Zhibo Chen},
year = {2026},
eprint = {2602.10098},
archivePrefix = {arXiv},
primaryClass = {cs.RO},
url = {https://arxiv.org/abs/2602.10098},
}
```
---
## License
Weights are distributed under the license terms of the original [ginwind/VLA-JEPA](https://huggingface.co/ginwind/VLA-JEPA) repository (**Apache 2.0 License**). The LeRobot integration code follows the **Apache 2.0 License**.
+4 -4
View File
@@ -220,7 +220,7 @@ REAL_DIM = 12
# Postprocessing: Trim 20D predictions to 12D for deployment
```
See the [action_hub.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/action_hub.py) implementation for details.
See the [action_hub.py](/home/jade_choghari/robot/lerobot/src/lerobot/policies/xvla/action_hub.py) implementation for details.
#### Auto Action Mode (Recommended)
@@ -519,9 +519,9 @@ If you use X-VLA in your research, please cite:
- [X-VLA Paper](https://arxiv.org/pdf/2510.10274)
- [LeRobot Documentation](https://github.com/huggingface/lerobot)
- [Action Registry Implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/action_hub.py)
- [Processor Implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/processor_xvla.py)
- [Model Configuration](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/configuration_xvla.py)
- [Action Registry Implementation](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/action_hub.py)
- [Processor Implementation](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/processor_xvla.py)
- [Model Configuration](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/configuration_xvla.py)
## Contributing
+15 -37
View File
@@ -15,12 +15,10 @@
# limitations under the License.
"""
Create MP4 (or GIF) videos with per-frame progress overlay for specified episodes.
Create MP4 (or GIF) videos with sarm_progress overlay for specified episodes.
Downloads datasets from HuggingFace, seeks directly into the episode segment
of the source video, draws a progress line on each frame, and writes the result.
The progress data is read from a parquet file that lives alongside the dataset
(configurable via ``--progress-file``).
Usage:
python examples/dataset/create_progress_videos.py \
@@ -58,26 +56,22 @@ SCORE_FONT_SCALE = 0.8
TASK_FONT_SCALE = 0.55
def download_episode_metadata(
repo_id: str, episode: int, progress_file: str = "sarm_progress.parquet"
) -> Path:
"""Download only the metadata and per-frame progress file for a dataset.
def download_episode_metadata(repo_id: str, episode: int) -> Path:
"""Download only the metadata and sarm_progress files for a dataset.
Args:
repo_id: HuggingFace dataset repository ID.
episode: Episode index (used for logging only; all meta is fetched).
progress_file: Filename of the per-frame progress parquet inside the
dataset repo.
Returns:
Local cache path for the downloaded snapshot.
"""
logging.info("[1/4] Downloading metadata + %s for %s (episode %d) ...", progress_file, repo_id, episode)
logging.info("[1/4] Downloading metadata for %s (episode %d) ...", repo_id, episode)
local_path = Path(
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
allow_patterns=["meta/**", progress_file],
allow_patterns=["meta/**", "sarm_progress.parquet"],
ignore_patterns=["*.mp4"],
)
)
@@ -221,28 +215,25 @@ def download_video_file(repo_id: str, local_path: Path, video_rel: str) -> Path:
return video_path
def load_progress_data(
local_path: Path, episode: int, progress_file: str = "sarm_progress.parquet"
) -> np.ndarray | None:
"""Load per-frame progress values for an episode.
def load_progress_data(local_path: Path, episode: int) -> np.ndarray | None:
"""Load sarm_progress values for an episode.
Args:
local_path: Dataset cache root.
episode: Episode index.
progress_file: Filename of the per-frame progress parquet.
Returns:
Sorted (N, 2) array of (frame_index, progress), or None if unavailable.
"""
parquet_path = local_path / progress_file
parquet_path = local_path / "sarm_progress.parquet"
if not parquet_path.exists():
logging.warning("%s not found", progress_file)
logging.warning("sarm_progress.parquet not found")
return None
df = pd.read_parquet(parquet_path)
logging.info(" %s columns: %s", progress_file, list(df.columns))
logging.info(" sarm_progress.parquet columns: %s", list(df.columns))
episode_df = df[df["episode_index"] == episode].copy()
if episode_df.empty:
logging.warning("No progress rows for episode %d in %s", episode, progress_file)
logging.warning("No sarm_progress rows for episode %d", episode)
return None
episode_df = episode_df.sort_values("frame_index")
@@ -585,7 +576,6 @@ def process_dataset(
camera_key: str | None,
output_dir: Path,
create_gif: bool = False,
progress_file: str = "sarm_progress.parquet",
) -> Path | None:
"""Full pipeline: download, extract metadata, composite progress, write output.
@@ -595,8 +585,6 @@ def process_dataset(
camera_key: Camera key to use, or None for auto-selection.
output_dir: Directory to write output files.
create_gif: If True, also generate a GIF from the MP4.
progress_file: Filename of the per-frame progress parquet inside the
dataset repo.
Returns:
Path to the final output file, or None on failure.
@@ -604,7 +592,7 @@ def process_dataset(
safe_name = repo_id.replace("/", "_")
logging.info("Processing: %s | episode %d", repo_id, episode)
local_path = download_episode_metadata(repo_id, episode, progress_file)
local_path = download_episode_metadata(repo_id, episode)
logging.info(" Local cache: %s", local_path)
episode_meta = load_episode_meta(local_path, episode, camera_key)
@@ -612,9 +600,9 @@ def process_dataset(
video_path = download_video_file(repo_id, local_path, episode_meta["video_rel"])
progress_data = load_progress_data(local_path, episode, progress_file)
progress_data = load_progress_data(local_path, episode)
if progress_data is None:
logging.error("Could not load progress data from %s. Skipping overlay.", progress_file)
logging.error("Could not load sarm_progress data. Skipping overlay.")
return None
logging.info(" Progress frames: %d", len(progress_data))
@@ -639,7 +627,7 @@ def process_dataset(
def main() -> None:
parser = argparse.ArgumentParser(
description="Create MP4/GIF videos with per-frame progress overlay for dataset episodes."
description="Create MP4/GIF videos with sarm_progress overlay for dataset episodes."
)
parser.add_argument(
"--repo-id",
@@ -670,15 +658,6 @@ def main() -> None:
action="store_true",
help="Also generate a GIF from the MP4 output.",
)
parser.add_argument(
"--progress-file",
type=str,
default="sarm_progress.parquet",
help=(
"Filename of the per-frame progress parquet inside the dataset repo "
"(default: 'sarm_progress.parquet')."
),
)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
@@ -691,7 +670,6 @@ def main() -> None:
camera_key=args.camera_key,
output_dir=args.output_dir,
create_gif=args.gif,
progress_file=args.progress_file,
)
if result:
+1 -1
View File
@@ -69,7 +69,7 @@ class ComputeProgressShards(PipelineStep):
import torch
from tqdm import tqdm
from lerobot.rewards.sarm.compute_rabc_weights import (
from lerobot.policies.sarm.compute_rabc_weights import (
generate_all_frame_indices,
interpolate_progress,
load_sarm_resources,
+2 -29
View File
@@ -80,7 +80,7 @@
"}\n",
"\n",
"# Dataset\n",
"HF_USER = \"your_hf_username\" # `hf auth whoami` to find your username\n",
"HF_USER = \"your_hf_username\" # `huggingface-cli whoami` to find your username\n",
"DATASET_NAME = \"my_so101_dataset\"\n",
"TASK_DESCRIPTION = \"pick and place the block\"\n",
"NUM_EPISODES = 10\n",
@@ -291,34 +291,7 @@
"\n",
"Uses `POLICY_PATH` from the Configuration cell (defaults to the Hub repo ID). You can also put there the `LAST_CHECKPOINT_PATH`.\n",
"\n",
"See the [inference docs](https://huggingface.co/docs/lerobot/il_robots#run-inference-and-evaluate-your-policy) for details.\n",
"\n",
"Recently ```lerobot-rollout``` was introduced, you can [read more about it here](https://huggingface.co/docs/lerobot/main/en/il_robots?eval=Base+mode+%28no+recording%29#run-inference-and-evaluate-your-policy)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print_cmd(\n",
" \"lerobot-rollout\",\n",
" \"--strategy.type=base\",\n",
" f\"--policy.path={POLICY_PATH}\",\n",
" f\"--robot.type={ROBOT_TYPE}\",\n",
" f\"--robot.port={ROBOT_PORT}\",\n",
" CAMERAS_FLAG,\n",
" f'--task=\"{TASK_DESCRIPTION}\"',\n",
" \"--duration=60\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"if you are using the V0.5.1 release you should use ```lerobot-record``` instead of rollout"
"See the [inference docs](https://huggingface.co/docs/lerobot/il_robots#run-inference-and-evaluate-your-policy) for details."
]
},
{
-136
View File
@@ -1,136 +0,0 @@
# OMX Follower — Cube Pick And Place Example
This is an example of what is possible to do with LeRobot on a physical setup.
It is a WIP and being used internally at LeRobot and specific to our setup, but we hope it can be a useful reference for how to use LeRobot APIs and CLIs.
It includes an end-to-end example for the **OMX Follower** robot arm: pick and place a cube dataset, train a policy, and deploy it autonomously.
## Hardware
| Component | Value |
| --------- | ------------------------------------ |
| Robot | OMX Follower |
| Cameras | 2× OpenCV cameras (wrist + top-down) |
## Scripts
| Script | Purpose |
| ---------------------- | --------------------------------------------------------------- |
| `reset_environment.py` | Standalone utility: sweep workspace, grab cube, place cube |
| `record_grab.py` | Automated data collection: reset → place → record grab episodes |
## Setup
Make sure you have LeRobot installed in your env. (See [the installation guide](https://huggingface.co/docs/lerobot/installation))
Next, we will declare some environment variables for convenience. Adjust the camera indices and robot port to match your system configuration.
```bash
export ROBOT_PORT=/dev/ttyACM0
export TELEOP_PORT=/dev/ttyACM1
export HF_USERNAME=<your_hf_username>
export ROBOT_CAMERAS="{ wrist: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30, fourcc: MJPG}, top: {type: opencv, index_or_path: 2, width: 640, height: 480, fps: 30, fourcc: MJPG} }"
```
## Step 1 — Collect Data
```bash
lerobot-record \
--robot.type=omx_follower \
--robot.port=$ROBOT_PORT \
--robot.id=omx_follower \
--robot.cameras="$ROBOT_CAMERAS" \
--teleop.type=omx_leader \
--teleop.port=$TELEOP_PORT \
--teleop.id=omx_leader \
--dataset.repo_id=$HF_USERNAME/omx_pickandplace \
--dataset.root=data/omx_pickandplace \
--dataset.num_episodes=50 \
--dataset.single_task="Pick the cube and place it in the blue square" \
--dataset.streaming_encoding=true \
--dataset.push_to_hub=true
```
### Bonus Auto-Collect script
/!\ This is specific to our setup and the task of picking and placing a cube. It is not a general-purpose data collection script. As you may notice, it doesn't require a teleop.
```bash
python -m examples.omx.record_grab \
--robot.type=omx_follower \
--robot.port=$ROBOT_PORT \
--robot.id=omx_follower \
--robot.cameras="$ROBOT_CAMERAS" \
--dataset.repo_id=$HF_USERNAME/omx_pickandplace \
--dataset.root=data/omx_pickandplace \
--dataset.num_episodes=50 \
--dataset.single_task="Pick the cube and place it in the blue square" \
--dataset.streaming_encoding=true \
--dataset.push_to_hub=true
```
Each episode:
1. The arm grabs the cube from the center of the workspace and places it at a random position.
2. The arm returns to HOME.
3. A targeted grab is recorded: HOME → approach raised → lower onto cube → grasp → lift → carry → drop → HOME.
A dataset is already available here [`maximellerbach/omx_pickandplace`](https://huggingface.co/datasets/maximellerbach/omx_pickandplace), so you can skip directly to training if you want.
## Step 2 — Train
To train a simple `ACT` policy on the collected dataset, you can use the `lerobot-train` CLI:
```bash
lerobot-train \
--dataset.repo_id=$HF_USERNAME/omx_pickandplace \
--policy.type=act \
--output_dir=outputs/train/omx_pickandplace_act \
--policy.device=cuda \
--policy.repo_id=$HF_USERNAME/omx_pickandplace_act \
--steps=20000 \
--wandb.enable=true
```
A pretrained `ACT` policy is already available here [`maximellerbach/omx_pickandplace_act`](https://huggingface.co/maximellerbach/omx_pickandplace_act).
## Step 3 — Rollout
Use the `lerobot-rollout` CLI with base strategy:
```bash
lerobot-rollout \
--strategy.type=base \
--robot.type=omx_follower \
--robot.port=$ROBOT_PORT \
--robot.id=omx_follower \
--robot.cameras="$ROBOT_CAMERAS" \
--policy.path=$HF_USERNAME/omx_pickandplace_act \
```
For continuous recording with automatic upload (sentry mode):
```bash
lerobot-rollout \
--strategy.type=sentry \
--strategy.upload_every_n_episodes=10 \
--robot.type=omx_follower \
--robot.port=$ROBOT_PORT \
--robot.id=omx_follower \
--robot.cameras="$ROBOT_CAMERAS" \
--policy.path=$HF_USERNAME/omx_pickandplace_act \
--dataset.repo_id=$HF_USERNAME/rollout_omx_pickandplace_act \
```
## Environment Reset Utility
Those are specific to this particular physical setup. Those are scripts that execute hardcoded sequences of actions on the robot to reset the environment, which is useful for data collection and evaluation. They are not general-purpose scripts.
`reset_environment.py` can be run standalone to prepare the workspace:
```bash
# Grab cube + place it at a random position on the left side
python -m examples.omx.reset_environment --port $ROBOT_PORT --mode grab_and_place
```
It also exposes `grab_cube(robot)` and `place_cube(robot)` for use in custom scripts.
-422
View File
@@ -1,422 +0,0 @@
#!/usr/bin/env python3
"""
Auto-record grab episodes for the OMX robot arm.
Each episode cycle:
1. grab_and_place — grab cube from workspace center and place at a random (pan, reach) position
2. HOME — return arm to home with gripper open
3. record_grab — execute a targeted grab to the stored position while recording
observations + actions to a LeRobotDataset
Usage (run from repo root):
python -m examples.omx.record_grab \\
--robot.type=omx_follower \\
--robot.port=/dev/ttyACM0 \\
--robot.id=omx_follower \\
--robot.cameras="{ wrist: {type: opencv, index_or_path: 6, width: 640, height: 480, fps: 30, fourcc: MJPG}, top: {type: opencv, index_or_path: 4, width: 640, height: 480, fps: 30, fourcc: MJPG} }" \\
--dataset.repo_id=<hf_username>/<dataset_name> \\
--dataset.root=data/omx_grab \\
--dataset.num_episodes=50 \\
--dataset.single_task="Grab the cube" \\
--dataset.streaming_encoding=true
"""
import logging
from dataclasses import dataclass
from pprint import pformat
import numpy as np
from lerobot.cameras import CameraConfig # noqa: F401
from lerobot.cameras.opencv import OpenCVCameraConfig # noqa: F401
from lerobot.configs import parser
from lerobot.configs.dataset import DatasetRecordConfig
from lerobot.datasets import (
LeRobotDataset,
VideoEncodingManager,
aggregate_pipeline_dataset_features,
create_initial_features,
)
from lerobot.processor import make_default_processors
from lerobot.robots import RobotConfig, make_robot_from_config
from lerobot.robots.omx_follower import OmxFollower
from lerobot.utils.constants import ACTION, OBS_STR
from lerobot.utils.feature_utils import build_dataset_frame, combine_feature_dicts
from lerobot.utils.robot_utils import precise_sleep
from .reset_environment import (
APPROACH_SPEED,
GRIPPER_CLOSE_POS,
HOME_POSE,
PUSH_END_ELBOW_FLEX,
PUSH_END_SHOULDER_LIFT,
PUSH_START_ELBOW_FLEX,
PUSH_START_SHOULDER_LIFT,
array_to_pose,
grab_cube,
horizontal_wrist_flex,
move_to_pose,
place_cube,
pose_to_array,
)
# ── Grab-episode motion parameters ────────────────────────────────────────────
# Shoulder-lift offset for the raised approach phase (subtracted from the target sl, arm is higher).
GRAB_RAISE_SL_OFFSET = 20.0
GRAB_LOWER_SPEED = 20.0
RECORD_SPEED = 30.0
# Pose the arm travels to after closing the gripper (cube held).
GRAB_CARRY_POSE = {
"shoulder_pan.pos": -23.0,
"shoulder_lift.pos": 5.0,
"elbow_flex.pos": 18.0,
"wrist_flex.pos": -14.0,
"wrist_roll.pos": 0.0,
"gripper.pos": GRIPPER_CLOSE_POS,
}
# Per-joint jitter limits (degrees) applied to transit waypoints for human-like variation.
# Cube-approach and carry poses are never jittered to preserve precision.
_JITTER_LIMITS: dict[str, float] = {
"shoulder_pan.pos": 5.0,
"shoulder_lift.pos": 4.0,
"elbow_flex.pos": 4.0,
"wrist_flex.pos": 3.0,
"wrist_roll.pos": 2.0,
"gripper.pos": 0.0,
}
def _jitter_pose(pose: dict, rng: np.random.Generator) -> dict:
"""Return a copy of pose with independent per-joint random perturbations."""
return {
k: v + rng.uniform(-_JITTER_LIMITS.get(k, 0.0), _JITTER_LIMITS.get(k, 0.0)) for k, v in pose.items()
}
def _random_stuck_pose(rng: np.random.Generator) -> dict:
"""Return a physically plausible stuck pose (failed grasp), gripper closed.
ef bounds are piecewise-linear in sl so the arm stays in a reachable,
table-safe envelope across the full sl range:
sl=-50 → ef ∈ [ 0, 50] (arm raised, can be bent forward)
sl= 0 → ef ∈ [-25, 25] (mid reach)
sl= 30 → ef ∈ [-20, 0] (arm extended, little room to flex)
wrist_flex is randomly offset from the horizontal value.
"""
pan = float(rng.uniform(-5.0, 35.0))
sl = float(rng.uniform(-50.0, 30.0))
if sl <= 0.0:
alpha = (sl + 50.0) / 50.0 # 0 at sl=-50, 1 at sl=0
ef_lo = alpha * -25.0 # 0 → -25
ef_hi = 50.0 + alpha * -25.0 # 50 → 25
else:
alpha = sl / 30.0 # 0 at sl=0, 1 at sl=30
ef_lo = -25.0 + alpha * 5.0 # -25 → -20
ef_hi = 25.0 + alpha * -25.0 # 25 → 0
ef = float(rng.uniform(ef_lo, ef_hi))
wf = horizontal_wrist_flex(sl, ef) + float(rng.uniform(-15.0, 15.0))
return {
"shoulder_pan.pos": pan,
"shoulder_lift.pos": sl,
"elbow_flex.pos": ef,
"wrist_flex.pos": wf,
"wrist_roll.pos": float(rng.uniform(-15.0, 15.0)),
"gripper.pos": GRIPPER_CLOSE_POS,
}
logger = logging.getLogger(__name__)
@dataclass
class OmxRecordGrabConfig:
robot: RobotConfig
dataset: DatasetRecordConfig
# Resume recording on an existing dataset.
resume: bool = False
# Fraction of episodes that start from a random stuck pose (gripper closed) to
# generate recovery data. 0.0 = disabled, 1.0 = all episodes are recovery starts.
recovery_prob: float = 0.5
def record_episode_spline(
robot: OmxFollower,
waypoints: list[dict],
speeds: list[float],
dataset: LeRobotDataset,
task: str,
) -> None:
"""Execute a Catmull-Rom-style spline through waypoints, recording each frame.
Segment durations are parameterized from the maximum absolute joint delta
between consecutive waypoints divided by the requested segment speed,
producing non-uniform timing in joint space. Interior tangents are derived
from the adjacent per-segment velocities, with clamped (zero-velocity)
endpoints so the arm starts and stops smoothly. Each segment is cubic
Hermite, giving C1 continuity at every waypoint.
"""
pts = [pose_to_array(w) for w in waypoints]
n = len(pts)
# Steps and duration per segment
n_steps_list = []
timestamps = []
for i in range(n - 1):
max_dist = float(np.max(np.abs(pts[i + 1] - pts[i])))
ns = max(1, int(max_dist / speeds[i] * dataset.fps)) if max_dist >= 0.5 else 0
n_steps_list.append(ns)
timestamps.append(ns / dataset.fps)
# Velocity tangents (deg/sec) — clamped at endpoints, Catmull-Rom for interior
vels = [np.zeros_like(pts[0])]
for i in range(1, n - 1):
v_prev = (pts[i] - pts[i - 1]) / timestamps[i - 1] if timestamps[i - 1] > 0 else np.zeros_like(pts[0])
v_next = (pts[i + 1] - pts[i]) / timestamps[i] if timestamps[i] > 0 else np.zeros_like(pts[0])
vels.append(0.5 * (v_prev + v_next))
vels.append(np.zeros_like(pts[0]))
dt = 1.0 / dataset.fps
for seg in range(n - 1):
ns = n_steps_list[seg]
if ns == 0:
continue
p0, p1 = pts[seg], pts[seg + 1]
# Scale velocity (deg/sec) to t-space tangent (deg/t-unit, where t: 0→1 over ns steps)
m0 = vels[seg] * timestamps[seg]
m1 = vels[seg + 1] * timestamps[seg]
for step in range(1, ns + 1):
t = step / ns
h00 = 2 * t**3 - 3 * t**2 + 1
h10 = t**3 - 2 * t**2 + t
h01 = -2 * t**3 + 3 * t**2
h11 = t**3 - t**2
commanded = h00 * p0 + h10 * m0 + h01 * p1 + h11 * m1
action = array_to_pose(commanded)
robot.send_action(action)
obs = robot.get_observation()
obs_frame = build_dataset_frame(dataset.features, obs, prefix=OBS_STR)
action_frame = build_dataset_frame(dataset.features, action, prefix=ACTION)
dataset.add_frame({**obs_frame, **action_frame, "task": task})
precise_sleep(dt)
def record_grab_episode(
robot: OmxFollower,
dataset: LeRobotDataset,
pan: float,
t: float,
task: str,
recovery_start: bool = False,
) -> None:
"""Execute a targeted grab to the stored (pan, t) position, recording every frame.
Normal sequence (initial HOME move is NOT recorded):
HOME → raised approach above cube → lower → close gripper
→ raise [jittered] → retract [jittered] → GRAB_CARRY_POSE → drop → HOME
Recovery sequence (recovery_start=True): arm is moved to a random stuck pose
(gripper closed) without recording, then recording begins from there:
stuck_pose → raised approach above cube → [normal grab sequence from there]
All segments are joined by a Catmull-Rom spline (C1-continuous velocities).
"""
sl = PUSH_START_SHOULDER_LIFT + t * (PUSH_END_SHOULDER_LIFT - PUSH_START_SHOULDER_LIFT)
ef = PUSH_START_ELBOW_FLEX + t * (PUSH_END_ELBOW_FLEX - PUSH_START_ELBOW_FLEX)
sl_raised = sl - GRAB_RAISE_SL_OFFSET
wf_horizontal = horizontal_wrist_flex(sl, ef)
rng = np.random.default_rng()
if recovery_start:
stuck_pose = _random_stuck_pose(rng)
logger.info(f"Recovery start: {stuck_pose}")
move_to_pose(robot, stuck_pose, APPROACH_SPEED)
first_waypoints = [stuck_pose]
first_speeds = []
else:
jittery_start = _jitter_pose(HOME_POSE, rng)
move_to_pose(robot, jittery_start, APPROACH_SPEED)
first_waypoints = [jittery_start]
first_speeds = []
waypoints = first_waypoints + [
{ # raised approach: arm above cube
"shoulder_pan.pos": pan,
"shoulder_lift.pos": sl_raised,
"elbow_flex.pos": ef,
"wrist_flex.pos": horizontal_wrist_flex(sl_raised, ef),
"wrist_roll.pos": 0.0,
"gripper.pos": 60.0,
},
{ # lower onto cube — no jitter: precision needed
"shoulder_pan.pos": pan,
"shoulder_lift.pos": sl,
"elbow_flex.pos": ef,
"wrist_flex.pos": wf_horizontal,
"wrist_roll.pos": 0.0,
"gripper.pos": 60.0,
},
{ # close gripper — no jitter: precision needed
"shoulder_pan.pos": pan,
"shoulder_lift.pos": sl,
"elbow_flex.pos": ef,
"wrist_flex.pos": wf_horizontal,
"wrist_roll.pos": 0.0,
"gripper.pos": GRIPPER_CLOSE_POS,
},
_jitter_pose(
{ # raise with cube
"shoulder_pan.pos": pan,
"shoulder_lift.pos": sl_raised,
"elbow_flex.pos": ef,
"wrist_flex.pos": horizontal_wrist_flex(sl_raised, ef),
"wrist_roll.pos": 0.0,
"gripper.pos": GRIPPER_CLOSE_POS,
},
rng,
),
_jitter_pose(
{ # retract: fold arm toward HOME before sweeping to carry zone
"shoulder_pan.pos": pan * 0.25,
"shoulder_lift.pos": HOME_POSE["shoulder_lift.pos"] + 5.0,
"elbow_flex.pos": HOME_POSE["elbow_flex.pos"] - 5.0,
"wrist_flex.pos": 0.0,
"wrist_roll.pos": 0.0,
"gripper.pos": GRIPPER_CLOSE_POS,
},
rng,
),
GRAB_CARRY_POSE, # no jitter: target drop zone
{**GRAB_CARRY_POSE, "gripper.pos": 60.0}, # drop cube
HOME_POSE,
]
speeds = first_speeds + [
RECORD_SPEED, # (HOME →) raised approach
GRAB_LOWER_SPEED, # raised approach → lower
GRAB_LOWER_SPEED, # lower → close gripper
RECORD_SPEED, # close gripper → raise
RECORD_SPEED, # raise → retract
RECORD_SPEED, # retract → carry pose
RECORD_SPEED, # carry pose → drop
RECORD_SPEED, # drop → HOME
]
record_episode_spline(robot, waypoints, speeds, dataset, task)
# Dwell at HOME for ~0.5 s before next episode
home_action = build_dataset_frame(dataset.features, HOME_POSE, prefix=ACTION)
dt = 1.0 / dataset.fps
for _ in range(int(dataset.fps * 0.5)):
robot.send_action(HOME_POSE)
obs = robot.get_observation()
obs_frame = build_dataset_frame(dataset.features, obs, prefix=OBS_STR)
dataset.add_frame({**obs_frame, **home_action, "task": task})
precise_sleep(dt)
@parser.wrap()
def record_grab(cfg: OmxRecordGrabConfig) -> LeRobotDataset:
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger.info(pformat(cfg))
robot = make_robot_from_config(cfg.robot)
use_videos = cfg.dataset.video
teleop_action_processor, _, robot_obs_processor = make_default_processors()
dataset_features = combine_feature_dicts(
aggregate_pipeline_dataset_features(
pipeline=teleop_action_processor,
initial_features=create_initial_features(action=robot.action_features),
use_videos=use_videos,
),
aggregate_pipeline_dataset_features(
pipeline=robot_obs_processor,
initial_features=create_initial_features(observation=robot.observation_features),
use_videos=use_videos,
),
)
num_cameras = len(robot.cameras) if hasattr(robot, "cameras") else 0
dataset = None
try:
if cfg.resume:
dataset = LeRobotDataset.resume(
cfg.dataset.repo_id,
root=cfg.dataset.root,
streaming_encoding=cfg.dataset.streaming_encoding,
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
vcodec=cfg.dataset.vcodec,
encoder_threads=cfg.dataset.encoder_threads,
image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras
if num_cameras > 0
else 0,
)
else:
cfg.dataset.stamp_repo_id()
dataset = LeRobotDataset.create(
cfg.dataset.repo_id,
cfg.dataset.fps,
root=cfg.dataset.root,
robot_type=robot.name,
features=dataset_features,
use_videos=use_videos,
streaming_encoding=cfg.dataset.streaming_encoding,
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
vcodec=cfg.dataset.vcodec,
encoder_threads=cfg.dataset.encoder_threads,
image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras
if num_cameras > 0
else 0,
)
robot.connect(calibrate=True)
rng = np.random.default_rng()
with VideoEncodingManager(dataset):
for episode_idx in range(cfg.dataset.num_episodes):
logger.info(f"=== Episode {episode_idx + 1}/{cfg.dataset.num_episodes} ===")
logger.info("Step 1: grabbing and placing cube...")
grab_cube(robot)
pan, t = place_cube(robot)
logger.info(f"Cube placed at pan={pan:.1f}, reach={t:.2f}")
recovery_start = cfg.recovery_prob > 0 and float(rng.random()) < cfg.recovery_prob
logger.info(f"Step 2: recording {'recovery ' if recovery_start else ''}grab episode...")
record_grab_episode(
robot,
dataset,
pan,
t,
cfg.dataset.single_task,
recovery_start=recovery_start,
)
dataset.save_episode()
logger.info(f"Episode {episode_idx + 1} saved.")
finally:
if dataset:
dataset.finalize()
if robot.is_connected:
robot.disconnect()
if cfg.dataset.push_to_hub and dataset and dataset.num_episodes > 0:
dataset.push_to_hub(tags=cfg.dataset.tags, private=cfg.dataset.private)
return dataset
if __name__ == "__main__":
record_grab()
-267
View File
@@ -1,267 +0,0 @@
#!/usr/bin/env python3
"""
Auto-reset and cube-grab utility for the OMX robot arm.
Provides:
- grab_cube(robot): sweep workspace, center cube, close gripper
- place_cube(robot): carry cube to a random position, release
Standalone usage (run from repo root):
python -m examples.omx.reset_environment --port /dev/ttyACM1 --mode grab
python -m examples.omx.reset_environment --port /dev/ttyACM1 --mode grab_and_place
Joint range: -100 to 100 for arm joints; gripper: 50 = closed, 80 = open.
To read current joint values for calibration, add after robot.connect():
obs = robot.get_observation()
print({k: round(obs[k], 1) for k in JOINT_NAMES})
robot.disconnect(); raise SystemExit
Parallel-to-ground IK: wrist_flex = WRIST_HORIZONTAL_OFFSET - shoulder_lift - elbow_flex.
Linear interpolation preserves this constraint between any two poses that satisfy it.
"""
import argparse
import logging
import numpy as np
from lerobot.robots.omx_follower import OmxFollower, OmxFollowerConfig
from lerobot.robots.robot import Robot
from lerobot.utils.robot_utils import precise_sleep
logger = logging.getLogger(__name__)
# ── Poses ─────────────────────────────────────────────────────────────────────
HOME_POSE = {
"shoulder_pan.pos": 0.0,
"shoulder_lift.pos": -50.0,
"elbow_flex.pos": 50.0,
"wrist_flex.pos": 0.0,
"wrist_roll.pos": 0.0,
"gripper.pos": 60.0,
}
SWEEP_WAYPOINTS = [
{
"shoulder_pan.pos": -60.0,
"shoulder_lift.pos": 50.0,
"elbow_flex.pos": -60.0,
"wrist_flex.pos": -20.0,
"wrist_roll.pos": 0.0,
"gripper.pos": 60.0,
},
{
"shoulder_pan.pos": -30.0,
"shoulder_lift.pos": 50.0,
"elbow_flex.pos": -60.0,
"wrist_flex.pos": -5.0,
"wrist_roll.pos": 0.0,
"gripper.pos": 60.0,
},
{
"shoulder_pan.pos": 20.0,
"shoulder_lift.pos": 50.0,
"elbow_flex.pos": -55.0,
"wrist_flex.pos": -5.0,
"wrist_roll.pos": 0.0,
"gripper.pos": 60.0,
},
]
# ── Motion parameters ─────────────────────────────────────────────────────────
CONTROL_HZ = 30
APPROACH_SPEED = 50.0
SWEEP_SPEED = 40.0
# ── Grab-sequence parameters ──────────────────────────────────────────────────
GRAB_PAN = 0.0
SWEEP_LEFT_PAN = -60.0
SWEEP_RIGHT_PAN = 60.0
SWEEP_END_OFFSET = 5.0 # stop before center so the cube isn't pushed past GRAB_PAN
SWEEP_END_PAN_RANGE = (15.0, 20.0)
SWEEP_LOW_SHOULDER_LIFT = 50.0
SWEEP_LOW_ELBOW_FLEX_START = -60.0
SWEEP_LOW_ELBOW_FLEX_END = -55.0
SWEEP_HIGH_WRIST_FLEX = -20.0 # wrist tilted up during high approach to clear obstacles
PUSH_START_SHOULDER_LIFT = 0.0
PUSH_START_ELBOW_FLEX = 45.0
PUSH_END_SHOULDER_LIFT = 50.0
PUSH_END_ELBOW_FLEX = -50.0
# Subtracted from shoulder_lift during the push sweep to clear the platform surface.
# Does not affect the grab-target interpolation in record_grab.py.
PUSH_RAISE_OFFSET = 5.0
WRIST_HORIZONTAL_OFFSET = 0.0 # tune if gripper tilts during push: + tilts nose up, - down
GRIPPER_CLOSE_POS = 50.0
PLACE_LEFT_PAN_RANGE = (5.0, 30.0) # random pan range for cube placement on the left side
PLACE_REACH_RANGE = (0.1, 0.7) # 0 = arm retracted (PUSH_START), 1 = fully extended (PUSH_END)
JOINT_NAMES = [
"shoulder_pan.pos",
"shoulder_lift.pos",
"elbow_flex.pos",
"wrist_flex.pos",
"wrist_roll.pos",
"gripper.pos",
]
# ── Helpers ───────────────────────────────────────────────────────────────────
def pose_to_array(pose: dict) -> np.ndarray:
return np.array([pose[k] for k in JOINT_NAMES])
def array_to_pose(arr: np.ndarray) -> dict:
return {k: float(arr[i]) for i, k in enumerate(JOINT_NAMES)}
def horizontal_wrist_flex(shoulder_lift: float, elbow_flex: float) -> float:
return WRIST_HORIZONTAL_OFFSET - shoulder_lift - elbow_flex
def _low_sweep_pose(pan: float, elbow_flex: float, wrist_flex: float | None = None) -> dict:
sl = SWEEP_LOW_SHOULDER_LIFT
return {
"shoulder_pan.pos": pan,
"shoulder_lift.pos": sl,
"elbow_flex.pos": elbow_flex,
"wrist_flex.pos": horizontal_wrist_flex(sl, elbow_flex) if wrist_flex is None else wrist_flex,
"wrist_roll.pos": 0.0,
"gripper.pos": 60.0,
}
def _high_sweep_pose(pan: float) -> dict:
return {**HOME_POSE, "shoulder_pan.pos": pan, "wrist_flex.pos": SWEEP_HIGH_WRIST_FLEX}
def _push_pose(shoulder_lift: float, elbow_flex: float, pan: float = GRAB_PAN, gripper: float = 70.0) -> dict:
return {
"shoulder_pan.pos": pan,
"shoulder_lift.pos": shoulder_lift,
"elbow_flex.pos": elbow_flex,
"wrist_flex.pos": horizontal_wrist_flex(shoulder_lift, elbow_flex),
"wrist_roll.pos": 0.0,
"gripper.pos": gripper,
}
def move_to_pose(robot: Robot, target: dict, speed: float) -> None:
"""Interpolate from current position to target at the given speed (units/s)."""
obs = robot.get_observation()
current = np.array([obs[k] for k in JOINT_NAMES])
goal = pose_to_array(target)
max_distance = float(np.max(np.abs(goal - current)))
if max_distance < 0.5:
return
n_steps = max(1, int(max_distance / speed * CONTROL_HZ))
dt = 1.0 / CONTROL_HZ
for step in range(1, n_steps + 1):
t = step / n_steps
robot.send_action(array_to_pose(current + t * (goal - current)))
precise_sleep(dt)
# ── Sequences ─────────────────────────────────────────────────────────────────
def grab_cube(robot: Robot) -> None:
"""Left sweep → right sweep → extend arm parallel to ground → close gripper."""
move_to_pose(robot, HOME_POSE, APPROACH_SPEED)
for pan, end_pan in [
(SWEEP_LEFT_PAN, GRAB_PAN - SWEEP_END_OFFSET),
(SWEEP_RIGHT_PAN, GRAB_PAN + SWEEP_END_OFFSET),
]:
logger.info(f"Sweeping {'left' if pan < 0 else 'right'} → center...")
move_to_pose(robot, _high_sweep_pose(pan), APPROACH_SPEED)
move_to_pose(
robot, _low_sweep_pose(pan, SWEEP_LOW_ELBOW_FLEX_START, wrist_flex=-20.0), APPROACH_SPEED
)
move_to_pose(robot, _low_sweep_pose(end_pan, SWEEP_LOW_ELBOW_FLEX_END, wrist_flex=0.0), SWEEP_SPEED)
move_to_pose(robot, HOME_POSE, APPROACH_SPEED)
logger.info("Extending to push cube into gripper...")
move_to_pose(
robot,
_push_pose(PUSH_START_SHOULDER_LIFT - PUSH_RAISE_OFFSET, PUSH_START_ELBOW_FLEX),
APPROACH_SPEED,
)
move_to_pose(
robot,
_push_pose(PUSH_END_SHOULDER_LIFT - PUSH_RAISE_OFFSET, PUSH_END_ELBOW_FLEX),
SWEEP_SPEED,
)
logger.info("Closing gripper...")
move_to_pose(
robot,
_push_pose(PUSH_END_SHOULDER_LIFT, PUSH_END_ELBOW_FLEX, gripper=GRIPPER_CLOSE_POS),
APPROACH_SPEED,
)
logger.info("Grab complete.")
def place_cube(robot: Robot) -> tuple[float, float]:
"""Carry the cube (gripper closed) to a random position on the left side, then release.
Returns:
(pan, t): pan angle and reach scalar [0, 1] of the placement position.
"""
pan = float(np.random.uniform(*PLACE_LEFT_PAN_RANGE))
t = float(np.random.uniform(*PLACE_REACH_RANGE))
sl = PUSH_START_SHOULDER_LIFT + t * (PUSH_END_SHOULDER_LIFT - PUSH_START_SHOULDER_LIFT)
ef = PUSH_START_ELBOW_FLEX + t * (PUSH_END_ELBOW_FLEX - PUSH_START_ELBOW_FLEX)
logger.info(f"Placing cube at pan={pan:.1f}, reach={t:.2f}...")
move_to_pose(robot, {**HOME_POSE, "gripper.pos": GRIPPER_CLOSE_POS}, APPROACH_SPEED)
move_to_pose(
robot, {**HOME_POSE, "shoulder_pan.pos": pan, "gripper.pos": GRIPPER_CLOSE_POS}, APPROACH_SPEED
)
move_to_pose(robot, _push_pose(sl, ef, pan=pan, gripper=GRIPPER_CLOSE_POS), APPROACH_SPEED)
move_to_pose(robot, _push_pose(sl, ef, pan=pan, gripper=80.0), APPROACH_SPEED)
move_to_pose(robot, HOME_POSE, APPROACH_SPEED)
logger.info("Place complete.")
return pan, t
# ── Entry point ───────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="OMX arm reset / grab script")
parser.add_argument("--port", default="/dev/ttyACM1")
parser.add_argument("--robot_id", default="omx_follower")
parser.add_argument("--mode", choices=["grab", "grab_and_place"], default="grab_and_place")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
robot = OmxFollower(OmxFollowerConfig(port=args.port, id=args.robot_id))
robot.connect(calibrate=True)
try:
if args.mode == "grab":
grab_cube(robot)
elif args.mode == "grab_and_place":
grab_cube(robot)
place_cube(robot)
finally:
robot.disconnect()
if __name__ == "__main__":
main()
-115
View File
@@ -1,115 +0,0 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example manifest for `lerobot-policy-server --manifest server.yaml`.
#
# One process = one (model, revision, dtype, device) on one GPU. Dynamic
# model loading is deliberately unsupported: pre-warmed processes keep
# capacity planning honest. Every field below can also be overridden on
# the command line via draccus, e.g. --model.repo_or_path=... or
# --zenoh.connect_endpoints='["tcp/other-router:7447"]'.
#
# Field names mirror the dataclasses in src/lerobot/policy_server/manifest.py.
# --- Which policy this process serves, and where it runs ------------------
model:
# Hub repo id (org/name) or a local checkpoint directory. Required.
repo_or_path: lerobot/pi0_towels
# Hub revision: branch, tag, or commit sha.
revision: main
# Optional torch dtype cast applied after load (e.g. "bfloat16",
# "float16"). null keeps the checkpoint's native dtype.
dtype: bfloat16
# Inference device, e.g. "cuda", "cuda:1", "cpu".
device: cuda
# --- Task namespace --------------------------------------------------------
# The task this service is published under. VLA clients may override the
# task per session unless `pin_task` is true, in which case session opens
# with a different task string are rejected.
default_task: "fold the towel"
pin_task: false
# Optional override for the <task_slug> key segment of the Zenoh prefix
# (defaults to a slug of `default_task`).
service_name: ""
# --- Serving mode & capacity ------------------------------------------------
# "auto" resolves from the policy classification: shared for verified
# chunk-stateless policies (act/pi0/pi05, smolvla with n_obs_steps=1),
# exclusive otherwise. Chunk-stateful policies — e.g. diffusion, whose
# predict_action_chunk reads select_action-fed queues — are always forced
# to "exclusive" (max_sessions=1); "shared" cannot override that.
serving_mode: auto
# Capacity rule-of-thumb: with t = server seconds per inference, r = each
# client's request rate (self-clocked to ~1-4 Hz, not the control rate),
# H = RTC execution horizon, and dt = control period:
# max_sessions ~= min( 0.8 / (r*t), (H*dt/2 - network RTT) / t )
# e.g. ACT @ 20 ms, 1 Hz refresh -> ~40 clients/GPU; Pi0 @ 150 ms -> ~5.
# Session opens beyond this are rejected with the current load in the
# reply, so clients retry another replica.
max_sessions: 5
# Dummy inferences run at startup so the first real request does not pay
# for CUDA graph/kernel warmup.
warmup_inferences: 2
# --- FPS contract -----------------------------------------------------------
# Control rate the policy was trained at. Clients reporting a different
# fps get a warning — or a hard reject when `strict_fps` is true.
trained_fps: 30.0
strict_fps: false
# --- Real Time Chunking (RTC) -----------------------------------------------
# Global to this process: init_rtc_processor mutates the policy instance,
# so RTC is a per-process decision, not per-session. Only rtc-capable
# families (pi0/pi05/smolvla) honor it; others are downgraded to plain
# chunk-append at session open.
rtc:
enabled: true
# Number of actions executed from each chunk before the next chunk is
# blended in (the H in the capacity formula above).
execution_horizon: 10
# --- Housekeeping ------------------------------------------------------------
# Sessions with no liveliness token and no traffic for this long are
# garbage-collected (belt-and-braces behind liveliness GC).
session_idle_timeout_s: 300.0
# --- Transport ----------------------------------------------------------------
# Robots and servers both *dial out* to a zenohd router in production
# (mode: client). mode: peer + listen_endpoints supports router-less LAN
# and loopback test deployments. Multicast scouting is always disabled:
# fleet discovery is configuration, not protocol magic.
zenoh:
mode: client
connect_endpoints:
- tcp/router.gpu-cluster.internal:7447
listen_endpoints: []
# mTLS material (PEM paths). All three are required for tls/ endpoints;
# leave them null for plain tcp/ inside a trusted network.
# tls_root_ca_certificate: /etc/lerobot/tls/ca.pem
# tls_connect_certificate: /etc/lerobot/tls/server.pem
# tls_connect_private_key: /etc/lerobot/tls/server.key
# Escape hatch: raw JSON5 merged into the zenoh config last.
# extra_config_json5: '{transport: {link: {tx: {queue: {size: {data: 4}}}}}}'
# --- Observability -------------------------------------------------------------
# HTTP health + Prometheus metrics port; 0 disables the endpoint.
health_port: 9100
# Optional bounded request/response capture for offline replay.
debug:
capture_dir: null
capture_max: 256
@@ -0,0 +1,17 @@
from lerobot.async_inference.configs import PolicyServerConfig
from lerobot.async_inference.policy_server import serve
def main():
host = ... # something like "127.0.0.1" if you're exposing to localhost
port = ... # something like 8080
config = PolicyServerConfig(
host=host,
port=port,
)
serve(config)
if __name__ == "__main__":
main()
@@ -0,0 +1,62 @@
import threading
from lerobot.async_inference.configs import RobotClientConfig
from lerobot.async_inference.helpers import visualize_action_queue_size
from lerobot.async_inference.robot_client import RobotClient
from lerobot.cameras.opencv import OpenCVCameraConfig
from lerobot.robots.so_follower import SO100FollowerConfig
def main():
# these cameras must match the ones expected by the policy - find your cameras with lerobot-find-cameras
# check the config.json on the Hub for the policy you are using to see the expected camera specs
camera_cfg = {
"up": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=30),
"side": OpenCVCameraConfig(index_or_path=1, width=640, height=480, fps=30),
}
# # find ports using lerobot-find-port
follower_port = ... # something like "/dev/tty.usbmodem58760431631"
# # the robot ids are used the load the right calibration files
follower_id = ... # something like "follower_so100"
robot_cfg = SO100FollowerConfig(port=follower_port, id=follower_id, cameras=camera_cfg)
server_address = ... # something like "127.0.0.1:8080" if using localhost
# 3. Create client configuration
client_cfg = RobotClientConfig(
robot=robot_cfg,
server_address=server_address,
policy_device="mps",
client_device="cpu",
policy_type="act",
pretrained_name_or_path="<user>/robot_learning_tutorial_act",
chunk_size_threshold=0.5, # g
actions_per_chunk=50, # make sure this is less than the max actions of the policy
)
# 4. Create and start client
client = RobotClient(client_cfg)
# 5. Provide a textual description of the task
task = ...
if client.start():
# Start action receiver thread
action_receiver_thread = threading.Thread(target=client.receive_actions, daemon=True)
action_receiver_thread.start()
try:
# Run the control loop
client.control_loop(task)
except KeyboardInterrupt:
client.stop()
action_receiver_thread.join()
# (Optionally) plot the action queue size
visualize_action_queue_size(client.action_queue_size)
if __name__ == "__main__":
main()
+22 -25
View File
@@ -4,13 +4,13 @@ from pathlib import Path
from queue import Empty, Full
import torch
import torch.optim as optim
from lerobot.datasets import LeRobotDataset
from lerobot.envs.configs import HILSerlProcessorConfig, HILSerlRobotEnvConfig
from lerobot.policies import GaussianActorConfig
from lerobot.policies.gaussian_actor.modeling_gaussian_actor import GaussianActorPolicy
from lerobot.rewards.classifier.modeling_classifier import Classifier
from lerobot.rl.algorithms.sac import SACAlgorithm, SACAlgorithmConfig
from lerobot.policies import SACConfig
from lerobot.policies.sac.modeling_sac import SACPolicy
from lerobot.policies.sac.reward_model.modeling_classifier import Classifier
from lerobot.rl.buffer import ReplayBuffer
from lerobot.rl.gym_manipulator import make_robot_env
from lerobot.robots.so_follower import SO100FollowerConfig
@@ -28,7 +28,7 @@ def run_learner(
transitions_queue: mp.Queue,
parameters_queue: mp.Queue,
shutdown_event: mp.Event,
policy_learner: GaussianActorPolicy,
policy_learner: SACPolicy,
online_buffer: ReplayBuffer,
offline_buffer: ReplayBuffer,
lr: float = 3e-4,
@@ -40,9 +40,8 @@ def run_learner(
policy_learner.train()
policy_learner.to(device)
algo_config = SACAlgorithmConfig.from_policy_config(policy_learner.config)
algorithm = SACAlgorithm(policy=policy_learner, config=algo_config)
algorithm.make_optimizers_and_scheduler()
# Create Adam optimizer from scratch - simple and clean
optimizer = optim.Adam(policy_learner.parameters(), lr=lr)
print(f"[LEARNER] Online buffer capacity: {online_buffer.capacity}")
print(f"[LEARNER] Offline buffer capacity: {offline_buffer.capacity}")
@@ -84,26 +83,24 @@ def run_learner(
else:
batch[key] = online_batch[key]
def batch_iter(b=batch):
while True:
yield b
loss, _ = policy_learner.forward(batch)
stats = algorithm.update(batch_iter())
optimizer.zero_grad()
loss.backward()
optimizer.step()
training_step += 1
if training_step % LOG_EVERY == 0:
log_dict = stats.to_log_dict()
print(
f"[LEARNER] Training step {training_step}, "
f"critic_loss: {log_dict.get('critic', 'N/A'):.4f}, "
f"[LEARNER] Training step {training_step}, Loss: {loss.item():.4f}, "
f"Buffers: Online={len(online_buffer)}, Offline={len(offline_buffer)}"
)
# Send updated parameters to actor every 10 training steps
if training_step % SEND_EVERY == 0:
try:
weights = algorithm.get_weights()
parameters_queue.put_nowait(weights)
state_dict = {k: v.cpu() for k, v in policy_learner.state_dict().items()}
parameters_queue.put_nowait(state_dict)
print("[LEARNER] Sent updated parameters to actor")
except Full:
# Missing write due to queue not being consumed (should happen rarely)
@@ -116,7 +113,7 @@ def run_actor(
transitions_queue: mp.Queue,
parameters_queue: mp.Queue,
shutdown_event: mp.Event,
policy_actor: GaussianActorPolicy,
policy_actor: SACPolicy,
reward_classifier: Classifier,
env_cfg: HILSerlRobotEnvConfig,
device: torch.device = "mps",
@@ -147,15 +144,15 @@ def run_actor(
while step < MAX_STEPS_PER_EPISODE and not shutdown_event.is_set():
try:
new_weights = parameters_queue.get_nowait()
policy_actor.load_state_dict(new_weights)
new_params = parameters_queue.get_nowait()
policy_actor.load_state_dict(new_params)
print("[ACTOR] Updated policy parameters from learner")
except Empty: # No new updated parameters available from learner, waiting
pass
# Get action from policy (returns full action: continuous + discrete)
# Get action from policy
policy_obs = make_policy_obs(obs, device=device)
action_tensor = policy_actor.select_action(policy_obs)
action_tensor = policy_actor.select_action(policy_obs) # predicts a single action
action = action_tensor.squeeze(0).cpu().numpy()
# Step environment
@@ -264,14 +261,14 @@ def main():
action_features = hw_to_dataset_features(env.robot.action_features, "action")
# Create SAC policy for action selection
policy_cfg = GaussianActorConfig(
policy_cfg = SACConfig(
device=device,
input_features=obs_features,
output_features=action_features,
)
policy_actor = GaussianActorPolicy(policy_cfg)
policy_learner = GaussianActorPolicy(policy_cfg)
policy_actor = SACPolicy(policy_cfg)
policy_learner = SACPolicy(policy_cfg)
demonstrations_repo_id = "lerobot/example_hil_serl_dataset"
offline_dataset = LeRobotDataset(repo_id=demonstrations_repo_id)
@@ -1,7 +1,7 @@
import torch
from lerobot.datasets import LeRobotDataset
from lerobot.rewards import RewardClassifierConfig, make_reward_model, make_reward_pre_post_processors
from lerobot.policies import RewardClassifierConfig, make_policy, make_pre_post_processors
def main():
@@ -22,10 +22,10 @@ def main():
model_name="microsoft/resnet-18",
)
# Make reward model, preprocessor, and optimizer
reward_model = make_reward_model(config, dataset_stats=dataset.meta.stats)
optimizer = config.get_optimizer_preset().build(reward_model.parameters())
preprocessor, _ = make_reward_pre_post_processors(config, dataset_stats=dataset.meta.stats)
# Make policy, preprocessor, and optimizer
policy = make_policy(config, ds_meta=dataset.meta)
optimizer = config.get_optimizer_preset().build(policy.parameters())
preprocessor, _ = make_pre_post_processors(policy_cfg=config, dataset_stats=dataset.meta.stats)
classifier_id = "<user>/reward_classifier_hil_serl_example"
@@ -42,7 +42,7 @@ def main():
batch = preprocessor(batch)
# Forward pass
loss, output_dict = reward_model.forward(batch)
loss, output_dict = policy.forward(batch)
# Backward pass and optimization
optimizer.zero_grad()
@@ -58,8 +58,8 @@ def main():
print("Training finished!")
# You can now save the trained reward model.
reward_model.push_to_hub(classifier_id)
# You can now save the trained policy.
policy.push_to_hub(classifier_id)
if __name__ == "__main__":
+22 -71
View File
@@ -59,8 +59,8 @@ keywords = ["lerobot", "huggingface", "robotics", "machine learning", "artifici
dependencies = [
# Core ML
"torch>=2.7,<2.12.0",
"torchvision>=0.22.0,<0.27.0",
"torch>=2.7,<2.11.0",
"torchvision>=0.22.0,<0.26.0",
"numpy>=2.0.0,<2.3.0", # NOTE: Explicitly listing numpy helps the resolver converge faster. Upper bound imposed by opencv-python-headless.
"opencv-python-headless>=4.9.0,<4.14.0",
"Pillow>=10.0.0,<13.0.0",
@@ -95,28 +95,17 @@ dependencies = [
# ── Feature-scoped extras ──────────────────────────────────
dataset = [
"datasets>=4.7.0,<5.0.0",
"datasets>=4.0.0,<5.0.0",
"pandas>=2.0.0,<3.0.0", # NOTE: Transitive dependency of datasets
"pyarrow>=21.0.0,<30.0.0", # NOTE: Transitive dependency of datasets
"lerobot[av-dep]",
# NOTE: torchcodec wheel availability matrix (PyPI):
# - linux x86_64/amd64 + macOS arm64 : wheels since 0.3.0 (the historic supported set).
# - win32 x86_64 : wheels since 0.7.0 (needs torch>=2.8).
# - linux aarch64/arm64 : wheels since 0.11.0 (needs torch>=2.11).
# - macOS x86_64 (Intel) and linux armv7l: no wheels in any released version -> fall through to the PyAV decoder.
# Each platform gets its own line so the resolver picks the minimum version that has a wheel for it.
# Other torch/torchcodec pairings (informational): 0.8.1 = ffmpeg>=8 support, 0.10 = system-wide ffmpeg support, 0.12 needs torch==2.12.
"torchcodec>=0.3.0,<0.12.0; (sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'AMD64')) or (sys_platform == 'darwin' and platform_machine == 'arm64')",
"torchcodec>=0.7.0,<0.12.0; sys_platform == 'win32'",
"torchcodec>=0.11.0,<0.12.0; sys_platform == 'linux' and (platform_machine == 'aarch64' or platform_machine == 'arm64')",
"torchcodec>=0.3.0,<0.11.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # NOTE: Windows support starts at version 0.7 (needs torch==2.8), ffmpeg>=8 support starts at version 0.8.1 (needs torch==2.9), system-wide ffmpeg support starts at version 0.10 (needs torch==2.10).
"jsonlines>=4.0.0,<5.0.0",
]
training = [
"lerobot[dataset]",
"wandb>=0.24.0,<0.28.0",
"lerobot[accelerate-dep]",
"accelerate>=1.10.0,<2.0.0",
"wandb>=0.24.0,<0.25.0",
]
hardware = [
"lerobot[pynput-dep]",
@@ -138,12 +127,9 @@ dataset_viz = ["lerobot[dataset]", "lerobot[viz]"]
# Common
av-dep = ["av>=15.0.0,<16.0.0"]
pygame-dep = ["pygame>=2.5.1,<2.7.0"]
# NOTE: 0.9.16 links against liburdfdom_sensor.so.4, which is unavailable on Ubuntu 24.04
# (noble ships urdfdom 3.x). Cap below 0.9.16 until system urdfdom 4.x is broadly available.
placo-dep = ["placo>=0.9.6,<0.9.16"]
transformers-dep = ["transformers>=5.4.0,<5.6.0"]
grpcio-dep = ["grpcio>=1.73.1,<2.0.0", "protobuf>=6.31.1,<8.0.0"]
accelerate-dep = ["accelerate>=1.14.0,<2.0.0"]
placo-dep = ["placo>=0.9.6,<0.9.17"]
transformers-dep = ["transformers==5.3.0"] # TODO(Steven): https://github.com/huggingface/lerobot/pull/3249
grpcio-dep = ["grpcio==1.73.1", "protobuf>=6.31.1,<6.32.0"]
can-dep = ["python-can>=4.2.0,<5.0.0"]
peft-dep = ["peft>=0.18.0,<1.0.0"]
scipy-dep = ["scipy>=1.14.0,<2.0.0"]
@@ -154,8 +140,6 @@ pyserial-dep = ["pyserial>=3.5,<4.0"]
deepdiff-dep = ["deepdiff>=7.0.1,<9.0.0"]
pynput-dep = ["pynput>=1.7.8,<1.9.0"]
pyzmq-dep = ["pyzmq>=26.2.1,<28.0.0"]
motorbridge-dep = ["motorbridge>=0.3.2,<0.4.0"]
motorbridge-smart-servo-dep = ["motorbridge-smart-servo>=0.0.4,<0.1.0"]
# Motors
feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0", "lerobot[pyserial-dep]", "lerobot[deepdiff-dep]"]
@@ -178,15 +162,7 @@ unitree_g1 = [
"lerobot[matplotlib-dep]",
"lerobot[pygame-dep]",
]
# reachy2-sdk caps grpcio<=1.73.1 and protobuf<=6.32.0; quarantined here so downstream users aren't held back. reachy2-sdk is unlikely to release new versions.
reachy2 = [
"reachy2_sdk>=1.0.15,<1.1.0",
"grpcio<=1.73.1",
"protobuf<=6.32.0",
]
# Seeed Studio reBot B601-DM follower (motorbridge / CAN) + StarArm102 / reBot Arm 102
# leader (motorbridge-smart-servo / FashionStar UART servos).
rebot = ["lerobot[motorbridge-dep]", "lerobot[motorbridge-smart-servo-dep]"]
reachy2 = ["reachy2_sdk>=1.0.15,<1.1.0"]
kinematics = ["lerobot[placo-dep]"]
intelrealsense = [
"pyrealsense2>=2.55.1.6486,<2.57.0 ; sys_platform != 'darwin'",
@@ -204,8 +180,7 @@ wallx = [
"lerobot[qwen-vl-utils-dep]",
]
pi = ["lerobot[transformers-dep]", "lerobot[scipy-dep]"]
molmoact2 = ["lerobot[transformers-dep]", "lerobot[peft-dep]", "lerobot[scipy-dep]"]
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "lerobot[accelerate-dep]"]
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0"]
multi_task_dit = ["lerobot[transformers-dep]", "lerobot[diffusers-dep]"]
groot = [
"lerobot[transformers-dep]",
@@ -218,30 +193,24 @@ groot = [
"flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
]
sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
robometer = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]", "lerobot[peft-dep]"]
topreward = ["lerobot[transformers-dep]"]
xvla = ["lerobot[transformers-dep]"]
eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.14,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
vla_jepa = ["lerobot[transformers-dep]", "lerobot[diffusers-dep]", "lerobot[qwen-vl-utils-dep]"]
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
# Features
# Remote inference over Zenoh: lerobot-policy-server + lerobot-rollout --inference.type=remote.
# Keep zenohd routers on the same minor version as the Python binding.
async = ["eclipse-zenoh>=1.9,<2.0", "msgpack>=1.0.0,<2.0.0"]
async = ["lerobot[grpcio-dep]", "lerobot[matplotlib-dep]"]
peft = ["lerobot[transformers-dep]", "lerobot[peft-dep]"]
# Development
dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools>=1.73.1,<2.0.0", "mypy>=1.19.1", "ruff>=0.14.1", "lerobot[notebook]"]
dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools==1.73.1", "mypy>=1.19.1", "ruff>=0.14.1", "lerobot[notebook]"]
notebook = ["jupyter>=1.0.0,<2.0.0", "ipykernel>=6.0.0,<7.0.0"]
test = ["pytest>=8.1.0,<9.0.0", "pytest-timeout>=2.4.0,<3.0.0", "pytest-cov>=5.0.0,<8.0.0", "mock-serial>=0.0.1,<0.1.0 ; sys_platform != 'win32'"]
video_benchmark = ["scikit-image>=0.23.2,<0.26.0", "pandas>=2.2.2,<2.4.0"]
# Simulation
# NOTE: Explicitly listing scipy helps flatten the dependecy tree.
aloha = ["lerobot[dataset]", "gym-aloha>=0.1.4,<0.2.0", "lerobot[scipy-dep]"]
aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.4,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution
# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI
@@ -279,19 +248,16 @@ all = [
"lerobot[lekiwi]",
"lerobot[openarms]",
"lerobot[reachy2]",
"lerobot[rebot]",
"lerobot[kinematics]",
"lerobot[intelrealsense]",
"lerobot[diffusion]",
"lerobot[multi_task_dit]",
"lerobot[wallx]",
"lerobot[pi]",
"lerobot[molmoact2]",
"lerobot[smolvla]",
# "lerobot[groot]", TODO(Steven): Gr00t requires specific installation instructions for flash-attn
"lerobot[xvla]",
"lerobot[hilserl]",
"lerobot[vla_jepa]",
"lerobot[async]",
"lerobot[dev]",
"lerobot[test]",
@@ -302,8 +268,6 @@ all = [
"lerobot[libero]; sys_platform == 'linux'",
"lerobot[metaworld]",
"lerobot[sarm]",
"lerobot[robometer]",
"lerobot[topreward]",
"lerobot[peft]",
# "lerobot[unitree_g1]", TODO: Unitree requires specific installation instructions for unitree_sdk2
]
@@ -326,23 +290,8 @@ lerobot-imgtransform-viz="lerobot.scripts.lerobot_imgtransform_viz:main"
lerobot-edit-dataset="lerobot.scripts.lerobot_edit_dataset:main"
lerobot-setup-can="lerobot.scripts.lerobot_setup_can:main"
lerobot-rollout="lerobot.scripts.lerobot_rollout:main"
lerobot-policy-server="lerobot.scripts.lerobot_policy_server:main"
# ---------------- Tool Configurations ----------------
# cu128 wheels keep broad hardware reach; the driver floor is 570.86.
# To use a different CUDA variant, reinstall torch with an explicit index, e.g.:
# uv pip install --force-reinstall torch torchvision \
# --index-url https://download.pytorch.org/whl/cu130
[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true
[tool.uv.sources]
torch = [{ index = "pytorch-cu128", marker = "sys_platform == 'linux'" }]
torchvision = [{ index = "pytorch-cu128", marker = "sys_platform == 'linux'" }]
[tool.setuptools.package-data]
lerobot = ["envs/*.json"]
@@ -420,11 +369,8 @@ default.extend-ignore-identifiers-re = [
"ein",
"thw",
"inpt",
"arange",
"is_compileable",
"ROBOTIS",
"OT_VALUE",
"VanderBilt"
"OT_VALUE"
]
# TODO: Uncomment when ready to use
@@ -519,6 +465,11 @@ ignore_errors = false
# module = "lerobot.rl.*"
# ignore_errors = false
# [[tool.mypy.overrides]]
# module = "lerobot.async_inference.*"
# ignore_errors = false
[[tool.mypy.overrides]]
module = "lerobot.transport.*"
ignore_errors = false
@@ -12,8 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .configuration_gaussian_actor import GaussianActorConfig
from .modeling_gaussian_actor import GaussianActorPolicy
from .processor_gaussian_actor import make_gaussian_actor_pre_post_processors
"""
Async inference server/client.
__all__ = ["GaussianActorConfig", "GaussianActorPolicy", "make_gaussian_actor_pre_post_processors"]
Requires: ``pip install 'lerobot[async]'``
Available modules (import directly)::
from lerobot.async_inference.policy_server import ...
from lerobot.async_inference.robot_client import ...
"""
from lerobot.utils.import_utils import require_package
require_package("grpcio", extra="async", import_name="grpc")
__all__: list[str] = []
+203
View File
@@ -0,0 +1,203 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Callable
from dataclasses import dataclass, field
import torch
from lerobot.robots.config import RobotConfig
from .constants import (
DEFAULT_FPS,
DEFAULT_INFERENCE_LATENCY,
DEFAULT_OBS_QUEUE_TIMEOUT,
)
# Aggregate function registry for CLI usage
AGGREGATE_FUNCTIONS = {
"weighted_average": lambda old, new: 0.3 * old + 0.7 * new,
"latest_only": lambda old, new: new,
"average": lambda old, new: 0.5 * old + 0.5 * new,
"conservative": lambda old, new: 0.7 * old + 0.3 * new,
}
def get_aggregate_function(name: str) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
"""Get aggregate function by name from registry."""
if name not in AGGREGATE_FUNCTIONS:
available = list(AGGREGATE_FUNCTIONS.keys())
raise ValueError(f"Unknown aggregate function '{name}'. Available: {available}")
return AGGREGATE_FUNCTIONS[name]
@dataclass
class PolicyServerConfig:
"""Configuration for PolicyServer.
This class defines all configurable parameters for the PolicyServer,
including networking settings and action chunking specifications.
"""
# Networking configuration
host: str = field(default="localhost", metadata={"help": "Host address to bind the server to"})
port: int = field(default=8080, metadata={"help": "Port number to bind the server to"})
# Timing configuration
fps: int = field(default=DEFAULT_FPS, metadata={"help": "Frames per second"})
inference_latency: float = field(
default=DEFAULT_INFERENCE_LATENCY, metadata={"help": "Target inference latency in seconds"}
)
obs_queue_timeout: float = field(
default=DEFAULT_OBS_QUEUE_TIMEOUT, metadata={"help": "Timeout for observation queue in seconds"}
)
def __post_init__(self):
"""Validate configuration after initialization."""
if self.port < 1 or self.port > 65535:
raise ValueError(f"Port must be between 1 and 65535, got {self.port}")
if self.environment_dt <= 0:
raise ValueError(f"environment_dt must be positive, got {self.environment_dt}")
if self.inference_latency < 0:
raise ValueError(f"inference_latency must be non-negative, got {self.inference_latency}")
if self.obs_queue_timeout < 0:
raise ValueError(f"obs_queue_timeout must be non-negative, got {self.obs_queue_timeout}")
@classmethod
def from_dict(cls, config_dict: dict) -> "PolicyServerConfig":
"""Create a PolicyServerConfig from a dictionary."""
return cls(**config_dict)
@property
def environment_dt(self) -> float:
"""Environment time step, in seconds"""
return 1 / self.fps
def to_dict(self) -> dict:
"""Convert the configuration to a dictionary."""
return {
"host": self.host,
"port": self.port,
"fps": self.fps,
"environment_dt": self.environment_dt,
"inference_latency": self.inference_latency,
}
@dataclass
class RobotClientConfig:
"""Configuration for RobotClient.
This class defines all configurable parameters for the RobotClient,
including network connection, policy settings, and control behavior.
"""
# Policy configuration
policy_type: str = field(metadata={"help": "Type of policy to use"})
pretrained_name_or_path: str = field(metadata={"help": "Pretrained model name or path"})
# Robot configuration (for CLI usage - robot instance will be created from this)
robot: RobotConfig = field(metadata={"help": "Robot configuration"})
# Policies typically output K actions at max, but we can use less to avoid wasting bandwidth (as actions
# would be aggregated on the client side anyway, depending on the value of `chunk_size_threshold`)
actions_per_chunk: int = field(metadata={"help": "Number of actions per chunk"})
# Task instruction for the robot to execute (e.g., 'fold my tshirt')
task: str = field(default="", metadata={"help": "Task instruction for the robot to execute"})
# Network configuration
server_address: str = field(default="localhost:8080", metadata={"help": "Server address to connect to"})
# Device configuration
policy_device: str = field(default="cpu", metadata={"help": "Device for policy inference"})
client_device: str = field(
default="cpu",
metadata={
"help": "Device to move actions to after receiving from server (e.g., for downstream planners)"
},
)
# Control behavior configuration
chunk_size_threshold: float = field(default=0.5, metadata={"help": "Threshold for chunk size control"})
fps: int = field(default=DEFAULT_FPS, metadata={"help": "Frames per second"})
# Aggregate function configuration (CLI-compatible)
aggregate_fn_name: str = field(
default="weighted_average",
metadata={"help": f"Name of aggregate function to use. Options: {list(AGGREGATE_FUNCTIONS.keys())}"},
)
# Debug configuration
debug_visualize_queue_size: bool = field(
default=False, metadata={"help": "Visualize the action queue size"}
)
@property
def environment_dt(self) -> float:
"""Environment time step, in seconds"""
return 1 / self.fps
def __post_init__(self):
"""Validate configuration after initialization."""
if not self.server_address:
raise ValueError("server_address cannot be empty")
if not self.policy_type:
raise ValueError("policy_type cannot be empty")
if not self.pretrained_name_or_path:
raise ValueError("pretrained_name_or_path cannot be empty")
if not self.policy_device:
raise ValueError("policy_device cannot be empty")
if not self.client_device:
raise ValueError("client_device cannot be empty")
if self.chunk_size_threshold < 0 or self.chunk_size_threshold > 1:
raise ValueError(f"chunk_size_threshold must be between 0 and 1, got {self.chunk_size_threshold}")
if self.fps <= 0:
raise ValueError(f"fps must be positive, got {self.fps}")
if self.actions_per_chunk <= 0:
raise ValueError(f"actions_per_chunk must be positive, got {self.actions_per_chunk}")
self.aggregate_fn = get_aggregate_function(self.aggregate_fn_name)
@classmethod
def from_dict(cls, config_dict: dict) -> "RobotClientConfig":
"""Create a RobotClientConfig from a dictionary."""
return cls(**config_dict)
def to_dict(self) -> dict:
"""Convert the configuration to a dictionary."""
return {
"server_address": self.server_address,
"policy_type": self.policy_type,
"pretrained_name_or_path": self.pretrained_name_or_path,
"policy_device": self.policy_device,
"client_device": self.client_device,
"chunk_size_threshold": self.chunk_size_threshold,
"fps": self.fps,
"actions_per_chunk": self.actions_per_chunk,
"task": self.task,
"debug_visualize_queue_size": self.debug_visualize_queue_size,
"aggregate_fn_name": self.aggregate_fn_name,
}
+29
View File
@@ -0,0 +1,29 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Client side: The environment evolves with a time resolution equal to 1/fps"""
DEFAULT_FPS = 30
"""Server side: Running inference on (at most) 1/fps"""
DEFAULT_INFERENCE_LATENCY = 1 / DEFAULT_FPS
"""Server side: Timeout for observation queue in seconds"""
DEFAULT_OBS_QUEUE_TIMEOUT = 2
# All action chunking policies
SUPPORTED_POLICIES = ["act", "smolvla", "diffusion", "tdmpc", "vqbet", "pi0", "pi05", "groot"]
# TODO: Add all other robots
SUPPORTED_ROBOTS = ["so100_follower", "so101_follower", "bi_so_follower", "omx_follower"]
+297
View File
@@ -0,0 +1,297 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import logging.handlers
import os
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import torch
from lerobot.configs import PolicyFeature
# NOTE: Configs need to be loaded for the client to be able to instantiate the policy config
from lerobot.policies import ( # noqa: F401
ACTConfig,
DiffusionConfig,
PI0Config,
PI05Config,
SmolVLAConfig,
VQBeTConfig,
)
from lerobot.robots.robot import Robot
from lerobot.utils.constants import OBS_IMAGES, OBS_STATE, OBS_STR
from lerobot.utils.feature_utils import build_dataset_frame, hw_to_dataset_features
from lerobot.utils.utils import init_logging
Action = torch.Tensor
# observation as received from the robot (can be numpy arrays, floats, etc.)
RawObservation = dict[str, Any]
# observation as those recorded in LeRobot dataset (keys are different)
LeRobotObservation = dict[str, torch.Tensor]
# observation, ready for policy inference (image keys resized)
Observation = dict[str, torch.Tensor]
def visualize_action_queue_size(action_queue_size: list[int]) -> None:
import matplotlib.pyplot as plt
_, ax = plt.subplots()
ax.set_title("Action Queue Size Over Time")
ax.set_xlabel("Environment steps")
ax.set_ylabel("Action Queue Size")
ax.set_ylim(0, max(action_queue_size) * 1.1)
ax.grid(True, alpha=0.3)
ax.plot(range(len(action_queue_size)), action_queue_size)
plt.show()
def map_robot_keys_to_lerobot_features(robot: Robot) -> dict[str, dict]:
return hw_to_dataset_features(robot.observation_features, OBS_STR, use_video=False)
def is_image_key(k: str) -> bool:
return k.startswith(OBS_IMAGES)
def resize_robot_observation_image(image: torch.tensor, resize_dims: tuple[int, int, int]) -> torch.tensor:
assert image.ndim == 3, f"Image must be (C, H, W)! Received {image.shape}"
# (H, W, C) -> (C, H, W) for resizing from robot obsevation resolution to policy image resolution
image = image.permute(2, 0, 1)
dims = (resize_dims[1], resize_dims[2])
# Add batch dimension for interpolate: (C, H, W) -> (1, C, H, W)
image_batched = image.unsqueeze(0)
# Interpolate and remove batch dimension: (1, C, H, W) -> (C, H, W)
resized = torch.nn.functional.interpolate(image_batched, size=dims, mode="bilinear", align_corners=False)
return resized.squeeze(0)
# TODO(Steven): Consider implementing a pipeline step for this
def raw_observation_to_observation(
raw_observation: RawObservation,
lerobot_features: dict[str, dict],
policy_image_features: dict[str, PolicyFeature],
) -> Observation:
observation = {}
observation = prepare_raw_observation(raw_observation, lerobot_features, policy_image_features)
for k, v in observation.items():
if isinstance(v, torch.Tensor): # VLAs present natural-language instructions in observations
if "image" in k:
# Policy expects images in shape (B, C, H, W)
observation[k] = prepare_image(v).unsqueeze(0)
else:
observation[k] = v
return observation
def prepare_image(image: torch.Tensor) -> torch.Tensor:
"""Minimal preprocessing to turn int8 images to float32 in [0, 1], and create a memory-contiguous tensor"""
image = image.type(torch.float32) / 255
image = image.contiguous()
return image
def extract_state_from_raw_observation(
lerobot_obs: RawObservation,
) -> torch.Tensor:
"""Extract the state from a raw observation."""
state = torch.tensor(lerobot_obs[OBS_STATE])
if state.ndim == 1:
state = state.unsqueeze(0)
return state
def extract_images_from_raw_observation(
lerobot_obs: RawObservation,
camera_key: str,
) -> dict[str, torch.Tensor]:
"""Extract the images from a raw observation."""
return torch.tensor(lerobot_obs[camera_key])
def make_lerobot_observation(
robot_obs: RawObservation,
lerobot_features: dict[str, dict],
) -> LeRobotObservation:
"""Make a lerobot observation from a raw observation."""
return build_dataset_frame(lerobot_features, robot_obs, prefix=OBS_STR)
def prepare_raw_observation(
robot_obs: RawObservation,
lerobot_features: dict[str, dict],
policy_image_features: dict[str, PolicyFeature],
) -> Observation:
"""Matches keys from the raw robot_obs dict to the keys expected by a given policy (passed as
policy_image_features)."""
# 1. {motor.pos1:value1, motor.pos2:value2, ..., laptop:np.ndarray} ->
# -> {observation.state:[value1,value2,...], observation.images.laptop:np.ndarray}
lerobot_obs = make_lerobot_observation(robot_obs, lerobot_features)
# 2. Greps all observation.images.<> keys
image_keys = list(filter(is_image_key, lerobot_obs))
# state's shape is expected as (B, state_dim)
state_dict = {OBS_STATE: extract_state_from_raw_observation(lerobot_obs)}
image_dict = {
image_k: extract_images_from_raw_observation(lerobot_obs, image_k) for image_k in image_keys
}
# Turns the image features to (C, H, W) with H, W matching the policy image features.
# This reduces the resolution of the images
image_dict = {
key: resize_robot_observation_image(torch.tensor(lerobot_obs[key]), policy_image_features[key].shape)
for key in image_keys
}
if "task" in robot_obs:
state_dict["task"] = robot_obs["task"]
return {**state_dict, **image_dict}
def get_logger(name: str, log_to_file: bool = True) -> logging.Logger:
"""
Get a logger using the standardized logging setup from utils.py.
Args:
name: Logger name (e.g., 'policy_server', 'robot_client')
log_to_file: Whether to also log to a file
Returns:
Configured logger instance
"""
# Create logs directory if logging to file
if log_to_file:
os.makedirs("logs", exist_ok=True)
log_file = Path(f"logs/{name}_{int(time.time())}.log")
else:
log_file = None
# Initialize the standardized logging
init_logging(log_file=log_file, display_pid=False)
# Return a named logger
return logging.getLogger(name)
@dataclass
class TimedData:
"""A data object with timestamp and timestep information.
Args:
timestamp: Unix timestamp relative to data's creation.
data: The actual data to wrap a timestamp around.
timestep: The timestep of the data.
"""
timestamp: float
timestep: int
def get_timestamp(self):
return self.timestamp
def get_timestep(self):
return self.timestep
@dataclass
class TimedAction(TimedData):
action: Action
def get_action(self):
return self.action
@dataclass
class TimedObservation(TimedData):
observation: RawObservation
must_go: bool = False
def get_observation(self):
return self.observation
@dataclass
class FPSTracker:
"""Utility class to track FPS metrics over time."""
target_fps: float
first_timestamp: float = None
total_obs_count: int = 0
def calculate_fps_metrics(self, current_timestamp: float) -> dict[str, float]:
"""Calculate average FPS vs target"""
self.total_obs_count += 1
# Initialize first observation time
if self.first_timestamp is None:
self.first_timestamp = current_timestamp
# Calculate overall average FPS (since start)
total_duration = current_timestamp - self.first_timestamp
avg_fps = (self.total_obs_count - 1) / total_duration if total_duration > 1e-6 else 0.0
return {"avg_fps": avg_fps, "target_fps": self.target_fps}
def reset(self):
"""Reset the FPS tracker state"""
self.first_timestamp = None
self.total_obs_count = 0
@dataclass
class RemotePolicyConfig:
policy_type: str
pretrained_name_or_path: str
lerobot_features: dict[str, PolicyFeature]
actions_per_chunk: int
device: str = "cpu"
rename_map: dict[str, str] = field(default_factory=dict)
def _compare_observation_states(obs1_state: torch.Tensor, obs2_state: torch.Tensor, atol: float) -> bool:
"""Check if two observation states are similar, under a tolerance threshold"""
return bool(torch.linalg.norm(obs1_state - obs2_state) < atol)
def observations_similar(
obs1: TimedObservation, obs2: TimedObservation, lerobot_features: dict[str, dict], atol: float = 1
) -> bool:
"""Check if two observations are similar, under a tolerance threshold. Measures distance between
observations as the difference in joint-space between the two observations.
NOTE(fracapuano): This is a very simple check, and it is enough for the current use case.
An immediate next step is to use (fast) perceptual difference metrics comparing some camera views,
to surpass this joint-space similarity check.
"""
obs1_state = extract_state_from_raw_observation(
make_lerobot_observation(obs1.get_observation(), lerobot_features)
)
obs2_state = extract_state_from_raw_observation(
make_lerobot_observation(obs2.get_observation(), lerobot_features)
)
return _compare_observation_states(obs1_state, obs2_state, atol=atol)
@@ -0,0 +1,439 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
```shell
python -m lerobot.async_inference.policy_server \
--host=127.0.0.1 \
--port=8080 \
--fps=30 \
--inference_latency=0.033 \
--obs_queue_timeout=1
```
"""
import logging
import pickle # nosec
import threading
import time
from concurrent import futures
from dataclasses import asdict
from pprint import pformat
from queue import Empty, Queue
from typing import Any
import draccus
import grpc
import torch
from lerobot.policies import get_policy_class, make_pre_post_processors
from lerobot.processor import PolicyProcessorPipeline
from lerobot.transport import (
services_pb2, # type: ignore
services_pb2_grpc, # type: ignore
)
from lerobot.transport.utils import receive_bytes_in_chunks
from lerobot.types import PolicyAction
from .configs import PolicyServerConfig
from .constants import SUPPORTED_POLICIES
from .helpers import (
FPSTracker,
Observation,
RemotePolicyConfig,
TimedAction,
TimedObservation,
get_logger,
observations_similar,
raw_observation_to_observation,
)
class PolicyServer(services_pb2_grpc.AsyncInferenceServicer):
prefix = "policy_server"
logger = get_logger(prefix)
def __init__(self, config: PolicyServerConfig):
self.config = config
self.shutdown_event = threading.Event()
# FPS measurement
self.fps_tracker = FPSTracker(target_fps=config.fps)
self.observation_queue = Queue(maxsize=1)
self._predicted_timesteps_lock = threading.Lock()
self._predicted_timesteps = set()
self.last_processed_obs = None
# Attributes will be set by SendPolicyInstructions
self.device = None
self.policy_type = None
self.lerobot_features = None
self.actions_per_chunk = None
self.policy = None
self.preprocessor: PolicyProcessorPipeline[dict[str, Any], dict[str, Any]] | None = None
self.postprocessor: PolicyProcessorPipeline[PolicyAction, PolicyAction] | None = None
@property
def running(self):
return not self.shutdown_event.is_set()
@property
def policy_image_features(self):
return self.policy.config.image_features
def _reset_server(self) -> None:
"""Flushes server state when new client connects."""
# only running inference on the latest observation received by the server
self.shutdown_event.set()
self.observation_queue = Queue(maxsize=1)
with self._predicted_timesteps_lock:
self._predicted_timesteps = set()
def Ready(self, request, context): # noqa: N802
client_id = context.peer()
self.logger.info(f"Client {client_id} connected and ready")
self._reset_server()
self.shutdown_event.clear()
return services_pb2.Empty()
def SendPolicyInstructions(self, request, context): # noqa: N802
"""Receive policy instructions from the robot client"""
if not self.running:
self.logger.warning("Server is not running. Ignoring policy instructions.")
return services_pb2.Empty()
client_id = context.peer()
policy_specs = pickle.loads(request.data) # nosec
if not isinstance(policy_specs, RemotePolicyConfig):
raise TypeError(f"Policy specs must be a RemotePolicyConfig. Got {type(policy_specs)}")
if policy_specs.policy_type not in SUPPORTED_POLICIES:
raise ValueError(
f"Policy type {policy_specs.policy_type} not supported. "
f"Supported policies: {SUPPORTED_POLICIES}"
)
self.logger.info(
f"Receiving policy instructions from {client_id} | "
f"Policy type: {policy_specs.policy_type} | "
f"Pretrained name or path: {policy_specs.pretrained_name_or_path} | "
f"Actions per chunk: {policy_specs.actions_per_chunk} | "
f"Device: {policy_specs.device}"
)
self.device = policy_specs.device
self.policy_type = policy_specs.policy_type # act, pi0, etc.
self.lerobot_features = policy_specs.lerobot_features
self.actions_per_chunk = policy_specs.actions_per_chunk
policy_class = get_policy_class(self.policy_type)
start = time.perf_counter()
self.policy = policy_class.from_pretrained(policy_specs.pretrained_name_or_path)
self.policy.to(self.device)
# Load preprocessor and postprocessor, overriding device to match requested device
device_override = {"device": self.device}
self.preprocessor, self.postprocessor = make_pre_post_processors(
self.policy.config,
pretrained_path=policy_specs.pretrained_name_or_path,
preprocessor_overrides={
"device_processor": device_override,
"rename_observations_processor": {"rename_map": policy_specs.rename_map},
},
postprocessor_overrides={"device_processor": device_override},
)
end = time.perf_counter()
self.logger.info(f"Time taken to put policy on {self.device}: {end - start:.4f} seconds")
return services_pb2.Empty()
def SendObservations(self, request_iterator, context): # noqa: N802
"""Receive observations from the robot client"""
client_id = context.peer()
self.logger.debug(f"Receiving observations from {client_id}")
receive_time = time.time() # comparing timestamps so need time.time()
start_deserialize = time.perf_counter()
received_bytes = receive_bytes_in_chunks(
request_iterator, None, self.shutdown_event, self.logger
) # blocking call while looping over request_iterator
timed_observation = pickle.loads(received_bytes) # nosec
deserialize_time = time.perf_counter() - start_deserialize
self.logger.debug(f"Received observation #{timed_observation.get_timestep()}")
obs_timestep = timed_observation.get_timestep()
obs_timestamp = timed_observation.get_timestamp()
# Calculate FPS metrics
fps_metrics = self.fps_tracker.calculate_fps_metrics(obs_timestamp)
self.logger.debug(
f"Received observation #{obs_timestep} | "
f"Avg FPS: {fps_metrics['avg_fps']:.2f} | " # fps at which observations are received from client
f"Target: {fps_metrics['target_fps']:.2f} | "
f"One-way latency: {(receive_time - obs_timestamp) * 1000:.2f}ms"
)
self.logger.debug(
f"Server timestamp: {receive_time:.6f} | "
f"Client timestamp: {obs_timestamp:.6f} | "
f"Deserialization time: {deserialize_time:.6f}s"
)
if not self._enqueue_observation(
timed_observation # wrapping a RawObservation
):
self.logger.debug(f"Observation #{obs_timestep} has been filtered out")
return services_pb2.Empty()
def GetActions(self, request, context): # noqa: N802
"""Returns actions to the robot client. Actions are sent as a single
chunk, containing multiple actions."""
client_id = context.peer()
self.logger.debug(f"Client {client_id} connected for action streaming")
# Generate action based on the most recent observation and its timestep
try:
getactions_starts = time.perf_counter()
obs = self.observation_queue.get(timeout=self.config.obs_queue_timeout)
self.logger.info(
f"Running inference for observation #{obs.get_timestep()} (must_go: {obs.must_go})"
)
with self._predicted_timesteps_lock:
self._predicted_timesteps.add(obs.get_timestep())
start_time = time.perf_counter()
action_chunk = self._predict_action_chunk(obs)
inference_time = time.perf_counter() - start_time
start_time = time.perf_counter()
actions_bytes = pickle.dumps(action_chunk) # nosec
serialize_time = time.perf_counter() - start_time
# Create and return the action chunk
actions = services_pb2.Actions(data=actions_bytes)
self.logger.info(
f"Action chunk #{obs.get_timestep()} generated | "
f"Total time: {(inference_time + serialize_time) * 1000:.2f}ms"
)
self.logger.debug(
f"Action chunk #{obs.get_timestep()} generated | "
f"Inference time: {inference_time:.2f}s |"
f"Serialize time: {serialize_time:.2f}s |"
f"Total time: {inference_time + serialize_time:.2f}s"
)
time.sleep(
max(0, self.config.inference_latency - max(0, time.perf_counter() - getactions_starts))
) # sleep controls inference latency
return actions
except Empty: # no observation added to queue in obs_queue_timeout
return services_pb2.Empty()
except Exception as e:
self.logger.error(f"Error in StreamActions: {e}")
return services_pb2.Empty()
def _obs_sanity_checks(self, obs: TimedObservation, previous_obs: TimedObservation) -> bool:
"""Check if the observation is valid to be processed by the policy"""
with self._predicted_timesteps_lock:
predicted_timesteps = self._predicted_timesteps
if obs.get_timestep() in predicted_timesteps:
self.logger.debug(f"Skipping observation #{obs.get_timestep()} - Timestep predicted already!")
return False
elif observations_similar(obs, previous_obs, lerobot_features=self.lerobot_features):
self.logger.debug(
f"Skipping observation #{obs.get_timestep()} - Observation too similar to last obs predicted!"
)
return False
else:
return True
def _enqueue_observation(self, obs: TimedObservation) -> bool:
"""Enqueue an observation if it must go through processing, otherwise skip it.
Observations not in queue are never run through the policy network"""
if (
obs.must_go
or self.last_processed_obs is None
or self._obs_sanity_checks(obs, self.last_processed_obs)
):
last_obs = self.last_processed_obs.get_timestep() if self.last_processed_obs else "None"
self.logger.debug(
f"Enqueuing observation. Must go: {obs.must_go} | Last processed obs: {last_obs}"
)
# If queue is full, get the old observation to make room
if self.observation_queue.full():
# pops from queue
_ = self.observation_queue.get_nowait()
self.logger.debug("Observation queue was full, removed oldest observation")
# Now put the new observation (never blocks as queue is non-full here)
self.observation_queue.put(obs)
return True
return False
def _time_action_chunk(self, t_0: float, action_chunk: list[torch.Tensor], i_0: int) -> list[TimedAction]:
"""Turn a chunk of actions into a list of TimedAction instances,
with the first action corresponding to t_0 and the rest corresponding to
t_0 + i*environment_dt for i in range(len(action_chunk))
"""
return [
TimedAction(timestamp=t_0 + i * self.config.environment_dt, timestep=i_0 + i, action=action)
for i, action in enumerate(action_chunk)
]
def _get_action_chunk(self, observation: dict[str, torch.Tensor]) -> torch.Tensor:
"""Get an action chunk from the policy. The chunk contains only"""
chunk = self.policy.predict_action_chunk(observation)
if chunk.ndim != 3:
chunk = chunk.unsqueeze(0) # adding batch dimension, now shape is (B, chunk_size, action_dim)
return chunk[:, : self.actions_per_chunk, :]
def _predict_action_chunk(self, observation_t: TimedObservation) -> list[TimedAction]:
"""Predict an action chunk based on an observation.
Pipeline:
1. Convert raw observation to LeRobot format
2. Apply preprocessor (tokenization, normalization, batching, device placement)
3. Run policy inference to get action chunk
4. Apply postprocessor (unnormalization, device movement)
5. Convert to TimedAction list
"""
"""1. Prepare observation"""
start_prepare = time.perf_counter()
observation: Observation = raw_observation_to_observation(
observation_t.get_observation(),
self.lerobot_features,
self.policy_image_features,
)
prepare_time = time.perf_counter() - start_prepare
"""2. Apply preprocessor"""
start_preprocess = time.perf_counter()
observation = self.preprocessor(observation)
self.last_processed_obs: TimedObservation = observation_t
preprocessing_time = time.perf_counter() - start_preprocess
"""3. Get action chunk"""
start_inference = time.perf_counter()
action_tensor = self._get_action_chunk(observation)
inference_time = time.perf_counter() - start_inference
self.logger.info(
f"Preprocessing and inference took {inference_time:.4f}s, action shape: {action_tensor.shape}"
)
"""4. Apply postprocessor"""
# Apply postprocessor (handles unnormalization and device movement)
# Postprocessor expects (B, action_dim) per action, but we have (B, chunk_size, action_dim)
# So we process each action in the chunk individually
start_postprocess = time.perf_counter()
_, chunk_size, _ = action_tensor.shape
# Process each action in the chunk
processed_actions = []
for i in range(chunk_size):
# Extract action at timestep i: (B, action_dim)
single_action = action_tensor[:, i, :]
processed_action = self.postprocessor(single_action)
processed_actions.append(processed_action)
# Stack back to (B, chunk_size, action_dim), then remove batch dim
action_tensor = torch.stack(processed_actions, dim=1).squeeze(0)
self.logger.debug(f"Postprocessed action shape: {action_tensor.shape}")
action_tensor = action_tensor.detach().cpu()
"""5. Convert to TimedAction list"""
action_chunk = self._time_action_chunk(
observation_t.get_timestamp(), list(action_tensor), observation_t.get_timestep()
)
postprocess_stops = time.perf_counter()
postprocessing_time = postprocess_stops - start_postprocess
self.logger.info(
f"Observation {observation_t.get_timestep()} | "
f"Total time: {1000 * (postprocess_stops - start_prepare):.2f}ms"
)
self.logger.debug(
f"Observation {observation_t.get_timestep()} | "
f"Prepare time: {1000 * prepare_time:.2f}ms | "
f"Preprocessing time: {1000 * preprocessing_time:.2f}ms | "
f"Inference time: {1000 * inference_time:.2f}ms | "
f"Postprocessing time: {1000 * postprocessing_time:.2f}ms | "
f"Total time: {1000 * (postprocess_stops - start_prepare):.2f}ms"
)
return action_chunk
def stop(self):
"""Stop the server"""
self._reset_server()
self.logger.info("Server stopping...")
@draccus.wrap()
def serve(cfg: PolicyServerConfig):
"""Start the PolicyServer with the given configuration.
Args:
config: PolicyServerConfig instance. If None, uses default configuration.
"""
logging.info(pformat(asdict(cfg)))
# Create the server instance first
policy_server = PolicyServer(cfg)
# Setup and start gRPC server
server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
services_pb2_grpc.add_AsyncInferenceServicer_to_server(policy_server, server)
server.add_insecure_port(f"{cfg.host}:{cfg.port}")
policy_server.logger.info(f"PolicyServer started on {cfg.host}:{cfg.port}")
server.start()
server.wait_for_termination()
policy_server.logger.info("Server terminated")
if __name__ == "__main__":
serve()
+517
View File
@@ -0,0 +1,517 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example command:
```shell
python src/lerobot/async_inference/robot_client.py \
--robot.type=so100_follower \
--robot.port=/dev/tty.usbmodem58760431541 \
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 1920, height: 1080, fps: 30}}" \
--robot.id=black \
--task="dummy" \
--server_address=127.0.0.1:8080 \
--policy_type=act \
--pretrained_name_or_path=user/model \
--policy_device=mps \
--client_device=cpu \
--actions_per_chunk=50 \
--chunk_size_threshold=0.5 \
--aggregate_fn_name=weighted_average \
--debug_visualize_queue_size=True
```
"""
import logging
import pickle # nosec
import threading
import time
from collections.abc import Callable
from dataclasses import asdict
from pprint import pformat
from queue import Queue
from typing import Any
import draccus
import grpc
import torch
from lerobot.cameras.opencv import OpenCVCameraConfig # noqa: F401
from lerobot.cameras.realsense import RealSenseCameraConfig # noqa: F401
from lerobot.robots import ( # noqa: F401
Robot,
RobotConfig,
bi_so_follower,
koch_follower,
make_robot_from_config,
omx_follower,
so_follower,
)
from lerobot.transport import (
services_pb2, # type: ignore
services_pb2_grpc, # type: ignore
)
from lerobot.transport.utils import grpc_channel_options, send_bytes_in_chunks
from lerobot.utils.import_utils import register_third_party_plugins
from .configs import RobotClientConfig
from .helpers import (
Action,
FPSTracker,
Observation,
RawObservation,
RemotePolicyConfig,
TimedAction,
TimedObservation,
get_logger,
map_robot_keys_to_lerobot_features,
visualize_action_queue_size,
)
class RobotClient:
prefix = "robot_client"
logger = get_logger(prefix)
def __init__(self, config: RobotClientConfig):
"""Initialize RobotClient with unified configuration.
Args:
config: RobotClientConfig containing all configuration parameters
"""
# Store configuration
self.config = config
self.robot = make_robot_from_config(config.robot)
self.robot.connect()
lerobot_features = map_robot_keys_to_lerobot_features(self.robot)
# Use environment variable if server_address is not provided in config
self.server_address = config.server_address
self.policy_config = RemotePolicyConfig(
config.policy_type,
config.pretrained_name_or_path,
lerobot_features,
config.actions_per_chunk,
config.policy_device,
)
self.channel = grpc.insecure_channel(
self.server_address, grpc_channel_options(initial_backoff=f"{config.environment_dt:.4f}s")
)
self.stub = services_pb2_grpc.AsyncInferenceStub(self.channel)
self.logger.info(f"Initializing client to connect to server at {self.server_address}")
self.shutdown_event = threading.Event()
# Initialize client side variables
self.latest_action_lock = threading.Lock()
self.latest_action = -1
self.action_chunk_size = -1
self._chunk_size_threshold = config.chunk_size_threshold
self.action_queue = Queue()
self.action_queue_lock = threading.Lock() # Protect queue operations
self.action_queue_size = []
self.start_barrier = threading.Barrier(2) # 2 threads: action receiver, control loop
# FPS measurement
self.fps_tracker = FPSTracker(target_fps=self.config.fps)
self.logger.info("Robot connected and ready")
# Use an event for thread-safe coordination
self.must_go = threading.Event()
self.must_go.set() # Initially set - observations qualify for direct processing
@property
def running(self):
return not self.shutdown_event.is_set()
def start(self):
"""Start the robot client and connect to the policy server"""
try:
# client-server handshake
start_time = time.perf_counter()
self.stub.Ready(services_pb2.Empty())
end_time = time.perf_counter()
self.logger.debug(f"Connected to policy server in {end_time - start_time:.4f}s")
# send policy instructions
policy_config_bytes = pickle.dumps(self.policy_config)
policy_setup = services_pb2.PolicySetup(data=policy_config_bytes)
self.logger.info("Sending policy instructions to policy server")
self.logger.debug(
f"Policy type: {self.policy_config.policy_type} | "
f"Pretrained name or path: {self.policy_config.pretrained_name_or_path} | "
f"Device: {self.policy_config.device}"
)
self.stub.SendPolicyInstructions(policy_setup)
self.shutdown_event.clear()
return True
except grpc.RpcError as e:
self.logger.error(f"Failed to connect to policy server: {e}")
return False
def stop(self):
"""Stop the robot client"""
self.shutdown_event.set()
self.robot.disconnect()
self.logger.debug("Robot disconnected")
self.channel.close()
self.logger.debug("Client stopped, channel closed")
def send_observation(
self,
obs: TimedObservation,
) -> bool:
"""Send observation to the policy server.
Returns True if the observation was sent successfully, False otherwise."""
if not self.running:
raise RuntimeError("Client not running. Run RobotClient.start() before sending observations.")
if not isinstance(obs, TimedObservation):
raise ValueError("Input observation needs to be a TimedObservation!")
start_time = time.perf_counter()
observation_bytes = pickle.dumps(obs)
serialize_time = time.perf_counter() - start_time
self.logger.debug(f"Observation serialization time: {serialize_time:.6f}s")
try:
observation_iterator = send_bytes_in_chunks(
observation_bytes,
services_pb2.Observation,
log_prefix="[CLIENT] Observation",
silent=True,
)
_ = self.stub.SendObservations(observation_iterator)
obs_timestep = obs.get_timestep()
self.logger.debug(f"Sent observation #{obs_timestep} | ")
return True
except grpc.RpcError as e:
self.logger.error(f"Error sending observation #{obs.get_timestep()}: {e}")
return False
def _inspect_action_queue(self):
with self.action_queue_lock:
queue_size = self.action_queue.qsize()
timestamps = sorted([action.get_timestep() for action in self.action_queue.queue])
self.logger.debug(f"Queue size: {queue_size}, Queue contents: {timestamps}")
return queue_size, timestamps
def _aggregate_action_queues(
self,
incoming_actions: list[TimedAction],
aggregate_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
):
"""Finds the same timestep actions in the queue and aggregates them using the aggregate_fn"""
if aggregate_fn is None:
# default aggregate function: take the latest action
def aggregate_fn(x1, x2):
return x2
future_action_queue = Queue()
with self.action_queue_lock:
internal_queue = self.action_queue.queue
current_action_queue = {action.get_timestep(): action.get_action() for action in internal_queue}
for new_action in incoming_actions:
with self.latest_action_lock:
latest_action = self.latest_action
# New action is older than the latest action in the queue, skip it
if new_action.get_timestep() <= latest_action:
continue
# If the new action's timestep is not in the current action queue, add it directly
elif new_action.get_timestep() not in current_action_queue:
future_action_queue.put(new_action)
continue
# If the new action's timestep is in the current action queue, aggregate it
# TODO: There is probably a way to do this with broadcasting of the two action tensors
future_action_queue.put(
TimedAction(
timestamp=new_action.get_timestamp(),
timestep=new_action.get_timestep(),
action=aggregate_fn(
current_action_queue[new_action.get_timestep()], new_action.get_action()
),
)
)
with self.action_queue_lock:
self.action_queue = future_action_queue
def receive_actions(self, verbose: bool = False):
"""Receive actions from the policy server"""
# Wait at barrier for synchronized start
self.start_barrier.wait()
self.logger.info("Action receiving thread starting")
while self.running:
try:
# Use StreamActions to get a stream of actions from the server
actions_chunk = self.stub.GetActions(services_pb2.Empty())
if len(actions_chunk.data) == 0:
continue # received `Empty` from server, wait for next call
receive_time = time.time()
# Deserialize bytes back into list[TimedAction]
deserialize_start = time.perf_counter()
timed_actions = pickle.loads(actions_chunk.data) # nosec
deserialize_time = time.perf_counter() - deserialize_start
# Log device type of received actions
if len(timed_actions) > 0:
received_device = timed_actions[0].get_action().device.type
self.logger.debug(f"Received actions on device: {received_device}")
# Move actions to client_device (e.g., for downstream planners that need GPU)
client_device = self.config.client_device
if client_device != "cpu":
for timed_action in timed_actions:
if timed_action.get_action().device.type != client_device:
timed_action.action = timed_action.get_action().to(client_device)
self.logger.debug(f"Converted actions to device: {client_device}")
else:
self.logger.debug(f"Actions kept on device: {client_device}")
self.action_chunk_size = max(self.action_chunk_size, len(timed_actions))
# Calculate network latency if we have matching observations
if len(timed_actions) > 0 and verbose:
with self.latest_action_lock:
latest_action = self.latest_action
self.logger.debug(f"Current latest action: {latest_action}")
# Get queue state before changes
old_size, old_timesteps = self._inspect_action_queue()
if not old_timesteps:
old_timesteps = [latest_action] # queue was empty
# Log incoming actions
incoming_timesteps = [a.get_timestep() for a in timed_actions]
first_action_timestep = timed_actions[0].get_timestep()
server_to_client_latency = (receive_time - timed_actions[0].get_timestamp()) * 1000
self.logger.info(
f"Received action chunk for step #{first_action_timestep} | "
f"Latest action: #{latest_action} | "
f"Incoming actions: {incoming_timesteps[0]}:{incoming_timesteps[-1]} | "
f"Network latency (server->client): {server_to_client_latency:.2f}ms | "
f"Deserialization time: {deserialize_time * 1000:.2f}ms"
)
# Update action queue
start_time = time.perf_counter()
self._aggregate_action_queues(timed_actions, self.config.aggregate_fn)
queue_update_time = time.perf_counter() - start_time
self.must_go.set() # after receiving actions, next empty queue triggers must-go processing!
if verbose:
# Get queue state after changes
new_size, new_timesteps = self._inspect_action_queue()
with self.latest_action_lock:
latest_action = self.latest_action
self.logger.info(
f"Latest action: {latest_action} | "
f"Old action steps: {old_timesteps[0]}:{old_timesteps[-1]} | "
f"Incoming action steps: {incoming_timesteps[0]}:{incoming_timesteps[-1]} | "
f"Updated action steps: {new_timesteps[0]}:{new_timesteps[-1]}"
)
self.logger.debug(
f"Queue update complete ({queue_update_time:.6f}s) | "
f"Before: {old_size} items | "
f"After: {new_size} items | "
)
except grpc.RpcError as e:
self.logger.error(f"Error receiving actions: {e}")
def actions_available(self):
"""Check if there are actions available in the queue"""
with self.action_queue_lock:
return not self.action_queue.empty()
def _action_tensor_to_action_dict(self, action_tensor: torch.Tensor) -> dict[str, float]:
action = {key: action_tensor[i].item() for i, key in enumerate(self.robot.action_features)}
return action
def control_loop_action(self, verbose: bool = False) -> dict[str, Any]:
"""Reading and performing actions in local queue"""
# Lock only for queue operations
get_start = time.perf_counter()
with self.action_queue_lock:
self.action_queue_size.append(self.action_queue.qsize())
# Get action from queue
timed_action = self.action_queue.get_nowait()
get_end = time.perf_counter() - get_start
_performed_action = self.robot.send_action(
self._action_tensor_to_action_dict(timed_action.get_action())
)
with self.latest_action_lock:
self.latest_action = timed_action.get_timestep()
if verbose:
with self.action_queue_lock:
current_queue_size = self.action_queue.qsize()
self.logger.debug(
f"Ts={timed_action.get_timestamp()} | "
f"Action #{timed_action.get_timestep()} performed | "
f"Queue size: {current_queue_size}"
)
self.logger.debug(
f"Popping action from queue to perform took {get_end:.6f}s | Queue size: {current_queue_size}"
)
return _performed_action
def _ready_to_send_observation(self):
"""Flags when the client is ready to send an observation"""
with self.action_queue_lock:
return self.action_queue.qsize() / self.action_chunk_size <= self._chunk_size_threshold
def control_loop_observation(self, task: str, verbose: bool = False) -> RawObservation:
try:
# Get serialized observation bytes from the function
start_time = time.perf_counter()
raw_observation: RawObservation = self.robot.get_observation()
raw_observation["task"] = task
with self.latest_action_lock:
latest_action = self.latest_action
observation = TimedObservation(
timestamp=time.time(), # need time.time() to compare timestamps across client and server
observation=raw_observation,
timestep=max(latest_action, 0),
)
obs_capture_time = time.perf_counter() - start_time
# If there are no actions left in the queue, the observation must go through processing!
with self.action_queue_lock:
observation.must_go = self.must_go.is_set() and self.action_queue.empty()
current_queue_size = self.action_queue.qsize()
_ = self.send_observation(observation)
self.logger.debug(f"QUEUE SIZE: {current_queue_size} (Must go: {observation.must_go})")
if observation.must_go:
# must-go event will be set again after receiving actions
self.must_go.clear()
if verbose:
# Calculate comprehensive FPS metrics
fps_metrics = self.fps_tracker.calculate_fps_metrics(observation.get_timestamp())
self.logger.info(
f"Obs #{observation.get_timestep()} | "
f"Avg FPS: {fps_metrics['avg_fps']:.2f} | "
f"Target: {fps_metrics['target_fps']:.2f}"
)
self.logger.debug(
f"Ts={observation.get_timestamp():.6f} | Capturing observation took {obs_capture_time:.6f}s"
)
return raw_observation
except Exception as e:
self.logger.error(f"Error in observation sender: {e}")
def control_loop(self, task: str, verbose: bool = False) -> tuple[Observation, Action]:
"""Combined function for executing actions and streaming observations"""
# Wait at barrier for synchronized start
self.start_barrier.wait()
self.logger.info("Control loop thread starting")
_performed_action = None
_captured_observation = None
while self.running:
control_loop_start = time.perf_counter()
"""Control loop: (1) Performing actions, when available"""
if self.actions_available():
_performed_action = self.control_loop_action(verbose)
"""Control loop: (2) Streaming observations to the remote policy server"""
if self._ready_to_send_observation():
_captured_observation = self.control_loop_observation(task, verbose)
self.logger.debug(f"Control loop (ms): {(time.perf_counter() - control_loop_start) * 1000:.2f}")
# Dynamically adjust sleep time to maintain the desired control frequency
time.sleep(max(0, self.config.environment_dt - (time.perf_counter() - control_loop_start)))
return _captured_observation, _performed_action
@draccus.wrap()
def async_client(cfg: RobotClientConfig):
logging.info(pformat(asdict(cfg)))
# TODO: Assert if checking robot support is still needed with the plugin system
# if cfg.robot.type not in SUPPORTED_ROBOTS:
# raise ValueError(f"Robot {cfg.robot.type} not yet supported!")
client = RobotClient(cfg)
if client.start():
client.logger.info("Starting action receiver thread...")
# Create and start action receiver thread
action_receiver_thread = threading.Thread(target=client.receive_actions, daemon=True)
# Start action receiver thread
action_receiver_thread.start()
try:
# The main thread runs the control loop
client.control_loop(task=cfg.task)
finally:
client.stop()
action_receiver_thread.join()
if cfg.debug_visualize_queue_size:
visualize_action_queue_size(client.action_queue_size)
client.logger.info("Client stopped")
if __name__ == "__main__":
register_third_party_plugins()
async_client() # run the client
+3 -9
View File
@@ -199,13 +199,12 @@ class OpenCVCamera(Camera):
DeviceNotConnectedError: If the camera is not connected.
"""
# Set FOURCC first (if specified) as it can affect available FPS/resolution options
if self.config.fourcc is not None:
self._validate_fourcc()
if self.videocapture is None:
raise DeviceNotConnectedError(f"{self} videocapture is not initialized")
set_fourcc_after_size_and_fps = platform.system() == "Windows"
if self.config.fourcc is not None and not set_fourcc_after_size_and_fps:
self._validate_fourcc()
default_width = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_WIDTH)))
default_height = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
@@ -223,11 +222,6 @@ class OpenCVCamera(Camera):
else:
self._validate_fps()
if self.config.fourcc is not None and set_fourcc_after_size_and_fps:
# On Windows with DSHOW, changing the resolution can silently override the FOURCC setting.
# Set FOURCC last to make sure the requested pixel format is actually enforced.
self._validate_fourcc()
def _validate_fps(self) -> None:
"""Validates and sets the camera's frames per second (FPS)."""
@@ -17,7 +17,6 @@ Provides the RealSenseCamera class for capturing frames from Intel RealSense cam
"""
import logging
import sys
import time
from threading import Event, Lock, Thread
from typing import TYPE_CHECKING, Any
@@ -42,7 +41,6 @@ from ..utils import get_cv2_rotation
from .configuration_realsense import RealSenseCameraConfig
logger = logging.getLogger(__name__)
pkg_name = "pyrealsense2-macosx" if sys.platform == "darwin" else "pyrealsense2"
class RealSenseCamera(Camera):
@@ -116,7 +114,7 @@ class RealSenseCamera(Camera):
Args:
config: The configuration settings for the camera.
"""
require_package(pkg_name, extra="intelrealsense", import_name="pyrealsense2")
require_package("pyrealsense2", extra="intelrealsense")
super().__init__(config)
self.config = config
-70
View File
@@ -18,7 +18,6 @@ from __future__ import annotations
# Utilities
########################################################################################
import logging
import time
import traceback
from contextlib import nullcontext
from copy import copy
@@ -244,72 +243,3 @@ def sanity_check_dataset_robot_compatibility(
raise ValueError(
"Dataset metadata compatibility check failed with mismatches:\n" + "\n".join(mismatches)
)
########################################################################################
# Teleoperator smooth handover helpers
# NOTE(Maxime): These functions use minimal type hints to maintain compatibility with utils
# being a root module.
########################################################################################
def teleop_supports_feedback(teleop) -> bool:
"""Return True when the teleop can receive position feedback (is actuated).
Actuated teleops (e.g. SO-101, OpenArmMini) have non-empty ``feedback_features``
and expose ``enable_torque`` / ``disable_torque`` motor-control methods.
TODO(Maxime): See if it is possible to unify this interface across teleops instead of duck-typing.
"""
return (
bool(teleop.feedback_features)
and hasattr(teleop, "disable_torque")
and hasattr(teleop, "enable_torque")
)
def teleop_smooth_move_to(teleop, target_pos: dict, duration_s: float = 2.0, fps: int = 30) -> None:
"""Smoothly move an actuated teleop to ``target_pos`` via linear interpolation.
Requires the teleoperator to support feedback (i.e. have non-empty
``feedback_features`` and implement ``disable_torque`` / ``enable_torque``).
``target_pos`` is expected to be in the teleop's action/feedback key space.
For homogeneous setups (e.g. SO-101 leader + SO-101 follower) this matches
the robot action key space directly.
TODO(Maxime): This blocks up to ``duration_s`` seconds; during this time the
follower robot does not receive new actions, which could be an issue on LeKiwi.
"""
teleop.enable_torque()
current = teleop.get_action()
steps = max(int(duration_s * fps), 1)
for step in range(steps + 1):
t = step / steps
interp = {
k: current[k] * (1 - t) + target_pos[k] * t if k in target_pos else current[k] for k in current
}
teleop.send_feedback(interp)
time.sleep(1 / fps)
def follower_smooth_move_to(
robot, current: dict, target: dict, duration_s: float = 1.0, fps: int = 30
) -> None:
"""Smoothly move the follower robot from ``current`` to ``target`` action.
Used when the teleop is non-actuated: instead of driving the leader arm to
the follower, the follower is brought to the teleop's current pose so the
robot meets the operator's hand rather than jumping to it on the first frame.
Both ``current`` and ``target`` must be in the robot action key space
(i.e. the output of ``robot_action_processor``).
"""
steps = max(int(duration_s * fps), 1)
for step in range(steps + 1):
t = step / steps
interp = {k: current[k] * (1 - t) + target[k] * t if k in target else current[k] for k in current}
robot.send_action(interp)
time.sleep(1 / fps)
-1
View File
@@ -99,7 +99,6 @@ def save_checkpoint(
optimizer (Optimizer | None, optional): The optimizer to save the state from. Defaults to None.
scheduler (LRScheduler | None, optional): The scheduler to save the state from. Defaults to None.
preprocessor: The preprocessor/pipeline to save. Defaults to None.
postprocessor: The postprocessor/pipeline to save. Defaults to None.
"""
pretrained_dir = checkpoint_dir / PRETRAINED_MODEL_DIR
policy.save_pretrained(pretrained_dir)
+1 -5
View File
@@ -41,12 +41,8 @@ def cfg_to_group(
return tag
return tag[:max_tag_length]
if cfg.is_reward_model_training:
trainable_tag = f"reward_model:{cfg.reward_model.type}"
else:
trainable_tag = f"policy:{cfg.policy.type}"
lst = [
trainable_tag,
f"policy:{cfg.policy.type}",
f"seed:{cfg.seed}",
]
if cfg.dataset is not None:
-16
View File
@@ -24,7 +24,6 @@ Import them directly: ``from lerobot.configs.train import TrainPipelineConfig``
from .dataset import DatasetRecordConfig
from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig
from .policies import PreTrainedConfig
from .recipe import MessageTurn, TrainingRecipe, load_recipe
from .types import (
FeatureType,
NormalizationMode,
@@ -32,12 +31,6 @@ from .types import (
PolicyFeature,
RTCAttentionSchedule,
)
from .video import (
VALID_VIDEO_CODECS,
VIDEO_ENCODER_INFO_KEYS,
VideoEncoderConfig,
camera_encoder_defaults,
)
__all__ = [
# Types
@@ -50,16 +43,7 @@ __all__ = [
"DatasetRecordConfig",
"DatasetConfig",
"EvalConfig",
"MessageTurn",
"PeftConfig",
"PreTrainedConfig",
"TrainingRecipe",
"WandBConfig",
"load_recipe",
"VideoEncoderConfig",
# Defaults
"camera_encoder_defaults",
# Constants
"VALID_VIDEO_CODECS",
"VIDEO_ENCODER_INFO_KEYS",
]
+7 -8
View File
@@ -14,12 +14,10 @@
"""Shared dataset recording configuration used by both ``lerobot-record`` and ``lerobot-rollout``."""
from dataclasses import dataclass, field
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from .video import VideoEncoderConfig, camera_encoder_defaults
@dataclass
class DatasetRecordConfig:
@@ -41,8 +39,8 @@ class DatasetRecordConfig:
video: bool = True
# Upload dataset to Hugging Face hub.
push_to_hub: bool = True
# If True, upload as private; if None, defer to the org default on the Hub (only affects orgs).
private: bool | None = None
# Upload on private repository on the Hugging Face hub.
private: bool = False
# Add tags to your dataset on the hub.
tags: list[str] | None = None
# Number of subprocesses handling the saving of frames as PNG. Set to 0 to use threads only;
@@ -57,9 +55,10 @@ class DatasetRecordConfig:
# Number of episodes to record before batch encoding videos
# Set to 1 for immediate encoding (default behavior), or higher for batched encoding
video_encoding_batch_size: int = 1
# Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys,
# e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``).
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
# Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1', 'auto',
# or hardware-specific: 'h264_videotoolbox', 'h264_nvenc', 'h264_vaapi', 'h264_qsv'.
# Use 'auto' to auto-detect the best available hardware encoder.
vcodec: str = "libsvtav1"
# Enable streaming video encoding: encode frames in real-time during capture instead
# of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
streaming_encoding: bool = False
+2 -8
View File
@@ -17,7 +17,7 @@
from dataclasses import dataclass, field
from lerobot.transforms import ImageTransformsConfig
from lerobot.utils.import_utils import get_safe_default_video_backend
from lerobot.utils.import_utils import get_safe_default_codec
@dataclass
@@ -34,7 +34,7 @@ class DatasetConfig:
image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
revision: str | None = None
use_imagenet_stats: bool = True
video_backend: str = field(default_factory=get_safe_default_video_backend)
video_backend: str = field(default_factory=get_safe_default_codec)
# When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0).
# This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion.
return_uint8: bool = False
@@ -117,9 +117,3 @@ class PeftConfig:
# the rank used for the adapter. In general a higher rank means more trainable parameters and closer to full
# fine-tuning.
r: int = 16
# Alpha parameter for LoRA scaling (scaling = lora_alpha / r).
# In general, a higher alpha means stronger adaptation signal.
# If None, the PEFT library defaults to alpha=8, which may dampen high-rank adapters.
# Common values are r (alpha == rank) or 2*r.
lora_alpha: int | None = None
+3 -6
View File
@@ -18,8 +18,8 @@ from logging import getLogger
from pathlib import Path
from lerobot import envs, policies # noqa: F401
from lerobot.configs import parser
from . import parser
from .default import EvalConfig
from .policies import PreTrainedConfig
@@ -46,11 +46,8 @@ class EvalPipelineConfig:
# HACK: We parse again the cli args here to get the pretrained path if there was one.
policy_path = parser.get_path_arg("policy")
if policy_path:
yaml_overrides = parser.get_yaml_overrides("policy")
cli_overrides = parser.get_cli_overrides("policy") or []
self.policy = PreTrainedConfig.from_pretrained(
policy_path, cli_overrides=yaml_overrides + cli_overrides
)
cli_overrides = parser.get_cli_overrides("policy")
self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
self.policy.pretrained_path = Path(policy_path)
else:
+2 -89
View File
@@ -13,10 +13,8 @@
# limitations under the License.
import importlib
import inspect
import json
import pkgutil
import sys
import tempfile
from argparse import ArgumentError
from collections.abc import Callable, Iterable, Sequence
from functools import wraps
@@ -26,7 +24,6 @@ from types import ModuleType
from typing import Any, TypeVar, cast
import draccus
import yaml # type: ignore[import-untyped]
from lerobot.utils.utils import has_method
@@ -35,29 +32,6 @@ F = TypeVar("F", bound=Callable[..., object])
PATH_KEY = "path"
PLUGIN_DISCOVERY_SUFFIX = "discover_packages_path"
# Storage for path args extracted from YAML/JSON config files, so that
# get_path_arg() can find them even when they weren't passed via CLI.
_config_path_args: dict[str, str] = {}
# Storage for non-path YAML overrides so validate() can pass them to from_pretrained.
_config_yaml_overrides: dict[str, list[str]] = {}
def _flatten_to_cli_args(d: dict, prefix: str = "") -> list[str]:
"""Recursively flatten a nested dict to CLI-style args (e.g. {"lr": 1e-4} -> ["--lr=0.0001"])."""
args = []
for key, value in d.items():
if key in (PATH_KEY, draccus.CHOICE_TYPE_KEY):
continue
full_key = f"{prefix}.{key}" if prefix else key
if isinstance(value, bool):
value = str(value).lower()
if isinstance(value, dict):
args.extend(_flatten_to_cli_args(value, full_key))
elif value is not None and not isinstance(value, list):
args.append(f"--{full_key}={value}")
return args
def get_cli_overrides(field_name: str, args: Sequence[str] | None = None) -> list[str] | None:
"""Parses arguments from cli at a given nested attribute level.
@@ -171,14 +145,7 @@ def load_plugin(plugin_path: str) -> None:
def get_path_arg(field_name: str, args: Sequence[str] | None = None) -> str | None:
result = parse_arg(f"{field_name}.{PATH_KEY}", args)
if result is None:
result = _config_path_args.get(field_name)
return result
def get_yaml_overrides(field_name: str) -> list[str]:
return _config_yaml_overrides.get(field_name, [])
return parse_arg(f"{field_name}.{PATH_KEY}", args)
def get_type_arg(field_name: str, args: Sequence[str] | None = None) -> str | None:
@@ -225,51 +192,6 @@ def filter_path_args(fields_to_filter: str | list[str], args: Sequence[str] | No
return filtered_args
def extract_path_fields_from_config(config_path: str, path_fields: list[str]) -> str:
"""Extract `path` fields from a YAML/JSON config before draccus processes it.
When a user specifies e.g. ``policy.path: lerobot/smolvla_base`` in a YAML config,
draccus will fail because ``path`` is not a valid field on policy config classes.
This function extracts those path values, stores them in ``_config_path_args`` for
later retrieval by ``get_path_arg()``, and returns a cleaned temp config file path.
"""
config_file = Path(config_path)
suffix = config_file.suffix.lower()
if suffix in (".yaml", ".yml"):
with open(config_file) as f:
config_data = yaml.safe_load(f)
elif suffix == ".json":
with open(config_file) as f:
config_data = json.load(f)
else:
return config_path
if not isinstance(config_data, dict):
return config_path
modified = False
for field in path_fields:
if field in config_data and isinstance(config_data[field], dict) and PATH_KEY in config_data[field]:
_config_path_args[field] = str(config_data[field].pop(PATH_KEY))
remaining = config_data[field]
if remaining:
_config_yaml_overrides[field] = _flatten_to_cli_args(remaining)
del config_data[field]
modified = True
if not modified:
return config_path
# Write cleaned config to a temp file
with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False) as tmp:
if suffix in (".yaml", ".yml"):
yaml.dump(config_data, tmp, default_flow_style=False)
else:
json.dump(config_data, tmp, indent=2)
return tmp.name
def wrap(config_path: Path | None = None) -> Callable[[F], F]:
"""
HACK: Similar to draccus.wrap but does three additional things:
@@ -303,20 +225,11 @@ def wrap(config_path: Path | None = None) -> Callable[[F], F]:
if has_method(argtype, "__get_path_fields__"):
path_fields = argtype.__get_path_fields__()
cli_args = filter_path_args(path_fields, cli_args)
# Also extract path fields from the YAML/JSON config file
if config_path_cli:
config_path_cli = extract_path_fields_from_config(config_path_cli, path_fields)
if has_method(argtype, "from_pretrained") and config_path_cli:
cli_args = filter_arg("config_path", cli_args)
cfg = argtype.from_pretrained(config_path_cli, cli_args=cli_args)
else:
if config_path_cli:
cli_args = filter_arg("config_path", cli_args)
cfg = draccus.parse(
config_class=argtype,
config_path=config_path_cli or config_path,
args=cli_args,
)
cfg = draccus.parse(config_class=argtype, config_path=config_path, args=cli_args)
response = fn(cfg, *args, **kwargs)
return response
-206
View File
@@ -1,206 +0,0 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Literal, get_args
MessageRole = Literal["user", "assistant", "system", "tool"]
MessageStream = Literal["high_level", "low_level"]
DEFAULT_BINDINGS = {
"subtask": "active_at(t, style=subtask)",
"memory": "active_at(t, style=memory)",
"plan": "active_at(t, style=plan)",
"speech": "emitted_at(t, role=assistant, tool_name=say)",
"interjection": "emitted_at(t, style=interjection)",
"vqa": "emitted_at(t, style=vqa, role=assistant)",
"vqa_query": "emitted_at(t, style=vqa, role=user)",
}
PLACEHOLDER_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
"""``${name}`` placeholder pattern used by both recipe binding-reference
discovery (here) and rendered-message substitution (in ``language_render``)."""
_VALID_ROLES = frozenset(get_args(MessageRole))
_VALID_STREAMS = frozenset(get_args(MessageStream))
@dataclass
class MessageTurn:
"""A single chat-style turn in a recipe template.
``content`` may be a plain string, a list of HF-style multimodal blocks, or
``None`` when ``tool_calls_from`` supplies tool-call payloads instead.
``stream`` tags the turn for downstream filtering, ``target`` flags it as a
training target, and ``if_present`` skips the turn when the named binding
resolves to ``None``.
"""
role: MessageRole
content: str | list[dict[str, Any]] | None = None
stream: MessageStream | None = None
target: bool = False
if_present: str | None = None
tool_calls_from: str | None = None
def __post_init__(self) -> None:
"""Validate role, stream, and content after dataclass construction."""
if self.role not in _VALID_ROLES:
raise ValueError(f"Unsupported message role: {self.role!r}")
# ``stream`` is typed Optional only so the dataclass can keep its
# field ordering, but recipes must always tag every turn with a
# stream — the renderer's ``_validate_rendered`` would reject
# ``None`` later on. Fail at construction so the bad recipe is
# caught at YAML load time rather than at the first sample.
if self.stream is None:
raise ValueError(
f"MessageTurn(role={self.role!r}) is missing a stream — "
f"every turn must declare one of {sorted(_VALID_STREAMS)}."
)
if self.stream not in _VALID_STREAMS:
raise ValueError(f"Unsupported message stream: {self.stream!r}")
if self.content is None and self.tool_calls_from is None:
raise ValueError("MessageTurn.content is required unless tool_calls_from is set.")
if self.content is not None and not isinstance(self.content, (str, list)):
raise TypeError("MessageTurn.content must be a string, a list of HF-style blocks, or None.")
if isinstance(self.content, list):
for block in self.content:
if not isinstance(block, dict) or "type" not in block:
raise ValueError(
"Multimodal content blocks must be HF-style dictionaries with a type key."
)
@classmethod
def from_dict(cls, data: dict[str, Any]) -> MessageTurn:
"""Construct a :class:`MessageTurn` from a plain dictionary."""
return cls(**data)
@dataclass
class TrainingRecipe:
"""A recipe describing how to render training samples from language rows.
A recipe is either a *message recipe* (``messages`` plus optional
``bindings``) or a *blend recipe* (``blend`` mapping names to weighted
sub-recipes). ``weight`` is only meaningful inside a blend.
"""
messages: list[MessageTurn] | None = None
bindings: dict[str, str] | None = None
blend: dict[str, TrainingRecipe] | None = None
weight: float | None = None
def __post_init__(self) -> None:
"""Validate that exactly one of ``messages`` or ``blend`` is set."""
if self.messages is not None and self.blend is not None:
raise ValueError("TrainingRecipe must set only one of messages or blend.")
if self.messages is None and self.blend is None:
raise ValueError("TrainingRecipe must set one of messages or blend.")
if self.messages is not None:
self._validate_message_recipe()
if self.blend is not None:
self._validate_blend_recipe()
@classmethod
def from_dict(cls, data: dict[str, Any]) -> TrainingRecipe:
"""Construct a :class:`TrainingRecipe` from a nested dictionary."""
data = dict(data)
if data.get("messages") is not None:
data["messages"] = [
turn if isinstance(turn, MessageTurn) else MessageTurn.from_dict(turn)
for turn in data["messages"]
]
if data.get("blend") is not None:
data["blend"] = {
name: recipe if isinstance(recipe, TrainingRecipe) else cls.from_dict(recipe)
for name, recipe in data["blend"].items()
}
return cls(**data)
@classmethod
def from_yaml(cls, path: str | Path) -> TrainingRecipe:
"""Load a :class:`TrainingRecipe` from a YAML file at ``path``."""
import yaml # type: ignore[import-untyped]
with open(path) as f:
data = yaml.safe_load(f)
if not isinstance(data, dict):
raise ValueError(f"Recipe YAML must contain a mapping at the top level: {path}")
return cls.from_dict(data)
def _validate_message_recipe(self) -> None:
"""Ensure every templated binding is known and at least one turn is a target."""
assert self.messages is not None
known_bindings = set(DEFAULT_BINDINGS) | set(self.bindings or {}) | {"task"}
for turn in self.messages:
missing = self._referenced_bindings(turn) - known_bindings
if missing:
raise ValueError(f"MessageTurn references unknown binding(s): {sorted(missing)}")
if not any(turn.target for turn in self.messages):
raise ValueError("Message recipes must contain at least one target turn.")
def _validate_blend_recipe(self) -> None:
"""Ensure each blend component is a non-empty, weighted message recipe."""
assert self.blend is not None
if not self.blend:
raise ValueError("Blend recipes must contain at least one component.")
for name, recipe in self.blend.items():
if recipe.blend is not None:
raise ValueError(f"Blend component {name!r} cannot itself define a blend.")
if recipe.messages is None:
raise ValueError(f"Blend component {name!r} must define messages.")
if recipe.weight is None:
raise ValueError(f"Blend component {name!r} must define weight.")
if recipe.weight <= 0:
raise ValueError(f"Blend component {name!r} must have a positive weight.")
def _referenced_bindings(self, turn: MessageTurn) -> set[str]:
"""Return the binding names that ``turn`` references via placeholders or attributes."""
names: set[str] = set()
if turn.if_present is not None:
names.add(turn.if_present)
if turn.tool_calls_from is not None:
names.add(turn.tool_calls_from)
names.update(_placeholders_in_content(turn.content))
return names
def _placeholders_in_content(content: str | list[dict[str, Any]] | None) -> set[str]:
"""Return the set of ``${name}`` placeholders found anywhere in ``content``."""
if content is None:
return set()
if isinstance(content, str):
return set(PLACEHOLDER_RE.findall(content))
names: set[str] = set()
for block in content:
for value in block.values():
if isinstance(value, str):
names.update(PLACEHOLDER_RE.findall(value))
return names
def load_recipe(path: str | Path) -> TrainingRecipe:
"""Load a :class:`TrainingRecipe` from a YAML file at ``path``."""
return TrainingRecipe.from_yaml(path)
-164
View File
@@ -1,164 +0,0 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import builtins
import json
import logging
import os
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, TypeVar
import draccus
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import CONFIG_NAME
from huggingface_hub.errors import HfHubHTTPError
from lerobot.optim.optimizers import OptimizerConfig
from lerobot.optim.schedulers import LRSchedulerConfig
from lerobot.utils.device_utils import auto_select_torch_device, is_torch_device_available
from lerobot.utils.hub import HubMixin
from .types import PolicyFeature
T = TypeVar("T", bound="RewardModelConfig")
logger = logging.getLogger(__name__)
@dataclass
class RewardModelConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
"""Base configuration for reward models.
Args:
input_features: A dictionary defining the PolicyFeature of the input data for the reward. The key represents
the input data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
output_features: A dictionary defining the PolicyFeature of the output data for the reward. The key represents
the output data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
"""
# Reuses PolicyFeature
input_features: dict[str, PolicyFeature] = field(default_factory=dict)
output_features: dict[str, PolicyFeature] = field(default_factory=dict)
device: str | None = None
pretrained_path: str | None = None
push_to_hub: bool = False
repo_id: str | None = None
# Hub metadata
license: str | None = None
tags: list[str] | None = None
private: bool | None = None
def __post_init__(self) -> None:
if not self.device or not is_torch_device_available(self.device):
auto_device = auto_select_torch_device()
logger.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.")
self.device = auto_device.type
@property
def type(self) -> str:
choice_name = self.get_choice_name(self.__class__)
if not isinstance(choice_name, str):
raise TypeError(f"Expected string from get_choice_name, got {type(choice_name)}")
return choice_name
@property
def observation_delta_indices(self) -> list | None: # type: ignore[type-arg]
return None
@property
def action_delta_indices(self) -> list | None: # type: ignore[type-arg]
return None
@property
def reward_delta_indices(self) -> list | None: # type: ignore[type-arg]
return None
def get_optimizer_preset(self) -> OptimizerConfig | None:
"""Default optimizer for this reward model, or ``None`` for zero-shot models."""
return None
def get_scheduler_preset(self) -> LRSchedulerConfig | None:
return None
def validate_features(self) -> None:
pass
def _save_pretrained(self, save_directory: Path) -> None:
with open(save_directory / CONFIG_NAME, "w") as f, draccus.config_type("json"):
draccus.dump(self, f, indent=4)
@classmethod
def from_pretrained(
cls: builtins.type[T],
pretrained_name_or_path: str | Path,
*,
force_download: bool = False,
resume_download: bool | None = None,
proxies: dict[Any, Any] | None = None,
token: str | bool | None = None,
cache_dir: str | Path | None = None,
local_files_only: bool = False,
revision: str | None = None,
**reward_kwargs: Any,
) -> T:
model_id = str(pretrained_name_or_path)
config_file: str | None = None
if Path(model_id).is_dir():
if CONFIG_NAME in os.listdir(model_id):
config_file = os.path.join(model_id, CONFIG_NAME)
else:
logger.error(f"{CONFIG_NAME} not found in {Path(model_id).resolve()}")
else:
try:
config_file = hf_hub_download(
repo_id=model_id,
filename=CONFIG_NAME,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
token=token,
local_files_only=local_files_only,
)
except HfHubHTTPError as e:
raise FileNotFoundError(
f"{CONFIG_NAME} not found on the HuggingFace Hub in {model_id}"
) from e
if config_file is None:
raise FileNotFoundError(f"{CONFIG_NAME} not found in {model_id}")
# HACK: Parse the original config to get the config subclass, so that we can
# apply cli overrides.
with draccus.config_type("json"):
orig_config = draccus.parse(cls, config_file, args=[])
with open(config_file) as f:
config = json.load(f)
config.pop("type", None)
with tempfile.NamedTemporaryFile("w+", delete=False, suffix=".json") as f:
json.dump(config, f)
config_file = f.name
cli_overrides = reward_kwargs.pop("cli_overrides", [])
with draccus.config_type("json"):
return draccus.parse(orig_config.__class__, config_file, args=cli_overrides)
+39 -101
View File
@@ -13,9 +13,7 @@
# limitations under the License.
import builtins
import datetime as dt
import json
import os
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@@ -25,60 +23,21 @@ from huggingface_hub import hf_hub_download
from huggingface_hub.errors import HfHubHTTPError
from lerobot import envs
from lerobot.configs import parser
from lerobot.optim import LRSchedulerConfig, OptimizerConfig
from lerobot.utils.hub import HubMixin
from lerobot.utils.sample_weighting import SampleWeightingConfig
from . import parser
from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig
from .policies import PreTrainedConfig
from .rewards import RewardModelConfig
TRAIN_CONFIG_NAME = "train_config.json"
def _migrate_legacy_rabc_fields(config: dict[str, Any]) -> dict[str, Any] | None:
"""Return migrated payload for legacy RA-BC fields, or None when no migration is needed."""
legacy_fields = (
"use_rabc",
"rabc_progress_path",
"rabc_kappa",
"rabc_epsilon",
"rabc_head_mode",
)
if not any(key in config for key in legacy_fields):
return None
migrated_config = dict(config)
use_rabc = bool(migrated_config.pop("use_rabc", False))
rabc_progress_path = migrated_config.pop("rabc_progress_path", None)
rabc_kappa = migrated_config.pop("rabc_kappa", None)
rabc_epsilon = migrated_config.pop("rabc_epsilon", None)
rabc_head_mode = migrated_config.pop("rabc_head_mode", None)
# New configs may already define sample_weighting explicitly. In that case,
# legacy fields are ignored after being stripped from the payload.
if migrated_config.get("sample_weighting") is None and use_rabc:
sample_weighting: dict[str, Any] = {"type": "rabc"}
if rabc_progress_path is not None:
sample_weighting["progress_path"] = rabc_progress_path
if rabc_kappa is not None:
sample_weighting["kappa"] = rabc_kappa
if rabc_epsilon is not None:
sample_weighting["epsilon"] = rabc_epsilon
if rabc_head_mode is not None:
sample_weighting["head_mode"] = rabc_head_mode
migrated_config["sample_weighting"] = sample_weighting
return migrated_config
@dataclass
class TrainPipelineConfig(HubMixin):
dataset: DatasetConfig
env: envs.EnvConfig | None = None
policy: PreTrainedConfig | None = None
reward_model: RewardModelConfig | None = None
# Set `dir` to where you would like to save all of the run outputs. If you run another training session
# with the same value for `dir` its contents will be overwritten unless you set `resume` to true.
output_dir: Path | None = None
@@ -113,44 +72,27 @@ class TrainPipelineConfig(HubMixin):
wandb: WandBConfig = field(default_factory=WandBConfig)
peft: PeftConfig | None = None
# Sample weighting configuration (e.g., for RA-BC training)
sample_weighting: SampleWeightingConfig | None = None
# RA-BC (Reward-Aligned Behavior Cloning) parameters
use_rabc: bool = False # Enable reward-weighted training
rabc_progress_path: str | None = None # Path to precomputed SARM progress parquet file
rabc_kappa: float = 0.01 # Hard threshold for high-quality samples
rabc_epsilon: float = 1e-6 # Small constant for numerical stability
rabc_head_mode: str | None = "sparse" # For dual-head models: "sparse" or "dense"
# Rename map for the observation to override the image and state keys
rename_map: dict[str, str] = field(default_factory=dict)
checkpoint_path: Path | None = field(init=False, default=None)
@property
def is_reward_model_training(self) -> bool:
"""True when the config targets a reward model rather than a policy."""
return self.reward_model is not None
@property
def trainable_config(self) -> PreTrainedConfig | RewardModelConfig:
"""Return whichever config (policy or reward_model) is active."""
if self.is_reward_model_training:
return self.reward_model # type: ignore[return-value]
return self.policy # type: ignore[return-value]
def validate(self) -> None:
# HACK: We parse again the cli args here to get the pretrained paths if there was some.
policy_path = parser.get_path_arg("policy")
reward_model_path = parser.get_path_arg("reward_model")
if reward_model_path:
cli_overrides = parser.get_cli_overrides("reward_model")
self.reward_model = RewardModelConfig.from_pretrained(
reward_model_path, cli_overrides=cli_overrides
)
self.reward_model.pretrained_path = str(Path(reward_model_path))
elif policy_path:
yaml_overrides = parser.get_yaml_overrides("policy")
cli_overrides = parser.get_cli_overrides("policy") or []
self.policy = PreTrainedConfig.from_pretrained(
policy_path, cli_overrides=yaml_overrides + cli_overrides
)
if policy_path:
# Only load the policy config
cli_overrides = parser.get_cli_overrides("policy")
self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
self.policy.pretrained_path = Path(policy_path)
elif self.resume:
# The entire train config is already loaded, we just need to get the checkpoint dir
config_path = parser.parse_arg("config_path")
if not config_path:
raise ValueError(
@@ -166,28 +108,18 @@ class TrainPipelineConfig(HubMixin):
policy_dir = Path(config_path).parent
if self.policy is not None:
self.policy.pretrained_path = policy_dir
if self.reward_model is not None:
self.reward_model.pretrained_path = str(policy_dir)
self.checkpoint_path = policy_dir.parent
if self.policy is None and self.reward_model is None:
if self.policy is None:
raise ValueError(
"Neither policy nor reward_model is configured. "
"Please specify one with `--policy.path` or `--reward_model.path`."
)
active_cfg = self.trainable_config
if self.rename_map and active_cfg.pretrained_path is None:
raise ValueError(
"`rename_map` requires a pretrained policy checkpoint. "
"Fresh initialization derives feature names from the current dataset, so no rename is applied."
"Policy is not configured. Please specify a pretrained policy with `--policy.path`."
)
if not self.job_name:
if self.env is None:
self.job_name = f"{active_cfg.type}"
self.job_name = f"{self.policy.type}"
else:
self.job_name = f"{self.env.type}_{active_cfg.type}"
self.job_name = f"{self.env.type}_{self.policy.type}"
if not self.resume and isinstance(self.output_dir, Path) and self.output_dir.is_dir():
raise FileExistsError(
@@ -205,16 +137,26 @@ class TrainPipelineConfig(HubMixin):
if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
elif self.use_policy_training_preset and not self.resume:
self.optimizer = active_cfg.get_optimizer_preset()
self.scheduler = active_cfg.get_scheduler_preset()
self.optimizer = self.policy.get_optimizer_preset()
self.scheduler = self.policy.get_scheduler_preset()
if hasattr(active_cfg, "push_to_hub") and active_cfg.push_to_hub and not active_cfg.repo_id:
raise ValueError("'repo_id' argument missing. Please specify it to push the model to the hub.")
if self.policy.push_to_hub and not self.policy.repo_id:
raise ValueError(
"'policy.repo_id' argument missing. Please specify it to push the model to the hub."
)
if self.use_rabc and not self.rabc_progress_path:
# Auto-detect from dataset path
repo_id = self.dataset.repo_id
if self.dataset.root:
self.rabc_progress_path = str(Path(self.dataset.root) / "sarm_progress.parquet")
else:
self.rabc_progress_path = f"hf://datasets/{repo_id}/sarm_progress.parquet"
@classmethod
def __get_path_fields__(cls) -> list[str]:
"""Keys for draccus pretrained-path loading."""
return ["policy", "reward_model"]
"""This enables the parser to load config from the policy using `--policy.path=local/dir`"""
return ["policy"]
def to_dict(self) -> dict[str, Any]:
return draccus.encode(self) # type: ignore[no-any-return] # because of the third-party library draccus uses Any as the return type
@@ -265,16 +207,12 @@ class TrainPipelineConfig(HubMixin):
) from e
cli_args = kwargs.pop("cli_args", [])
# Legacy RA-BC migration only applies to framework-saved checkpoints (always JSON).
# Hand-written YAML/TOML configs are expected to use the current sample_weighting schema.
if config_file is not None and config_file.endswith(".json"):
with open(config_file) as f:
config = json.load(f)
migrated_config = _migrate_legacy_rabc_fields(config)
if migrated_config is not None:
with tempfile.NamedTemporaryFile("w+", delete=False, suffix=".json") as f:
json.dump(migrated_config, f)
config_file = f.name
with draccus.config_type("json"):
return draccus.parse(cls, config_file, args=cli_args)
@dataclass(kw_only=True)
class TrainRLServerPipelineConfig(TrainPipelineConfig):
# NOTE: In RL, we don't need an offline dataset
# TODO: Make `TrainPipelineConfig.dataset` optional
dataset: DatasetConfig | None = None # type: ignore[assignment] # because the parent class has made it's type non-optional
-235
View File
@@ -1,235 +0,0 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Note: We subclass str so that serialization is straightforward
# https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json
"""Video encoder configurations."""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any
from lerobot.utils.import_utils import require_package
logger = logging.getLogger(__name__)
# List of hardware encoders to probe for auto-selection. Availability depends on the platform and the chosen video backend.
# Determines the order of preference for auto-selection when vcodec="auto" is used.
HW_VIDEO_CODECS = [
"h264_videotoolbox", # macOS
"hevc_videotoolbox", # macOS
"h264_nvenc", # NVIDIA GPU
"hevc_nvenc", # NVIDIA GPU
"h264_vaapi", # Linux Intel/AMD
"h264_qsv", # Intel Quick Sync
]
VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
# Aliases for legacy video codec names.
VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"}
LIBSVTAV1_DEFAULT_PRESET: int = 12
# Keys persisted under ``features[*]["info"]`` as ``video.<name>`` (from :class:`VideoEncoderConfig`).
# ``vcodec``` and ``pix_fmt`` are derived from the video stream directly.
VIDEO_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset(
{"g", "crf", "preset", "fast_decode", "extra_options", "video_backend"}
)
VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset(
f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES
)
@dataclass
class VideoEncoderConfig:
"""Video encoder configuration.
Attributes:
vcodec: Video encoder name. ``"auto"`` is resolved during
construction (HW encoder if available, else ``libsvtav1``).
pix_fmt: Pixel format (e.g. ``"yuv420p"``).
g: GOP size (keyframe interval).
crf: Quality level — mapped to the native quality parameter of the
codec (``crf`` for software, ``qp`` for NVENC/VAAPI,
``q:v`` for VideoToolbox, ``global_quality`` for QSV).
preset: Speed/quality preset. Accepted type is per-codec.
fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2)
embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values
set ``tune=fastdecode``. Ignored for other codecs.
video_backend: Python to be used for encoding. Only ``"pyav"``
is currently supported.
extra_options: Free-form dictionary of additional video encoder options
(e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``).
"""
vcodec: str = "libsvtav1" # TODO(CarolinePascal): rename to codec ?
pix_fmt: str = "yuv420p"
g: int | None = 2
crf: int | float | None = 30
preset: int | str | None = None
fast_decode: int = 0
# TODO(CarolinePascal): add torchcodec support + find a way to unify the
# two backends (encoding and decoding).
video_backend: str = "pyav"
extra_options: dict[str, Any] = field(default_factory=dict)
def __post_init__(self) -> None:
self.resolve_vcodec()
# Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work".
if self.preset is None and self.vcodec == "libsvtav1":
self.preset = LIBSVTAV1_DEFAULT_PRESET
self.validate()
@classmethod
def from_video_info(cls, video_info: dict | None) -> VideoEncoderConfig:
"""Reconstruct a :class:`VideoEncoderConfig` from a video feature's ``info`` block.
Missing or ``None`` values fall back to the class defaults.
"""
video_info = video_info or {}
kwargs: dict[str, Any] = {}
for src_key, dst_field in (("video.codec", "vcodec"), ("video.pix_fmt", "pix_fmt")):
value = video_info.get(src_key)
if value is not None:
kwargs[dst_field] = value
for field_name in VIDEO_ENCODER_INFO_FIELD_NAMES:
value = video_info.get(f"video.{field_name}")
if value is None:
continue
# Persisted as ``{}`` after merges with disagreeing sources — treat as default.
if field_name == "extra_options" and not value:
continue
kwargs[field_name] = value
return cls(**kwargs)
def detect_available_encoders(self, encoders: list[str] | str) -> list[str]:
"""Return the subset of available encoders based on the specified video backend.
Args:
encoders: List of encoder names to detect. If a string, it is converted to a list.
Returns:
List of available encoder names. If the video backend is not "pyav", returns an empty list.
"""
if self.video_backend == "pyav":
require_package("av", extra="dataset")
from lerobot.datasets import detect_available_encoders_pyav
return detect_available_encoders_pyav(encoders)
return []
def validate(self) -> None:
"""Validate the video encoder configuration."""
if self.video_backend == "pyav":
require_package("av", extra="dataset")
from lerobot.datasets import check_video_encoder_parameters_pyav
check_video_encoder_parameters_pyav(self.vcodec, self.pix_fmt, self.get_codec_options())
def resolve_vcodec(self) -> None:
"""Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder.
For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the
resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``.
Stream-derived canonical codec names listed in :data:`VIDEO_CODECS_ALIASES` are
rewritten to their corresponding encoder name (e.g. ``"av1"`` → ``"libsvtav1"``).
"""
self.vcodec = VIDEO_CODECS_ALIASES.get(self.vcodec, self.vcodec)
if self.vcodec not in VALID_VIDEO_CODECS:
raise ValueError(f"Invalid vcodec '{self.vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
if self.vcodec == "auto":
available = self.detect_available_encoders(HW_VIDEO_CODECS)
for encoder in HW_VIDEO_CODECS:
if encoder in available:
logger.info(f"Auto-selected video codec: {encoder}")
self.vcodec = encoder
return
logger.warning("No hardware encoder available, falling back to software encoder 'libsvtav1'")
self.vcodec = "libsvtav1"
if self.detect_available_encoders(self.vcodec):
logger.info(f"Using video codec: {self.vcodec}")
return
raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}")
def get_codec_options(
self, encoder_threads: int | None = None, as_strings: bool = False
) -> dict[str, Any]:
"""Translate the tuning fields to codec-specific options.
``VideoEncoderConfig.extra_options`` are merged last but never override a structured field.
Args:
encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs.
For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``.
For h264/hevc, this is mapped to ``threads``.
Hardware encoders ignore this parameter.
as_strings: If ``True``, casts values to strings.
"""
opts: dict[str, Any] = {}
def set_if(key: str, value: Any) -> None:
if value is not None:
opts[key] = value if not as_strings else str(value)
# GOP size is not a codec-specific option, so it is always set.
set_if("g", self.g)
if self.vcodec == "libsvtav1":
set_if("crf", self.crf)
set_if("preset", self.preset)
svtav1_parts: list[str] = []
if self.fast_decode is not None:
svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}")
if encoder_threads is not None:
svtav1_parts.append(f"lp={encoder_threads}")
if svtav1_parts:
opts["svtav1-params"] = ":".join(svtav1_parts)
elif self.vcodec in ("h264", "hevc"):
set_if("crf", self.crf)
set_if("preset", self.preset)
if self.fast_decode:
opts["tune"] = "fastdecode"
set_if("threads", encoder_threads)
elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
if self.crf is not None:
opts["q:v"] = max(1, min(100, 100 - self.crf * 2))
elif self.vcodec in ("h264_nvenc", "hevc_nvenc"):
opts["rc"] = 0
set_if("qp", self.crf)
set_if("preset", self.preset)
elif self.vcodec == "h264_vaapi":
set_if("qp", self.crf)
elif self.vcodec == "h264_qsv":
set_if("global_quality", self.crf)
set_if("preset", self.preset)
else:
set_if("crf", self.crf)
set_if("preset", self.preset)
# Extra options are merged last but never override structured fields (values are kept as given).
for k, v in self.extra_options.items():
if k not in opts:
set_if(k, v)
return opts
def camera_encoder_defaults() -> VideoEncoderConfig:
"""Return a :class:`VideoEncoderConfig` with RGB-camera defaults."""
return VideoEncoderConfig()
-19
View File
@@ -31,25 +31,15 @@ from .dataset_tools import (
modify_features,
modify_tasks,
recompute_stats,
reencode_dataset,
remove_feature,
split_dataset,
)
from .factory import make_dataset, resolve_delta_timestamps
from .image_writer import safe_stop_image_writer
from .io_utils import load_episodes, write_stats
from .language import (
EVENT_ONLY_STYLES,
LANGUAGE_EVENTS,
LANGUAGE_PERSISTENT,
PERSISTENT_STYLES,
STYLE_REGISTRY,
column_for_style,
)
from .lerobot_dataset import LeRobotDataset
from .multi_dataset import MultiLeRobotDataset
from .pipeline_features import aggregate_pipeline_dataset_features, create_initial_features
from .pyav_utils import check_video_encoder_parameters_pyav, detect_available_encoders_pyav
from .sampler import EpisodeAwareSampler
from .streaming_dataset import StreamingLeRobotDataset
from .utils import DEFAULT_EPISODES_PATH, create_lerobot_dataset_card
@@ -63,19 +53,12 @@ __all__ = [
"CODEBASE_VERSION",
"DEFAULT_EPISODES_PATH",
"DEFAULT_QUANTILES",
"EVENT_ONLY_STYLES",
"EpisodeAwareSampler",
"LANGUAGE_EVENTS",
"LANGUAGE_PERSISTENT",
"LeRobotDataset",
"LeRobotDatasetMetadata",
"MultiLeRobotDataset",
"PERSISTENT_STYLES",
"STYLE_REGISTRY",
"StreamingLeRobotDataset",
"VideoEncodingManager",
"check_video_encoder_parameters_pyav",
"detect_available_encoders_pyav",
"add_features",
"aggregate_datasets",
"aggregate_pipeline_dataset_features",
@@ -83,7 +66,6 @@ __all__ = [
"convert_image_to_video_dataset",
"create_initial_features",
"create_lerobot_dataset_card",
"column_for_style",
"delete_episodes",
"get_feature_stats",
"load_episodes",
@@ -92,7 +74,6 @@ __all__ = [
"modify_features",
"modify_tasks",
"recompute_stats",
"reencode_dataset",
"remove_feature",
"resolve_delta_timestamps",
"safe_stop_image_writer",
+21 -65
View File
@@ -15,7 +15,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
import shutil
from pathlib import Path
@@ -24,11 +23,9 @@ import datasets
import pandas as pd
import tqdm
from lerobot.configs import VIDEO_ENCODER_INFO_KEYS
from .compute_stats import aggregate_stats
from .dataset_metadata import LeRobotDatasetMetadata
from .feature_utils import features_equal_for_merge, get_hf_features_from_features
from .feature_utils import get_hf_features_from_features
from .io_utils import (
get_file_size_in_mb,
get_parquet_file_size_in_mb,
@@ -49,54 +46,11 @@ from .utils import (
from .video_utils import concatenate_video_files, get_video_duration_in_s
def merge_video_feature_info_for_aggregate(all_metadata: list[LeRobotDatasetMetadata]) -> dict[str, dict]:
"""Create a merged video feature info dictionary for aggregation. The video encoder info is merged field-by-field: each key is kept only when every source agrees; otherwise that key is set to ``null`` (or ``{}`` for ``video.extra_options``) and a warning is logged.
Args:
all_metadata: List of LeRobotDatasetMetadata objects to merge.
Returns:
dict: A dictionary of merged video feature info.
"""
merged_info = copy.deepcopy(all_metadata[0].features)
video_keys = [k for k in merged_info if merged_info[k].get("dtype") == "video"]
for vk in video_keys:
video_infos = [m.features.get(vk, {}).get("info") or {} for m in all_metadata]
base_video_info = video_infos[0]
merged_encoder_info: dict = {}
fallback_keys: list[str] = []
for info_key in VIDEO_ENCODER_INFO_KEYS:
values = [info.get(info_key, None) for info in video_infos]
first_value = values[0]
all_match = all(v == first_value for v in values[1:])
if all_match:
merged_encoder_info[info_key] = first_value
else:
fallback_keys.append(info_key)
merged_encoder_info[info_key] = {} if info_key == "video.extra_options" else None
if fallback_keys:
logging.warning(
f"Merging heterogeneous or incomplete video encoder metadata for feature {vk}. "
f"Setting these keys to null: {fallback_keys}.",
)
merged_info[vk]["info"] = {**base_video_info, **merged_encoder_info}
# TODO(CarolinePascal): make this variable once we have support for other video backends.
merged_info[vk]["info"]["video.video_backend"] = "pyav"
return merged_info
def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]):
"""Validates that all dataset metadata have consistent properties.
Ensures all datasets have the same fps, robot_type, and features to guarantee
compatibility when aggregating them into a single dataset.
Video encoder info is not considered for validation but is merged during aggregation in ``merge_video_feature_info_for_aggregate``.
Args:
all_metadata: List of LeRobotDatasetMetadata objects to validate.
@@ -120,7 +74,7 @@ def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]):
raise ValueError(
f"Same robot_type is expected, but got robot_type={meta.robot_type} instead of {robot_type}."
)
if not features_equal_for_merge(features, meta.features):
if features != meta.features:
raise ValueError(
f"Same features is expected, but got features={meta.features} instead of {features}."
)
@@ -143,8 +97,8 @@ def update_data_df(df, src_meta, dst_meta):
pd.DataFrame: Updated DataFrame with adjusted indices.
"""
df["episode_index"] = df["episode_index"] + dst_meta.info.total_episodes
df["index"] = df["index"] + dst_meta.info.total_frames
df["episode_index"] = df["episode_index"] + dst_meta.info["total_episodes"]
df["index"] = df["index"] + dst_meta.info["total_frames"]
src_task_names = src_meta.tasks.index.take(df["task_index"].to_numpy())
df["task_index"] = dst_meta.tasks.loc[src_task_names, "task_index"].to_numpy()
@@ -271,9 +225,9 @@ def update_meta_data(
# Clean up temporary columns
df = df.drop(columns=["_orig_chunk", "_orig_file"])
df["dataset_from_index"] = df["dataset_from_index"] + dst_meta.info.total_frames
df["dataset_to_index"] = df["dataset_to_index"] + dst_meta.info.total_frames
df["episode_index"] = df["episode_index"] + dst_meta.info.total_episodes
df["dataset_from_index"] = df["dataset_from_index"] + dst_meta.info["total_frames"]
df["dataset_to_index"] = df["dataset_to_index"] + dst_meta.info["total_frames"]
df["episode_index"] = df["episode_index"] + dst_meta.info["total_episodes"]
return df
@@ -283,8 +237,8 @@ def aggregate_datasets(
aggr_repo_id: str,
roots: list[Path] | None = None,
aggr_root: Path | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
data_files_size_in_mb: float | None = None,
video_files_size_in_mb: float | None = None,
chunk_size: int | None = None,
):
"""Aggregates multiple LeRobot datasets into a single unified dataset.
@@ -320,8 +274,7 @@ def aggregate_datasets(
LeRobotDatasetMetadata(repo_id, root=root) for repo_id, root in zip(repo_ids, roots, strict=False)
]
)
fps, robot_type, _ = validate_all_metadata(all_metadata)
features = merge_video_feature_info_for_aggregate(all_metadata)
fps, robot_type, features = validate_all_metadata(all_metadata)
video_keys = [key for key in features if features[key]["dtype"] == "video"]
dst_meta = LeRobotDatasetMetadata.create(
@@ -360,8 +313,8 @@ def aggregate_datasets(
# to avoid interference between different source datasets
data_idx.pop("src_to_dst", None)
dst_meta.info.total_episodes += src_meta.total_episodes
dst_meta.info.total_frames += src_meta.total_frames
dst_meta.info["total_episodes"] += src_meta.total_episodes
dst_meta.info["total_frames"] += src_meta.total_frames
finalize_aggregation(dst_meta, all_metadata)
logging.info("Aggregation complete.")
@@ -379,6 +332,7 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
videos_idx: Dictionary tracking video chunk and file indices.
video_files_size_in_mb: Maximum size for video files in MB (defaults to DEFAULT_VIDEO_FILE_SIZE_IN_MB)
chunk_size: Maximum number of files per chunk (defaults to DEFAULT_CHUNK_SIZE)
Returns:
dict: Updated videos_idx with current chunk and file indices.
"""
@@ -460,11 +414,9 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
current_dst_duration = dst_file_durations.get(dst_key, 0)
videos_idx[key]["src_to_offset"][(src_chunk_idx, src_file_idx)] = current_dst_duration
videos_idx[key]["src_to_dst"][(src_chunk_idx, src_file_idx)] = dst_key
# TODO(CarolinePascal): Move the check before the loop to avoid failing in the middle + add possibility to re-encode the video if the check fails
concatenate_video_files(
[dst_path, src_path],
dst_path,
compatibility_check=True,
)
# Update duration of this destination file
dst_file_durations[dst_key] = current_dst_duration + src_duration
@@ -688,10 +640,14 @@ def finalize_aggregation(aggr_meta, all_metadata):
write_tasks(aggr_meta.tasks, aggr_meta.root)
logging.info("write info")
aggr_meta.info.total_tasks = len(aggr_meta.tasks)
aggr_meta.info.total_episodes = sum(m.total_episodes for m in all_metadata)
aggr_meta.info.total_frames = sum(m.total_frames for m in all_metadata)
aggr_meta.info.splits = {"train": f"0:{sum(m.total_episodes for m in all_metadata)}"}
aggr_meta.info.update(
{
"total_tasks": len(aggr_meta.tasks),
"total_episodes": sum(m.total_episodes for m in all_metadata),
"total_frames": sum(m.total_frames for m in all_metadata),
"splits": {"train": f"0:{sum(m.total_episodes for m in all_metadata)}"},
}
)
write_info(aggr_meta.info, aggr_meta.root)
logging.info("write stats")
+1 -1
View File
@@ -512,7 +512,7 @@ def compute_episode_stats(
ep_stats = {}
for key, data in episode_data.items():
if features[key]["dtype"] in {"string", "language"}:
if features[key]["dtype"] == "string":
continue
if features[key]["dtype"] in ["image", "video"]:
+28 -105
View File
@@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from collections.abc import Callable
from pathlib import Path
import numpy as np
@@ -24,7 +23,6 @@ import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub import snapshot_download
from lerobot.configs import VideoEncoderConfig
from lerobot.utils.constants import DEFAULT_FEATURES, HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE
from lerobot.utils.feature_utils import _validate_feature_names
from lerobot.utils.utils import flatten_dict
@@ -36,14 +34,16 @@ from .io_utils import (
load_episodes,
load_info,
load_stats,
load_subtasks,
load_tasks,
write_info,
write_json,
write_stats,
write_tasks,
)
from .language import DEFAULT_TOOLS, LANGUAGE_COLUMNS
from .utils import (
DEFAULT_EPISODES_PATH,
INFO_PATH,
check_version_compatibility,
get_safe_version,
has_legacy_hub_download_metadata,
@@ -177,6 +177,7 @@ class LeRobotDatasetMetadata:
self.info = load_info(self.root)
check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
self.tasks = load_tasks(self.root)
self.subtasks = load_subtasks(self.root)
self.episodes = load_episodes(self.root)
self.stats = load_stats(self.root)
@@ -190,29 +191,6 @@ class LeRobotDatasetMetadata:
if self.episodes is None:
self._load_metadata()
def filter_episodes(
self,
predicate: Callable[[dict], bool],
candidates: list[int] | None = None,
) -> list[int]:
"""Filter episodes whose metadata satisfies a given predicate.
Args:
predicate: Predicate over per-episode metadata rows used to select episodes.
candidates: Optional list of episode indices to restrict evaluation to.
Returns:
List of sorted episode indices that satisfy the predicate.
"""
self.ensure_readable()
if candidates is not None:
candidate_set = set(candidates)
combined = lambda ep: ep["episode_index"] in candidate_set and predicate(ep) # noqa: E731
else:
combined = predicate
filtered = self.episodes.filter(combined, keep_in_memory=True, load_from_cache_file=False)
return sorted(int(idx) for idx in filtered["episode_index"])
def _pull_from_repo(
self,
allow_patterns: list[str] | str | None = None,
@@ -250,7 +228,7 @@ class LeRobotDatasetMetadata:
@property
def _version(self) -> packaging.version.Version:
"""Codebase version used to create this dataset."""
return packaging.version.parse(self.info.codebase_version)
return packaging.version.parse(self.info["codebase_version"])
def get_data_file_path(self, ep_index: int) -> Path:
"""Return the relative parquet file path for the given episode index.
@@ -305,27 +283,27 @@ class LeRobotDatasetMetadata:
@property
def data_path(self) -> str:
"""Formattable string for the parquet files."""
return self.info.data_path
return self.info["data_path"]
@property
def video_path(self) -> str | None:
"""Formattable string for the video files."""
return self.info.video_path
return self.info["video_path"]
@property
def robot_type(self) -> str | None:
"""Robot type used in recording this dataset."""
return self.info.robot_type
return self.info["robot_type"]
@property
def fps(self) -> int:
"""Frames per second used during data collection."""
return self.info.fps
return self.info["fps"]
@property
def features(self) -> dict[str, dict]:
"""All features contained in the dataset."""
return self.info.features
return self.info["features"]
@property
def image_keys(self) -> list[str]:
@@ -342,49 +320,6 @@ class LeRobotDatasetMetadata:
"""Keys to access visual modalities (regardless of their storage method)."""
return [key for key, ft in self.features.items() if ft["dtype"] in ["video", "image"]]
@property
def has_language_columns(self) -> bool:
"""Return ``True`` if the dataset declares any language column.
Used to gate language-aware code paths (collate, render step) so
unannotated datasets keep PyTorch's default collate behavior.
"""
return any(col in self.features for col in LANGUAGE_COLUMNS)
@property
def tools(self) -> list[dict]:
"""OpenAI-style tool schemas declared by this dataset.
Read from ``meta/info.json["tools"]``. Returns a copy, so callers
can mutate the result safely. Falls back to
:data:`lerobot.datasets.language.DEFAULT_TOOLS` (the canonical
``say`` schema) when the dataset doesn't declare any — that way
unannotated datasets and chat-template consumers
(``apply_chat_template(messages, tools=meta.tools)``) keep
working out of the box.
Implementations live under :mod:`lerobot.tools` (one file per
tool); see ``docs/source/tools.mdx`` for the authoring guide.
"""
declared = self.info.tools
if declared:
return [dict(t) for t in declared]
return [dict(t) for t in DEFAULT_TOOLS]
@tools.setter
def tools(self, value: list[dict] | None) -> None:
"""Persist a tool catalog to ``meta/info.json`` and reload metadata.
Writes ``value`` into the on-disk ``info.json`` (or clears the
``tools`` key when ``value`` is ``None`` or empty), then reloads
``self.info`` so the in-memory metadata matches what's on disk.
Saves callers from hand-editing ``info.json`` and re-instantiating
the metadata object.
"""
self.info.tools = [dict(t) for t in value] if value else None
write_info(self.info, self.root)
self.info = load_info(self.root)
@property
def names(self) -> dict[str, list | dict]:
"""Names of the various dimensions of vector modalities."""
@@ -398,32 +333,32 @@ class LeRobotDatasetMetadata:
@property
def total_episodes(self) -> int:
"""Total number of episodes available."""
return self.info.total_episodes
return self.info["total_episodes"]
@property
def total_frames(self) -> int:
"""Total number of frames saved in this dataset."""
return self.info.total_frames
return self.info["total_frames"]
@property
def total_tasks(self) -> int:
"""Total number of different tasks performed in this dataset."""
return self.info.total_tasks
return self.info["total_tasks"]
@property
def chunks_size(self) -> int:
"""Max number of files per chunk."""
return self.info.chunks_size
return self.info["chunks_size"]
@property
def data_files_size_in_mb(self) -> int:
"""Max size of data file in mega bytes."""
return self.info.data_files_size_in_mb
return self.info["data_files_size_in_mb"]
@property
def video_files_size_in_mb(self) -> int:
"""Max size of video file in mega bytes."""
return self.info.video_files_size_in_mb
return self.info["video_files_size_in_mb"]
def get_task_index(self, task: str) -> int | None:
"""
@@ -567,33 +502,20 @@ class LeRobotDatasetMetadata:
self._save_episode_metadata(episode_dict)
# Update info
self.info.total_episodes += 1
self.info.total_frames += episode_length
self.info.total_tasks = len(self.tasks)
self.info.splits = {"train": f"0:{self.info.total_episodes}"}
self.info["total_episodes"] += 1
self.info["total_frames"] += episode_length
self.info["total_tasks"] = len(self.tasks)
self.info["splits"] = {"train": f"0:{self.info['total_episodes']}"}
write_info(self.info, self.root)
self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats
write_stats(self.stats, self.root)
def update_video_info(
self,
video_key: str | None = None,
camera_encoder: VideoEncoderConfig | None = None,
) -> None:
"""Populate per-feature video info in ``info.json``.
def update_video_info(self, video_key: str | None = None) -> None:
"""
Warning: this function writes info from first episode videos, implicitly assuming that all videos have
been encoded the same way. Also, this means it assumes the first episode exists.
Args:
video_key: If provided, only update this video key. Otherwise update
all video keys in the dataset.
camera_encoder: Encoder configuration used to produce the
videos. When provided, its fields are recorded as
``video.<field>`` entries alongside the stream-derived
``video.*`` entries (see :func:`get_video_info`).
"""
if video_key is not None and video_key not in self.video_keys:
raise ValueError(f"Video key {video_key} not found in dataset")
@@ -602,7 +524,7 @@ class LeRobotDatasetMetadata:
for key in video_keys:
if not self.features[key].get("info", None):
video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0)
self.info.features[key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder)
self.info["features"][key]["info"] = get_video_info(video_path)
def update_chunk_settings(
self,
@@ -624,17 +546,17 @@ class LeRobotDatasetMetadata:
if chunks_size is not None:
if chunks_size <= 0:
raise ValueError(f"chunks_size must be positive, got {chunks_size}")
self.info.chunks_size = chunks_size
self.info["chunks_size"] = chunks_size
if data_files_size_in_mb is not None:
if data_files_size_in_mb <= 0:
raise ValueError(f"data_files_size_in_mb must be positive, got {data_files_size_in_mb}")
self.info.data_files_size_in_mb = data_files_size_in_mb
self.info["data_files_size_in_mb"] = data_files_size_in_mb
if video_files_size_in_mb is not None:
if video_files_size_in_mb <= 0:
raise ValueError(f"video_files_size_in_mb must be positive, got {video_files_size_in_mb}")
self.info.video_files_size_in_mb = video_files_size_in_mb
self.info["video_files_size_in_mb"] = video_files_size_in_mb
# Update the info file on disk
write_info(self.info, self.root)
@@ -713,6 +635,7 @@ class LeRobotDatasetMetadata:
_validate_feature_names(features)
obj.tasks = None
obj.subtasks = None
obj.episodes = None
obj.stats = None
obj.info = create_empty_dataset_info(
@@ -730,7 +653,7 @@ class LeRobotDatasetMetadata:
f"Features contain video keys {obj.video_keys}, but 'use_videos' is set to False. "
"Either remove video features from the features dict, or set 'use_videos=True'."
)
write_info(obj.info, obj.root)
write_json(obj.info, obj.root / INFO_PATH)
obj.revision = None
obj._pq_writer = None
obj.latest_episode = None
+5
View File
@@ -295,4 +295,9 @@ class DatasetReader:
task_idx = item["task_index"].item()
item["task"] = self._meta.tasks.iloc[task_idx].name
# add subtask information if available
if "subtask_index" in self._meta.features and self._meta.subtasks is not None:
subtask_idx = item["subtask_index"].item()
item["subtask"] = self._meta.subtasks.iloc[subtask_idx].name
return item
+73 -146
View File
@@ -26,7 +26,7 @@ This module provides utilities for:
import logging
import shutil
from collections.abc import Callable
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import datasets
@@ -36,7 +36,6 @@ import pyarrow.parquet as pq
import torch
from tqdm import tqdm
from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults
from lerobot.utils.constants import ACTION, HF_LEROBOT_HOME, OBS_IMAGE, OBS_STATE
from lerobot.utils.utils import flatten_dict
@@ -61,14 +60,9 @@ from .utils import (
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_DATA_PATH,
DEFAULT_EPISODES_PATH,
VIDEO_DIR,
update_chunk_file_indices,
)
from .video_utils import (
encode_video_frames,
get_video_info,
reencode_video,
)
from .video_utils import encode_video_frames, get_video_info
def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict:
@@ -101,11 +95,6 @@ def delete_episodes(
) -> LeRobotDataset:
"""Delete episodes from a LeRobotDataset and create a new dataset.
Video segments that need re-encoding (because the source file mixes kept and
deleted episodes) are re-encoded with the source dataset's existing encoder
settings — read back from ``meta/info.json`` — so the output dataset stays
consistent with its own metadata.
Args:
dataset: The source LeRobotDataset.
episode_indices: List of episode indices to delete.
@@ -168,11 +157,6 @@ def split_dataset(
) -> dict[str, LeRobotDataset]:
"""Split a LeRobotDataset into multiple smaller datasets.
Video segments that need re-encoding (because the source file mixes episodes
that fall into different splits) are re-encoded with the source dataset's
existing encoder settings — read back from ``meta/info.json`` — so each
output split stays consistent with its own metadata.
Args:
dataset: The source LeRobotDataset to split.
splits: Either a dict mapping split names to episode indices, or a dict mapping
@@ -594,7 +578,8 @@ def _keep_episodes_from_video_with_av(
output_path: Path,
episodes_to_keep: list[tuple[int, int]],
fps: float,
camera_encoder: VideoEncoderConfig,
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
) -> None:
"""Keep only specified episodes from a video file using PyAV.
@@ -608,7 +593,8 @@ def _keep_episodes_from_video_with_av(
Ranges are half-open intervals: [start_frame, end_frame), where start_frame
is inclusive and end_frame is exclusive.
fps: Frame rate of the video.
camera_encoder: Video encoder settings used to re-encode the kept frames.
vcodec: Video codec to use for encoding.
pix_fmt: Pixel format for output video.
"""
from fractions import Fraction
@@ -633,13 +619,12 @@ def _keep_episodes_from_video_with_av(
# Convert fps to Fraction for PyAV compatibility.
fps_fraction = Fraction(fps).limit_denominator(1000)
codec_options = camera_encoder.get_codec_options(as_strings=True)
v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options)
v_out = out.add_stream(vcodec, rate=fps_fraction)
# PyAV type stubs don't distinguish video streams from audio/subtitle streams.
v_out.width = v_in.codec_context.width
v_out.height = v_in.codec_context.height
v_out.pix_fmt = camera_encoder.pix_fmt
v_out.pix_fmt = pix_fmt
# Set time_base to match the frame rate for proper timestamp handling.
v_out.time_base = Fraction(1, int(fps))
@@ -702,14 +687,14 @@ def _copy_and_reindex_videos(
src_dataset: LeRobotDataset,
dst_meta: LeRobotDatasetMetadata,
episode_mapping: dict[int, int],
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
) -> dict[int, dict]:
"""Copy and filter video files, only re-encoding files with deleted episodes.
For video files that only contain kept episodes, we copy them directly.
For files with mixed kept/deleted episodes, we use PyAV filters to efficiently
re-encode only the desired segments. The encoder used for re-encoding is
derived per video key from the source dataset's ``meta/info.json`` so the
destination metadata keeps describing the videos accurately.
re-encode only the desired segments.
Args:
src_dataset: Source dataset to copy from
@@ -726,9 +711,6 @@ def _copy_and_reindex_videos(
for video_key in src_dataset.meta.video_keys:
logging.info(f"Processing videos for {video_key}")
camera_encoder = VideoEncoderConfig.from_video_info(
src_dataset.meta.info.features.get(video_key, {}).get("info")
)
if dst_meta.video_path is None:
raise ValueError("Destination metadata has no video_path defined")
@@ -810,7 +792,8 @@ def _copy_and_reindex_videos(
dst_video_path,
episodes_to_keep_ranges,
src_dataset.meta.fps,
camera_encoder,
vcodec,
pix_fmt,
)
cumulative_ts = 0.0
@@ -914,10 +897,14 @@ def _copy_and_reindex_episodes_metadata(
dst_meta.finalize()
dst_meta.info.total_episodes = len(episode_mapping)
dst_meta.info.total_frames = total_frames
dst_meta.info.total_tasks = len(dst_meta.tasks) if dst_meta.tasks is not None else 0
dst_meta.info.splits = {"train": f"0:{len(episode_mapping)}"}
dst_meta.info.update(
{
"total_episodes": len(episode_mapping),
"total_frames": total_frames,
"total_tasks": len(dst_meta.tasks) if dst_meta.tasks is not None else 0,
"splits": {"train": f"0:{len(episode_mapping)}"},
}
)
write_info(dst_meta.info, dst_meta.root)
if not all_stats:
@@ -1082,20 +1069,21 @@ def _copy_episodes_metadata_and_stats(
if episodes_dir.exists():
shutil.copytree(episodes_dir, dst_episodes_dir, dirs_exist_ok=True)
dst_meta.info.total_episodes = src_dataset.meta.total_episodes
dst_meta.info.total_frames = src_dataset.meta.total_frames
dst_meta.info.total_tasks = src_dataset.meta.total_tasks
# Preserve original splits if available, otherwise create default
dst_meta.info.splits = (
src_dataset.meta.info.splits
if src_dataset.meta.info.splits
else {"train": f"0:{src_dataset.meta.total_episodes}"}
dst_meta.info.update(
{
"total_episodes": src_dataset.meta.total_episodes,
"total_frames": src_dataset.meta.total_frames,
"total_tasks": src_dataset.meta.total_tasks,
"splits": src_dataset.meta.info.get("splits", {"train": f"0:{src_dataset.meta.total_episodes}"}),
}
)
if dst_meta.video_keys and src_dataset.meta.video_keys:
for key in dst_meta.video_keys:
if key in src_dataset.meta.features:
dst_meta.info.features[key]["info"] = src_dataset.meta.info.features[key].get("info", {})
dst_meta.info["features"][key]["info"] = src_dataset.meta.info["features"][key].get(
"info", {}
)
write_info(dst_meta.info, dst_meta.root)
@@ -1281,7 +1269,11 @@ def _estimate_frame_size_via_calibration(
episode_indices: list[int],
temp_dir: Path,
fps: int,
camera_encoder: VideoEncoderConfig,
vcodec: str,
pix_fmt: str,
g: int,
crf: int,
fast_decode: int,
num_calibration_frames: int = 30,
) -> float:
"""Estimate MB per frame by encoding a small calibration sample.
@@ -1295,7 +1287,11 @@ def _estimate_frame_size_via_calibration(
episode_indices: List of episode indices being processed.
temp_dir: Temporary directory for calibration files.
fps: Frames per second for video encoding.
camera_encoder: Video encoder settings used for calibration encoding.
vcodec: Video codec (libsvtav1, h264, hevc).
pix_fmt: Pixel format (yuv420p, etc.).
g: GOP size (group of pictures).
crf: Constant Rate Factor (quality).
fast_decode: Fast decode tuning parameter.
num_calibration_frames: Number of frames to use for calibration (default: 30).
Returns:
@@ -1331,7 +1327,11 @@ def _estimate_frame_size_via_calibration(
imgs_dir=calibration_dir,
video_path=calibration_video_path,
fps=fps,
camera_encoder=camera_encoder,
vcodec=vcodec,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
overwrite=True,
)
@@ -1525,7 +1525,7 @@ def modify_tasks(
write_tasks(new_task_df, root)
# Update info.json
dataset.meta.info.total_tasks = len(unique_tasks)
dataset.meta.info["total_tasks"] = len(unique_tasks)
write_info(dataset.meta.info, root)
# Reload metadata to reflect changes
@@ -1649,7 +1649,11 @@ def convert_image_to_video_dataset(
dataset: LeRobotDataset,
output_dir: Path | None = None,
repo_id: str | None = None,
camera_encoder: VideoEncoderConfig | None = None,
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
g: int = 2,
crf: int = 30,
fast_decode: int = 0,
episode_indices: list[int] | None = None,
num_workers: int = 4,
max_episodes_per_batch: int | None = None,
@@ -1664,8 +1668,11 @@ def convert_image_to_video_dataset(
dataset: The source LeRobot dataset with images
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
camera_encoder: Video encoder settings
(``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
vcodec: Video codec (default: libsvtav1)
pix_fmt: Pixel format (default: yuv420p)
g: Group of pictures size (default: 2)
crf: Constant rate factor (default: 30)
fast_decode: Fast decode tuning (default: 0)
episode_indices: List of episode indices to convert (None = all episodes)
num_workers: Number of threads for parallel processing (default: 4)
max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
@@ -1674,9 +1681,6 @@ def convert_image_to_video_dataset(
Returns:
New LeRobotDataset with images encoded as videos
"""
if camera_encoder is None:
camera_encoder = camera_encoder_defaults()
# Check that it's an image dataset
if len(dataset.meta.video_keys) > 0:
raise ValueError(
@@ -1700,10 +1704,7 @@ def convert_image_to_video_dataset(
logging.info(
f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}"
)
logging.info(
f"Video codec: {camera_encoder.vcodec}, pixel format: {camera_encoder.pix_fmt}, "
f"GOP: {camera_encoder.g}, CRF: {camera_encoder.crf}"
)
logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}")
# Create new features dict, converting image features to video features
new_features = {}
@@ -1773,7 +1774,11 @@ def convert_image_to_video_dataset(
episode_indices=episode_indices,
temp_dir=temp_dir,
fps=fps,
camera_encoder=camera_encoder,
vcodec=vcodec,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
)
logging.info(f"Processing camera: {img_key}")
@@ -1815,7 +1820,11 @@ def convert_image_to_video_dataset(
imgs_dir=imgs_dir,
video_path=video_path,
fps=fps,
camera_encoder=camera_encoder,
vcodec=vcodec,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
overwrite=True,
)
@@ -1849,10 +1858,10 @@ def convert_image_to_video_dataset(
episodes_df.to_parquet(episodes_path, index=False)
# Update metadata info
new_meta.info.total_episodes = len(episode_indices)
new_meta.info.total_frames = sum(ep["length"] for ep in all_episode_metadata.values())
new_meta.info.total_tasks = dataset.meta.total_tasks
new_meta.info.splits = {"train": f"0:{len(episode_indices)}"}
new_meta.info["total_episodes"] = len(episode_indices)
new_meta.info["total_frames"] = sum(ep["length"] for ep in all_episode_metadata.values())
new_meta.info["total_tasks"] = dataset.meta.total_tasks
new_meta.info["splits"] = {"train": f"0:{len(episode_indices)}"}
# Update video info for all image keys (now videos)
# We need to manually set video info since update_video_info() checks video_keys first
@@ -1861,9 +1870,7 @@ def convert_image_to_video_dataset(
video_path = new_meta.root / new_meta.video_path.format(
video_key=img_key, chunk_index=0, file_index=0
)
new_meta.info.features[img_key]["info"] = get_video_info(
video_path, camera_encoder=camera_encoder
)
new_meta.info["features"][img_key]["info"] = get_video_info(video_path)
write_info(new_meta.info, new_meta.root)
@@ -1886,83 +1893,3 @@ def convert_image_to_video_dataset(
# Return new dataset
return LeRobotDataset(repo_id=repo_id, root=output_dir)
def _reencode_video_worker(args: tuple) -> Path:
"""Picklable worker for :func:`reencode_dataset`'s process pool."""
video_path, camera_encoder, encoder_threads = args
reencode_video(
input_video_path=video_path,
output_video_path=video_path,
camera_encoder=camera_encoder,
encoder_threads=encoder_threads,
overwrite=True,
)
return video_path
def reencode_dataset(
dataset: LeRobotDataset,
camera_encoder: VideoEncoderConfig,
encoder_threads: int | None = None,
num_workers: int | None = None,
) -> LeRobotDataset:
"""Re-encode every video in a dataset with a new set of encoding parameters.
Videos are re-encoded in-place and the video information in ``info.json`` is refreshed.
Args:
dataset: An existing :class:`LeRobotDataset` whose videos will be
re-encoded.
camera_encoder: Target encoder configuration applied to every video
file.
encoder_threads: Per-encoder thread count forwarded to
:func:`reencode_video`. ``None`` lets the codec decide.
num_workers: Number of parallel processes. ``None`` or ``0`` means
sequential (no multiprocessing); ``1+`` spawns a
:class:`~concurrent.futures.ProcessPoolExecutor`.
Returns:
The same :class:`LeRobotDataset` instance with its metadata updated
on disk.
"""
meta = dataset.meta
video_paths_list = []
# Only re-encode if the videos are not already encoded with the given video encoding parameters
for video_key in meta.video_keys:
current_info = meta.info.features[video_key].get("info", {})
current_encoder = VideoEncoderConfig.from_video_info(current_info)
if current_encoder != camera_encoder:
video_paths_list.extend((meta.root / VIDEO_DIR / video_key).rglob("*.mp4"))
else:
logging.info(f"{video_key} videos are already encoded with {camera_encoder}. Nothing to do.")
if len(video_paths_list) == 0:
logging.warning("Dataset has no videos to re-encode.")
return dataset
logging.info(f"Re-encoding {len(video_paths_list)} video file(s) with {camera_encoder}")
worker_args = [(vp, camera_encoder, encoder_threads) for vp in video_paths_list]
if num_workers and num_workers > 1:
with ProcessPoolExecutor(max_workers=num_workers) as pool:
futures = [pool.submit(_reencode_video_worker, args) for args in worker_args]
for future in tqdm(
as_completed(futures),
total=len(futures),
desc="Re-encoding videos",
):
future.result()
else:
for args in tqdm(worker_args, desc="Re-encoding videos"):
_reencode_video_worker(args)
# Refresh video info in metadata for every video key.
for vid_key in meta.video_keys:
video_path = meta.root / meta.get_video_file_path(0, vid_key)
meta.info.features[vid_key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder)
write_info(meta.info, meta.root)
logging.info("Dataset metadata updated.")
return dataset
+11 -32
View File
@@ -31,8 +31,6 @@ import PIL.Image
import pyarrow.parquet as pq
import torch
from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults
from .compute_stats import compute_episode_stats
from .dataset_metadata import LeRobotDatasetMetadata
from .feature_utils import (
@@ -67,19 +65,14 @@ def _encode_video_worker(
episode_index: int,
root: Path,
fps: int,
camera_encoder: VideoEncoderConfig | None = None,
vcodec: str = "libsvtav1",
encoder_threads: int | None = None,
) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
img_dir = (root / fpath).parent
encode_video_frames(
img_dir,
temp_path,
fps,
camera_encoder=camera_encoder,
encoder_threads=encoder_threads,
overwrite=True,
img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads
)
shutil.rmtree(img_dir)
return temp_path
@@ -96,22 +89,20 @@ class DatasetWriter:
self,
meta: LeRobotDatasetMetadata,
root: Path,
camera_encoder: VideoEncoderConfig | None,
vcodec: str,
encoder_threads: int | None,
batch_encoding_size: int,
streaming_encoder: StreamingVideoEncoder | None = None,
initial_frames: int = 0,
):
"""Initialize the writer with metadata, codec, and encoder config.
"""Initialize the writer with metadata, codec, and encoding config.
Args:
meta: Dataset metadata instance (used for feature schema, chunk
settings, and episode persistence).
root: Local dataset root directory.
camera_encoder: Video encoder settings applied to all cameras.
``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``).
encoder_threads: Threads per encoder instance. ``None`` for auto.
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos.
streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder`
@@ -120,7 +111,7 @@ class DatasetWriter:
"""
self._meta = meta
self._root = root
self._camera_encoder = camera_encoder or camera_encoder_defaults()
self._vcodec = vcodec
self._encoder_threads = encoder_threads
self._batch_encoding_size = batch_encoding_size
self._streaming_encoder = streaming_encoder
@@ -250,14 +241,7 @@ class DatasetWriter:
for key, ft in self._meta.features.items():
if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]:
continue
stacked_values = np.stack(episode_buffer[key])
# `shape=(1,)` numeric features are serialized as `datasets.Value`, which expects scalars.
# Normalizing to `(N,)` keeps save semantics stable across dependency versions.
if tuple(ft["shape"]) == (1,) and ft["dtype"] != "string":
stacked_values = stacked_values.reshape(episode_length)
episode_buffer[key] = stacked_values
episode_buffer[key] = np.stack(episode_buffer[key])
# Wait for image writer to end, so that episode stats over images can be computed
self._wait_image_writer()
@@ -300,7 +284,7 @@ class DatasetWriter:
episode_index,
self._root,
self._meta.fps,
self._camera_encoder,
self._vcodec,
self._encoder_threads,
): video_key
for video_key in self._meta.video_keys
@@ -511,7 +495,7 @@ class DatasetWriter:
# Update video info (only needed when first episode is encoded)
if episode_index == 0:
self._meta.update_video_info(video_key, camera_encoder=self._camera_encoder)
self._meta.update_video_info(video_key)
write_info(self._meta.info, self._meta.root)
metadata = {
@@ -580,12 +564,7 @@ class DatasetWriter:
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
"""Use ffmpeg to convert frames stored as png into mp4 videos."""
return _encode_video_worker(
video_key,
episode_index,
self._root,
self._meta.fps,
self._camera_encoder,
self._encoder_threads,
video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads
)
def close_writer(self) -> None:
+4 -7
View File
@@ -19,7 +19,6 @@ from pprint import pformat
import torch
from lerobot.configs import PreTrainedConfig
from lerobot.configs.rewards import RewardModelConfig
from lerobot.configs.train import TrainPipelineConfig
from lerobot.transforms import ImageTransforms
from lerobot.utils.constants import ACTION, IMAGENET_STATS, OBS_PREFIX, REWARD
@@ -31,14 +30,12 @@ from .streaming_dataset import StreamingLeRobotDataset
def resolve_delta_timestamps(
cfg: PreTrainedConfig | RewardModelConfig, ds_meta: LeRobotDatasetMetadata
cfg: PreTrainedConfig, ds_meta: LeRobotDatasetMetadata
) -> dict[str, list] | None:
"""Resolves delta_timestamps by reading from the 'delta_indices' properties of the config.
"""Resolves delta_timestamps by reading from the 'delta_indices' properties of the PreTrainedConfig.
Args:
cfg (PreTrainedConfig | RewardModelConfig): The config to read delta_indices from. Both
``PreTrainedConfig`` and concrete ``RewardModelConfig`` subclasses expose the
``{observation,action,reward}_delta_indices`` properties used below.
cfg (PreTrainedConfig): The PreTrainedConfig to read delta_indices from.
ds_meta (LeRobotDatasetMetadata): The dataset from which features and fps are used to build
delta_timestamps against.
@@ -85,7 +82,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
ds_meta = LeRobotDatasetMetadata(
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
)
delta_timestamps = resolve_delta_timestamps(cfg.trainable_config, ds_meta)
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
if not cfg.dataset.streaming:
dataset = LeRobotDataset(
cfg.dataset.repo_id,
+19 -94
View File
@@ -13,30 +13,21 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pprint import pformat
import datasets
import numpy as np
from PIL import Image as PILImage
from lerobot.configs import VIDEO_ENCODER_INFO_KEYS
from lerobot.utils.constants import DEFAULT_FEATURES
from lerobot.utils.utils import is_valid_numpy_dtype_string
from .language import (
LANGUAGE_PERSISTENT,
is_language_column,
language_events_column_feature,
language_persistent_column_feature,
)
from .utils import (
DEFAULT_CHUNK_SIZE,
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_DATA_PATH,
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
DEFAULT_VIDEO_PATH,
DatasetInfo,
)
@@ -54,13 +45,7 @@ def get_hf_features_from_features(features: dict) -> datasets.Features:
"""
hf_features = {}
for key, ft in features.items():
if is_language_column(key):
hf_features[key] = (
language_persistent_column_feature()
if key == LANGUAGE_PERSISTENT
else language_events_column_feature()
)
elif ft["dtype"] == "video":
if ft["dtype"] == "video":
continue
elif ft["dtype"] == "image":
hf_features[key] = datasets.Image()
@@ -93,8 +78,8 @@ def create_empty_dataset_info(
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
) -> DatasetInfo:
"""Create a template ``DatasetInfo`` object for a new dataset's ``meta/info.json``.
) -> dict:
"""Create a template dictionary for a new dataset's `info.json`.
Args:
codebase_version (str): The version of the LeRobot codebase.
@@ -102,59 +87,25 @@ def create_empty_dataset_info(
features (dict): The LeRobot features dictionary for the dataset.
use_videos (bool): Whether the dataset will store videos.
robot_type (str | None): The type of robot used, if any.
chunks_size (int | None): Max files per chunk directory. Defaults to ``DEFAULT_CHUNK_SIZE``.
data_files_size_in_mb (int | None): Max parquet file size in MB. Defaults to ``DEFAULT_DATA_FILE_SIZE_IN_MB``.
video_files_size_in_mb (int | None): Max video file size in MB. Defaults to ``DEFAULT_VIDEO_FILE_SIZE_IN_MB``.
Returns:
DatasetInfo: A typed dataset information object with initial metadata.
dict: A dictionary with the initial dataset metadata.
"""
return DatasetInfo(
codebase_version=codebase_version,
fps=fps,
features=features,
robot_type=robot_type,
chunks_size=chunks_size or DEFAULT_CHUNK_SIZE,
data_files_size_in_mb=data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
video_files_size_in_mb=video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
data_path=DEFAULT_DATA_PATH,
video_path=DEFAULT_VIDEO_PATH if use_videos else None,
)
def features_equal_for_merge(features_a: dict[str, dict], features_b: dict[str, dict]) -> bool:
"""Return whether two LeRobotDatasetMetadata ``features`` dicts are compatible for aggregation.
For video features, keys under ``info`` related to video encoding parameters are ignored during
comparison as they do not prevent aggregation.
"""
def _without_encoder_info_keys(feature: dict) -> dict:
filtered = dict(feature)
filtered_info = filtered.get("info")
if isinstance(filtered_info, dict):
filtered["info"] = {
info_key: info_value
for info_key, info_value in filtered_info.items()
if info_key not in VIDEO_ENCODER_INFO_KEYS
}
return filtered
if set(features_a) != set(features_b):
return False
for key in features_a:
fa_key = features_a[key]
fb_key = features_b[key]
if fa_key.get("dtype") != fb_key.get("dtype"):
return False
if fa_key.get("dtype") != "video":
if fa_key != fb_key:
return False
continue
if _without_encoder_info_keys(fa_key) != _without_encoder_info_keys(fb_key):
return False
return True
return {
"codebase_version": codebase_version,
"robot_type": robot_type,
"total_episodes": 0,
"total_frames": 0,
"total_tasks": 0,
"chunks_size": chunks_size or DEFAULT_CHUNK_SIZE,
"data_files_size_in_mb": data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
"video_files_size_in_mb": video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
"fps": fps,
"splits": {},
"data_path": DEFAULT_DATA_PATH,
"video_path": DEFAULT_VIDEO_PATH if use_videos else None,
"features": features,
}
def check_delta_timestamps(
@@ -291,8 +242,6 @@ def validate_feature_dtype_and_shape(
return validate_feature_image_or_video(name, expected_shape, value)
elif expected_dtype == "string":
return validate_feature_string(name, value)
elif expected_dtype == "language":
return validate_feature_language(name, value)
else:
raise NotImplementedError(f"The feature dtype '{expected_dtype}' is not implemented yet.")
@@ -372,30 +321,6 @@ def validate_feature_string(name: str, value: str) -> str:
return ""
def validate_feature_language(name: str, value) -> str:
"""Validate a feature that is expected to hold language annotations.
Language columns (``language_persistent`` / ``language_events``) are
populated after recording by the annotation pipeline, not at record time.
Any value supplied here is dropped before the frame is written, so a
non-empty value almost certainly signals a mistake. We warn rather than
fail to keep recording resilient.
Args:
name (str): The name of the feature.
value: The value to validate.
Returns:
str: Always an empty string language values are non-fatal.
"""
if value is not None:
logging.warning(
f"The feature '{name}' is a 'language' column populated by the annotation pipeline, "
f"not at record time. The provided value will be dropped."
)
return ""
def validate_episode_buffer(episode_buffer: dict, total_episodes: int, features: dict) -> None:
"""Validate the episode buffer before it's written to disk.
+21 -13
View File
@@ -31,15 +31,14 @@ from torchvision import transforms
from lerobot.utils.io_utils import load_json, write_json
from lerobot.utils.utils import SuppressProgressBars, flatten_dict, unflatten_dict
from .language import LANGUAGE_COLUMNS
from .utils import (
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_EPISODES_PATH,
DEFAULT_SUBTASKS_PATH,
DEFAULT_TASKS_PATH,
EPISODES_DIR,
INFO_PATH,
STATS_PATH,
DatasetInfo,
serialize_dict,
)
@@ -116,21 +115,25 @@ def embed_images(dataset: datasets.Dataset) -> datasets.Dataset:
return dataset
def write_info(info: DatasetInfo, local_dir: Path) -> None:
write_json(info.to_dict(), local_dir / INFO_PATH)
def write_info(info: dict, local_dir: Path) -> None:
write_json(info, local_dir / INFO_PATH)
def load_info(local_dir: Path) -> DatasetInfo:
def load_info(local_dir: Path) -> dict:
"""Load dataset info metadata from its standard file path.
Also converts shape lists to tuples for consistency.
Args:
local_dir (Path): The root directory of the dataset.
Returns:
DatasetInfo: The typed dataset information object.
dict: The dataset information dictionary.
"""
raw = load_json(local_dir / INFO_PATH)
return DatasetInfo.from_dict(raw)
info = load_json(local_dir / INFO_PATH)
for ft in info["features"].values():
ft["shape"] = tuple(ft["shape"])
return info
def write_stats(stats: dict, local_dir: Path) -> None:
@@ -186,6 +189,14 @@ def load_tasks(local_dir: Path) -> pandas.DataFrame:
return tasks
def load_subtasks(local_dir: Path) -> pandas.DataFrame | None:
"""Load subtasks from subtasks.parquet if it exists."""
subtasks_path = local_dir / DEFAULT_SUBTASKS_PATH
if subtasks_path.exists():
return pd.read_parquet(subtasks_path)
return None
def write_episodes(episodes: Dataset, local_dir: Path) -> None:
"""Write episode metadata to a parquet file in the LeRobot v3.0 format.
This function writes episode-level metadata to a single parquet file.
@@ -257,13 +268,11 @@ def hf_transform_to_torch(items_dict: dict[str, list[Any]]) -> dict[str, list[to
dict: The batch with items converted to torch tensors.
"""
for key in items_dict:
if key in LANGUAGE_COLUMNS:
continue
first_item = items_dict[key][0]
if isinstance(first_item, PILImage.Image):
to_tensor = transforms.ToTensor()
items_dict[key] = [to_tensor(img) for img in items_dict[key]]
elif first_item is None or isinstance(first_item, dict):
elif first_item is None:
pass
else:
items_dict[key] = [x if isinstance(x, str) else torch.tensor(x) for x in items_dict[key]]
@@ -298,9 +307,8 @@ def item_to_torch(item: dict) -> dict:
Returns:
dict: Dictionary with all tensor-like items converted to torch.Tensor.
"""
skip_keys = {"task", *LANGUAGE_COLUMNS}
for key, val in item.items():
if isinstance(val, (np.ndarray | list)) and key not in skip_keys:
if isinstance(val, (np.ndarray | list)) and key not in ["task"]:
# Convert numpy arrays and lists to torch tensors
item[key] = torch.tensor(val)
return item
-242
View File
@@ -1,242 +0,0 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Literal
import datasets
import pyarrow as pa
LANGUAGE_PERSISTENT = "language_persistent"
LANGUAGE_EVENTS = "language_events"
LANGUAGE_COLUMNS = (LANGUAGE_PERSISTENT, LANGUAGE_EVENTS)
PERSISTENT_ROW_FIELDS = ("role", "content", "style", "timestamp", "camera", "tool_calls")
EVENT_ROW_FIELDS = ("role", "content", "style", "camera", "tool_calls")
CORE_STYLES = {
"subtask",
"plan",
"memory",
"motion",
"interjection",
"vqa",
"trace",
"task_aug",
}
# Project-local styles can be registered at import time by appending to
# ``EXTENDED_STYLES`` before ``column_for_style`` is called. Anything added
# here is treated as a known style alongside ``CORE_STYLES`` for resolver
# validation. Empty by default — populate from a downstream module that
# also extends ``PERSISTENT_STYLES`` or ``EVENT_ONLY_STYLES`` to declare
# the new style's column.
EXTENDED_STYLES: set[str] = set()
STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES
PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug"}
EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"}
# Styles whose ``content`` is grounded in a specific camera view. Rows of these
# styles MUST carry a non-null ``camera`` referencing an ``observation.images.*``
# feature key. Rows of every other style MUST have ``camera=None``. ``motion``
# is intentionally NOT in this set: motion primitives are described in
# robot-frame (joint / Cartesian) terms, not pixel space, so they are
# camera-agnostic. ``trace`` is the pixel-trajectory event style and IS
# view-dependent. The ``camera`` field nevertheless lives on
# ``PERSISTENT_ROW_FIELDS`` too so the schema, validator, and resolver
# behave symmetrically across the two columns; persistent rows simply
# always have ``camera=None`` in practice today.
VIEW_DEPENDENT_STYLES = {"vqa", "trace"}
LanguageColumn = Literal["language_persistent", "language_events"]
def _json_arrow_type() -> pa.DataType:
"""Return the Arrow JSON type, falling back to ``string`` on older pyarrow."""
return pa.json_() if hasattr(pa, "json_") else pa.string()
def _json_feature() -> object:
"""Return the HF ``datasets`` JSON feature, falling back to a string value."""
return datasets.Json() if hasattr(datasets, "Json") else datasets.Value("string")
def language_persistent_row_arrow_type() -> pa.StructType:
"""Return the Arrow struct type for a single persistent language row.
Persistent rows carry their own ``timestamp`` because they represent a state
that became active at a specific moment and remains active until superseded.
``timestamp`` is ``float32`` to match the timestamp dtype LeRobotDataset
uses for frame data.
"""
return pa.struct(
[
pa.field("role", pa.string(), nullable=False),
pa.field("content", pa.string(), nullable=True),
pa.field("style", pa.string(), nullable=True),
pa.field("timestamp", pa.float32(), nullable=False),
pa.field("camera", pa.string(), nullable=True),
pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
]
)
def language_event_row_arrow_type() -> pa.StructType:
"""Return the Arrow struct type for a single event language row.
Event rows have no ``timestamp`` field: each event is stored on the dataset
row whose frame timestamp is the event's firing time.
"""
return pa.struct(
[
pa.field("role", pa.string(), nullable=False),
pa.field("content", pa.string(), nullable=True),
pa.field("style", pa.string(), nullable=True),
pa.field("camera", pa.string(), nullable=True),
pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
]
)
def language_persistent_arrow_type() -> pa.ListType:
"""Return the Arrow list type for the ``language_persistent`` column."""
return pa.list_(language_persistent_row_arrow_type())
def language_events_arrow_type() -> pa.ListType:
"""Return the Arrow list type for the ``language_events`` column."""
return pa.list_(language_event_row_arrow_type())
def language_persistent_row_feature() -> dict[str, object]:
"""Return the HF ``datasets`` feature mapping for a persistent language row."""
return {
"role": datasets.Value("string"),
"content": datasets.Value("string"),
"style": datasets.Value("string"),
"timestamp": datasets.Value("float32"),
"camera": datasets.Value("string"),
"tool_calls": datasets.List(_json_feature()),
}
def language_event_row_feature() -> dict[str, object]:
"""Return the HF ``datasets`` feature mapping for an event language row."""
return {
"role": datasets.Value("string"),
"content": datasets.Value("string"),
"style": datasets.Value("string"),
"camera": datasets.Value("string"),
"tool_calls": datasets.List(_json_feature()),
}
def language_persistent_column_feature() -> datasets.List:
"""Return the HF ``datasets`` feature for the ``language_persistent`` column."""
return datasets.List(language_persistent_row_feature())
def language_events_column_feature() -> datasets.List:
"""Return the HF ``datasets`` feature for the ``language_events`` column."""
return datasets.List(language_event_row_feature())
def language_feature_info() -> dict[str, dict]:
"""Return the ``info["features"]`` entries for both language columns."""
return {
LANGUAGE_PERSISTENT: {"dtype": "language", "shape": (1,), "names": None},
LANGUAGE_EVENTS: {"dtype": "language", "shape": (1,), "names": None},
}
def is_language_column(key: str) -> bool:
"""Return ``True`` if ``key`` is one of the dataset's language column names."""
return key in LANGUAGE_COLUMNS
def is_view_dependent_style(style: str | None) -> bool:
"""Return ``True`` if rows of ``style`` must be tagged with a ``camera`` key."""
return style in VIEW_DEPENDENT_STYLES
def validate_camera_field(style: str | None, camera: str | None) -> None:
"""Enforce the ``camera`` invariant: required iff ``style`` is view-dependent.
Raises ``ValueError`` if a view-dependent style is missing ``camera`` or if
a non-view-dependent style carries one. Pipeline writers and the validator
should call this on every emitted row.
"""
if is_view_dependent_style(style):
if not camera:
raise ValueError(
f"Rows of view-dependent style {style!r} require a non-empty 'camera' "
f"field referencing an 'observation.images.*' feature key."
)
elif camera is not None:
raise ValueError(f"Rows of style {style!r} must have camera=None; got camera={camera!r}.")
# --- Tool registry --------------------------------------------------------
# Tools declared on a dataset live in ``meta/info.json["tools"]`` as a list
# of OpenAI-style function schemas. The runtime / training stack reads them
# through :class:`LeRobotDatasetMetadata.tools` (with these constants as
# fallback when the dataset doesn't declare any). Implementations live
# under :mod:`lerobot.tools` (one file per tool); see
# ``docs/source/tools.mdx`` for the authoring guide.
SAY_TOOL_SCHEMA: dict = {
"type": "function",
"function": {
"name": "say",
"description": "Speak a short utterance to the user via the TTS executor.",
"parameters": {
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "The verbatim text to speak.",
}
},
"required": ["text"],
},
},
}
"""Canonical schema for the ``say`` tool emitted by the steerable
annotation pipeline (PR 2 Module 2). Single source of truth PR 2's
writer, PR 3's runtime tool registry, and the dataset visualizer all
import this constant rather than duplicating the dict."""
DEFAULT_TOOLS: list[dict] = [SAY_TOOL_SCHEMA]
"""Fallback tools list. Returned by ``LeRobotDatasetMetadata.tools``
when ``meta/info.json["tools"]`` is unset, so unannotated datasets and
chat-template consumers (``apply_chat_template(messages, tools=...)``)
keep working out of the box."""
def column_for_style(style: str | None) -> LanguageColumn:
"""Map a language style to the column where rows of that style are stored.
Styles in :data:`PERSISTENT_STYLES` route to :data:`LANGUAGE_PERSISTENT`.
Styles in :data:`EVENT_ONLY_STYLES` and the implicit ``None`` style route
to :data:`LANGUAGE_EVENTS`.
"""
if style is None:
return LANGUAGE_EVENTS
if style in PERSISTENT_STYLES:
return LANGUAGE_PERSISTENT
if style in EVENT_ONLY_STYLES:
return LANGUAGE_EVENTS
raise ValueError(f"Unknown language style: {style!r}")
-545
View File
@@ -1,545 +0,0 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import copy
import hashlib
import re
from collections.abc import Sequence
from typing import Any
from lerobot.configs.recipe import DEFAULT_BINDINGS, PLACEHOLDER_RE, TrainingRecipe
from lerobot.utils.utils import unwrap_scalar
from .language import LANGUAGE_PERSISTENT, column_for_style
LanguageRow = dict[str, Any]
RenderedMessages = dict[str, list[Any]]
_RESOLVER_RE = re.compile(r"^(?P<name>[A-Za-z_][A-Za-z0-9_]*)\((?P<args>.*)\)$")
def active_at(
t: float,
*,
persistent: Sequence[LanguageRow],
style: str | None = None,
role: str | None = None,
tool_name: str | None = None,
camera: str | None = None,
) -> LanguageRow | None:
"""Return the persistent row of ``style`` that is active at time ``t``.
A persistent row is "active" at ``t`` when its own ``timestamp`` is the
most recent one ``<= t`` for the given ``style``/``role``/``tool_name``/
``camera`` selector. Only valid for persistent styles.
"""
_validate_persistent_resolver("active_at", style)
matches = [
row
for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera)
if _timestamp(row) <= t
]
if not matches:
return None
latest_ts = max(_timestamp(row) for row in matches)
return _select_one(
[row for row in matches if _timestamp(row) == latest_ts],
style=style,
role=role,
tool_name=tool_name,
camera=camera,
)
EMITTED_AT_TOLERANCE_S = 0.1
"""Half-window for matching persistent rows to a frame timestamp in
``emitted_at``. Persistent timestamps come from parquet (float32) and ``t``
is also a float32 from parquet, so in the ideal hot path an exact match
would suffice but any caller that derives ``t`` arithmetically (e.g.
``frame_idx / fps``) breaks bit-equality. A 0.1 s tolerance covers
common arithmetic drift without admitting frames that are visibly far
apart at typical control rates (30100 Hz). This does mean two persistent
rows of the same selector emitted within 0.1 s of each other cannot be
told apart by ``emitted_at`` acceptable because persistent annotations
(subtask / plan / memory transitions) change on a human-action timescale,
not at the camera frame rate."""
def emitted_at(
t: float,
*,
persistent: Sequence[LanguageRow],
events: Sequence[LanguageRow],
style: str | None = None,
role: str | None = None,
tool_name: str | None = None,
camera: str | None = None,
) -> LanguageRow | None:
"""Return the row of ``style`` emitted at exactly time ``t``.
For persistent styles, this matches persistent rows whose own ``timestamp``
is within ``EMITTED_AT_TOLERANCE_S`` of ``t`` (see that constant for why
we use a tolerance instead of bit-equality). For event styles, the
``events`` list is assumed to come from the dataset row at frame ``t``
(event rows carry no timestamp of their own), so all matching event rows
are considered emitted at ``t``. ``camera`` filters by the row's
``camera`` field required to disambiguate when multiple view-dependent
rows share ``(t, role)`` across cameras.
"""
if column_for_style(style) == LANGUAGE_PERSISTENT:
matches = [
row
for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera)
if abs(_timestamp(row) - t) <= EMITTED_AT_TOLERANCE_S
]
else:
matches = _matching_rows(events, style=style, role=role, tool_name=tool_name, camera=camera)
return _select_one(matches, style=style, role=role, tool_name=tool_name, camera=camera)
def nth_prev(
t: float,
*,
persistent: Sequence[LanguageRow],
style: str | None = None,
offset: int = 1,
role: str | None = None,
tool_name: str | None = None,
camera: str | None = None,
) -> LanguageRow | None:
"""Return the persistent row that was active ``offset`` steps before ``t``.
Walks back through chronologically sorted persistent rows of ``style``
(filtered by optional ``role``/``tool_name``/``camera``) and returns the
one ``offset`` positions before the row active at ``t``. Only valid for
persistent styles.
"""
return _nth_relative("nth_prev", t, persistent, style, -offset, role, tool_name, camera)
def nth_next(
t: float,
*,
persistent: Sequence[LanguageRow],
style: str | None = None,
offset: int = 1,
role: str | None = None,
tool_name: str | None = None,
camera: str | None = None,
) -> LanguageRow | None:
"""Return the persistent row that becomes active ``offset`` steps after ``t``.
Walks forward through chronologically sorted persistent rows of ``style``
(filtered by optional ``role``/``tool_name``/``camera``) and returns the
one ``offset`` positions after the row active at ``t``. Only valid for
persistent styles.
"""
return _nth_relative("nth_next", t, persistent, style, offset, role, tool_name, camera)
def render_sample(
*,
recipe: TrainingRecipe,
persistent: Sequence[LanguageRow] | None,
events: Sequence[LanguageRow] | None,
t: float,
sample_idx: int,
task: str | None = None,
dataset_ctx: Any | None = None,
) -> RenderedMessages | None:
"""Render the chat-style messages for a single dataset sample.
Resolves the recipe's bindings against ``persistent`` and ``events`` rows
at frame timestamp ``t``, then expands the recipe's message templates.
Returns ``None`` if the resolved sample contains no target message.
"""
persistent_rows = _normalize_rows(persistent or [])
event_rows = _normalize_rows(events or [])
selected_recipe = _select_recipe(recipe, sample_idx)
bindings = _resolve_bindings(
selected_recipe,
persistent=persistent_rows,
events=event_rows,
t=t,
sample_idx=sample_idx,
task=task,
dataset_ctx=dataset_ctx,
)
return _render_message_recipe(selected_recipe, bindings)
def _select_recipe(recipe: TrainingRecipe, sample_idx: int) -> TrainingRecipe:
"""Pick a deterministic blend component for ``sample_idx`` (or return ``recipe``)."""
if recipe.blend is None:
return recipe
total_weight = sum(component.weight or 0.0 for component in recipe.blend.values())
if total_weight <= 0:
raise ValueError("Blend weights must sum to a positive value.")
digest = hashlib.blake2b(str(sample_idx).encode(), digest_size=8).digest()
draw = int.from_bytes(digest, "big") / 2**64 * total_weight
cumulative = 0.0
last_component: TrainingRecipe | None = None
for component in recipe.blend.values():
last_component = component
cumulative += component.weight or 0.0
if draw < cumulative:
return component
assert last_component is not None
return last_component
def _resolve_bindings(
recipe: TrainingRecipe,
*,
persistent: Sequence[LanguageRow],
events: Sequence[LanguageRow],
t: float,
sample_idx: int,
task: str | None,
dataset_ctx: Any | None,
) -> dict[str, LanguageRow | str | None]:
"""Resolve every binding in ``recipe`` (plus ``task``) at time ``t``."""
bindings: dict[str, LanguageRow | str | None] = {
"task": _resolve_task(task, dataset_ctx, persistent=persistent, sample_idx=sample_idx),
}
specs = {**DEFAULT_BINDINGS, **(recipe.bindings or {})}
for name, spec in specs.items():
bindings[name] = _resolve_spec(spec, persistent=persistent, events=events, t=t)
return bindings
def _resolve_task(
task: str | None,
dataset_ctx: Any | None,
*,
persistent: Sequence[LanguageRow] = (),
sample_idx: int = 0,
) -> str | None:
"""Return the task string for ``sample_idx``.
Resolution order:
1. Explicit ``task`` override (caller-supplied) wins.
2. If ``persistent`` contains rows of style ``task_aug`` (role=user),
deterministically pick one by ``sample_idx`` so each frame of an
episode rotates through the available rephrasings across an epoch.
This realizes Xiao 2022 / CAST-style task-prompt diversity without
changing ``meta/tasks.parquet`` and without forcing recipes to opt
in: ``${task}`` automatically picks a rephrasing when one exists,
and falls back to the canonical task otherwise. Recipes that want
the literal canonical task can override the binding.
3. Otherwise read the canonical task from ``dataset_ctx`` (which is
backed by ``meta/tasks.parquet``).
"""
if task is not None:
return task
aug_rows = [r for r in persistent if r.get("style") == "task_aug" and r.get("role") == "user"]
if aug_rows:
# Deterministic, blake2b-based pick keyed on sample_idx so the
# rotation is reproducible across runs (Python's built-in ``hash``
# is process-randomized).
digest = hashlib.blake2b(f"task_aug:{sample_idx}".encode(), digest_size=8).digest()
idx = int.from_bytes(digest, "big") % len(aug_rows)
chosen = aug_rows[idx].get("content")
if chosen:
return str(chosen)
if dataset_ctx is None:
return None
if isinstance(dataset_ctx, dict):
return dataset_ctx.get("task")
return getattr(dataset_ctx, "task", None)
def _resolve_spec(
spec: str,
*,
persistent: Sequence[LanguageRow],
events: Sequence[LanguageRow],
t: float,
) -> LanguageRow | None:
"""Parse a single binding's resolver expression and dispatch to its function."""
match = _RESOLVER_RE.match(spec.strip())
if match is None:
raise ValueError(f"Invalid resolver expression: {spec!r}")
name = match.group("name")
kwargs = _parse_resolver_args(match.group("args"))
kwargs.pop("t_arg", None)
if name == "emitted_at":
return emitted_at(t, persistent=persistent, events=events, **kwargs)
if name == "active_at":
return active_at(t, persistent=persistent, **kwargs)
if name == "nth_prev":
return nth_prev(t, persistent=persistent, **kwargs)
if name == "nth_next":
return nth_next(t, persistent=persistent, **kwargs)
raise ValueError(f"Unknown language resolver: {name!r}")
def _parse_resolver_args(args: str) -> dict[str, Any]:
"""Parse a comma-separated resolver argument list into a kwargs dict."""
kwargs: dict[str, Any] = {}
if not args.strip():
return kwargs
parts = [part.strip() for part in args.split(",") if part.strip()]
for part in parts:
if part == "t":
kwargs["t_arg"] = True
continue
if "=" not in part:
raise ValueError(f"Invalid resolver argument: {part!r}")
key, value = (item.strip() for item in part.split("=", 1))
if key == "offset":
kwargs[key] = int(value)
else:
kwargs[key] = value.strip("\"'")
return kwargs
def _render_message_recipe(
recipe: TrainingRecipe,
bindings: dict[str, LanguageRow | str | None],
) -> RenderedMessages | None:
"""Expand ``recipe.messages`` into rendered chat messages using ``bindings``."""
assert recipe.messages is not None
messages: list[dict[str, Any]] = []
streams: list[str | None] = []
target_indices: list[int] = []
for turn in recipe.messages:
if turn.if_present is not None and bindings.get(turn.if_present) is None:
continue
message = {"role": turn.role}
if turn.content is not None:
message["content"] = _render_content(turn.content, bindings)
if turn.tool_calls_from is not None:
row = bindings.get(turn.tool_calls_from)
tool_calls = row.get("tool_calls") if isinstance(row, dict) else None
if tool_calls:
message["tool_calls"] = copy.deepcopy(tool_calls)
message_idx = len(messages)
messages.append(message)
streams.append(turn.stream)
if turn.target:
target_indices.append(message_idx)
if not target_indices:
return None
rendered = {
"messages": messages,
"message_streams": streams,
"target_message_indices": target_indices,
}
_validate_rendered(rendered)
return rendered
def _render_content(
content: str | list[dict[str, Any]],
bindings: dict[str, LanguageRow | str | None],
) -> str | list[dict[str, Any]]:
"""Substitute bindings into a string or each string field of multimodal blocks."""
if isinstance(content, str):
return _substitute(content, bindings)
rendered_blocks = []
for block in content:
rendered_block = copy.deepcopy(block)
for key, value in rendered_block.items():
if isinstance(value, str):
rendered_block[key] = _substitute(value, bindings)
rendered_blocks.append(rendered_block)
return rendered_blocks
def _substitute(template: str, bindings: dict[str, LanguageRow | str | None]) -> str:
"""Replace ``${name}`` placeholders in ``template`` with their bound values."""
def replace(match: re.Match[str]) -> str:
"""Resolve a single ``${name}`` match to its bound string value."""
name = match.group(1)
if name not in bindings:
raise ValueError(f"Unknown template binding: {name!r}")
value = bindings[name]
if value is None:
return ""
if isinstance(value, dict):
content = value.get("content")
return "" if content is None else str(content)
return str(value)
return PLACEHOLDER_RE.sub(replace, template)
def _validate_rendered(rendered: RenderedMessages) -> None:
"""Sanity-check the rendered output for stream/target alignment."""
messages = rendered["messages"]
streams = rendered["message_streams"]
target_indices = rendered["target_message_indices"]
if len(streams) != len(messages):
raise ValueError("message_streams must be aligned with messages.")
if not target_indices:
raise ValueError("Rendered samples must contain at least one target message.")
for idx in target_indices:
if idx < 0 or idx >= len(messages):
raise ValueError(f"Target message index {idx} is out of bounds.")
# ``stream`` is enforced non-None at MessageTurn construction time
# (see ``MessageTurn.__post_init__``), so a missing stream here would
# mean the dataclass invariant was bypassed; no need to re-check.
def _nth_relative(
name: str,
t: float,
persistent: Sequence[LanguageRow],
style: str | None,
offset: int,
role: str | None,
tool_name: str | None,
camera: str | None,
) -> LanguageRow | None:
"""Shared body for ``nth_prev`` / ``nth_next`` with signed ``offset``."""
_validate_persistent_resolver(name, style)
if abs(offset) < 1:
raise ValueError(f"{name} offset must be non-zero.")
rows = sorted(
_matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera),
key=_row_sort_key,
)
if not rows:
return None
anchor_idx = None
for idx, row in enumerate(rows):
if _timestamp(row) <= t:
anchor_idx = idx
else:
break
target_idx = (offset - 1 if offset > 0 else None) if anchor_idx is None else anchor_idx + offset
if target_idx is None or target_idx < 0 or target_idx >= len(rows):
return None
return rows[target_idx]
def _validate_persistent_resolver(name: str, style: str | None) -> None:
"""Reject calls with missing or event-only ``style`` for persistent resolvers."""
if style is None:
raise ValueError(f"{name} requires a persistent style.")
if column_for_style(style) != LANGUAGE_PERSISTENT:
raise ValueError(f"{name} cannot be used with event-only style {style!r}.")
def _matching_rows(
rows: Sequence[LanguageRow],
*,
style: str | None,
role: str | None,
tool_name: str | None,
camera: str | None,
) -> list[LanguageRow]:
"""Return ``rows`` filtered by optional ``style``/``role``/``tool_name``/``camera`` selectors."""
return [
row
for row in rows
if (style is None or row.get("style") == style)
and (role is None or row.get("role") == role)
and (tool_name is None or _row_has_tool_name(row, tool_name))
and (camera is None or row.get("camera") == camera)
]
def _select_one(
rows: Sequence[LanguageRow],
*,
style: str | None,
role: str | None,
tool_name: str | None,
camera: str | None,
) -> LanguageRow | None:
"""Return the single matching row, or raise if the resolver is ambiguous.
Multiple matches always raise even when the caller already passed
some selectors because remaining ambiguity means the data has
several rows that look identical to the resolver and the caller
needs to pin down a specific one (e.g. add ``camera=...`` for VQA
rows shared across cameras).
"""
if not rows:
return None
if len(rows) > 1:
raise ValueError(
f"Ambiguous resolver for style={style!r} role={role!r} "
f"tool_name={tool_name!r} camera={camera!r}: {len(rows)} matching rows. "
f"Add a selector that distinguishes them."
)
return rows[0]
def _row_sort_key(row: LanguageRow) -> tuple[float, str, str]:
"""Stable sort key for both persistent and event rows.
Event rows lack ``timestamp`` (it is implicit in the frame), so default
to ``0.0`` within a single frame all event rows share the same sort
bucket and are tiebroken by ``(style, role)``.
"""
timestamp = row.get("timestamp")
ts = float(unwrap_scalar(timestamp)) if timestamp is not None else 0.0
return (ts, row.get("style") or "", row.get("role") or "")
def _timestamp(row: LanguageRow) -> float:
"""Extract a row's ``timestamp`` as a Python float (unwrapping numpy scalars)."""
return float(unwrap_scalar(row["timestamp"]))
def _row_has_tool_name(row: LanguageRow, tool_name: str) -> bool:
"""Return ``True`` if any of the row's tool calls invokes ``tool_name``."""
for tool_call in row.get("tool_calls") or []:
if isinstance(tool_call, str):
continue
function = tool_call.get("function") if isinstance(tool_call, dict) else None
if isinstance(function, dict) and function.get("name") == tool_name:
return True
return False
def _normalize_rows(rows: Sequence[Any]) -> list[LanguageRow]:
"""Convert pyarrow scalars / mappings into a fresh list of plain dict rows."""
normalized = []
for row in rows:
if row is None:
continue
if hasattr(row, "as_py"):
row = row.as_py()
if not isinstance(row, dict):
raise TypeError(f"Language rows must be dictionaries, got {type(row).__name__}.")
normalized.append(dict(row))
return normalized
+44 -68
View File
@@ -24,7 +24,6 @@ import torch.utils
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.errors import RevisionNotFoundError
from lerobot.configs import VideoEncoderConfig
from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE
from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
@@ -37,7 +36,8 @@ from .utils import (
)
from .video_utils import (
StreamingVideoEncoder,
get_safe_default_video_backend,
get_safe_default_codec,
resolve_vcodec,
)
logger = logging.getLogger(__name__)
@@ -49,7 +49,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
repo_id: str,
root: str | Path | None = None,
episodes: list[int] | None = None,
episode_filter: Callable[[dict], bool] | None = None,
image_transforms: Callable | None = None,
delta_timestamps: dict[str, list[float]] | None = None,
tolerance_s: float = 1e-4,
@@ -59,10 +58,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None,
return_uint8: bool = False,
batch_encoding_size: int = 1,
camera_encoder: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
vcodec: str = "libsvtav1",
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
):
"""
2 modes are available for instantiating this class, depending on 2 different use cases:
@@ -154,11 +153,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
``$HF_LEROBOT_HOME/hub``.
episodes (list[int] | None, optional): If specified, this will only load episodes specified by
their episode_index in this list. Defaults to None.
episode_filter (Callable[[dict], bool] | None, optional): Predicate over per-episode
metadata rows used to select episodes. Evaluated against ``meta/`` without ``stats`` keys
(e.g.``task_index``, ``episode_index``, ``length``, ``from_timestamp``, ``to_timestamp``).
Intersected with ``episodes`` when both are set. Example: ``lambda ep: ep["length"] >= 100``.
Defaults to None.
image_transforms (Callable | None, optional):
Transform applied to visual modalities inside `__getitem__` after image decoding / tensor
conversion. This works for both image-backed and video-backed observations and can later be
@@ -183,15 +177,16 @@ class LeRobotDataset(torch.utils.data.Dataset):
You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras
(codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults`
is used by the writer.
encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
codec decide.
vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'.
Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder.
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False.
encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using
streaming encoding. Defaults to 30 (~1s at 30fps).
encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the
codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for
libsvtav1 and 'threads' for h264/hevc.
Note:
Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to
@@ -204,11 +199,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.reader = None
self.set_image_transforms(image_transforms)
self.delta_timestamps = delta_timestamps
self.episodes = episodes
self.tolerance_s = tolerance_s
self.revision = revision if revision else CODEBASE_VERSION
self._video_backend = video_backend if video_backend else get_safe_default_video_backend()
self._video_backend = video_backend if video_backend else get_safe_default_codec()
self._return_uint8 = return_uint8
self._batch_encoding_size = batch_encoding_size
self._vcodec = resolve_vcodec(vcodec)
self._encoder_threads = encoder_threads
if self._requested_root is not None:
@@ -221,23 +218,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.root = self.meta.root
self.revision = self.meta.revision
if episodes is not None and any(
episode >= self.meta.total_episodes or episode < 0 for episode in episodes
):
logger.warning(
f"Some episodes in the provided episodes list are out of range for this dataset ({self.meta.total_episodes})."
)
if episode_filter is not None:
resolved = self.meta.filter_episodes(episode_filter, candidates=episodes)
if not resolved:
raise ValueError(
"The episode filter did not match any episode. Make sure the filter and episodes list are valid and compatible."
)
logger.info(f"The episode filter matched {len(resolved)} episode(s).")
episodes = resolved
self.episodes = episodes
# Create reader (hf_dataset loaded below)
self.reader = DatasetReader(
meta=self.meta,
@@ -271,15 +251,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = None
if streaming_encoding and len(self.meta.video_keys) > 0:
streaming_enc = self._build_streaming_encoder(
self.meta.fps,
camera_encoder,
encoder_queue_maxsize,
encoder_threads,
self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads
)
self.writer = DatasetWriter(
meta=self.meta,
root=self.root,
camera_encoder=camera_encoder,
vcodec=self._vcodec,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
@@ -321,13 +298,17 @@ class LeRobotDataset(torch.utils.data.Dataset):
@staticmethod
def _build_streaming_encoder(
fps: int,
camera_encoder: VideoEncoderConfig | None,
vcodec: str,
encoder_queue_maxsize: int,
encoder_threads: int | None,
) -> StreamingVideoEncoder:
return StreamingVideoEncoder(
fps=fps,
camera_encoder=camera_encoder,
vcodec=vcodec,
pix_fmt="yuv420p",
g=2,
crf=30,
preset=None,
queue_maxsize=encoder_queue_maxsize,
encoder_threads=encoder_threads,
)
@@ -524,7 +505,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
license: str | None = "apache-2.0",
tag_version: bool = True,
push_videos: bool = True,
private: bool | None = None,
private: bool = False,
allow_patterns: list[str] | str | None = None,
upload_large_folder: bool = False,
**card_kwargs,
@@ -543,8 +524,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
tag_version: If ``True``, create a Git tag for the current codebase
version.
push_videos: If ``False``, skip uploading the ``videos/`` directory.
private: If ``True``, create a private repository. If ``None``
(default), defer to the org default on the Hub (only affects orgs).
private: If ``True``, create a private repository.
allow_patterns: Glob pattern(s) restricting which files to upload.
upload_large_folder: If ``True``, use ``upload_large_folder`` instead
of ``upload_folder`` for very large datasets.
@@ -645,13 +625,11 @@ class LeRobotDataset(torch.utils.data.Dataset):
image_writer_threads: int = 0,
video_backend: str | None = None,
batch_encoding_size: int = 1,
camera_encoder: VideoEncoderConfig | None = None,
vcodec: str = "libsvtav1",
metadata_buffer_size: int = 10,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
video_files_size_in_mb: int | None = None,
data_files_size_in_mb: int | None = None,
) -> "LeRobotDataset":
"""Create a new LeRobotDataset from scratch for recording data.
@@ -676,20 +654,20 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend (used when reading back).
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos. ``1`` means encode immediately.
camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
vcodec: Video codec for encoding. Options include ``'libsvtav1'``,
``'h264'``, ``'hevc'``, ``'auto'``.
metadata_buffer_size: Number of episode metadata records to buffer
before flushing to parquet.
streaming_encoding: If ``True``, encode video frames in real-time
during capture instead of writing images first.
encoder_queue_maxsize: Max buffered frames per camera when using
streaming encoding.
encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns:
A new :class:`LeRobotDataset` in write mode.
"""
vcodec = resolve_vcodec(vcodec)
obj = cls.__new__(cls)
obj.meta = LeRobotDatasetMetadata.create(
repo_id=repo_id,
@@ -699,8 +677,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
root=root,
use_videos=use_videos,
metadata_buffer_size=metadata_buffer_size,
video_files_size_in_mb=video_files_size_in_mb,
data_files_size_in_mb=data_files_size_in_mb,
)
obj.repo_id = obj.meta.repo_id
obj._requested_root = obj.meta.root
@@ -710,23 +686,23 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.image_transforms = None
obj.delta_timestamps = None
obj.episodes = None
obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend()
obj._video_backend = video_backend if video_backend is not None else get_safe_default_codec()
obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size
obj._vcodec = vcodec
obj._encoder_threads = encoder_threads
# Reader is lazily created on first access (write-only mode)
obj.reader = None
# Create writer
streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder(
fps, camera_encoder, encoder_queue_maxsize, encoder_threads
)
streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads)
obj.writer = DatasetWriter(
meta=obj.meta,
root=obj.root,
camera_encoder=camera_encoder,
vcodec=vcodec,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
@@ -749,12 +725,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
force_cache_sync: bool = False,
video_backend: str | None = None,
batch_encoding_size: int = 1,
camera_encoder: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
vcodec: str = "libsvtav1",
image_writer_processes: int = 0,
image_writer_threads: int = 0,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
) -> "LeRobotDataset":
"""Resume recording on an existing dataset.
@@ -777,15 +753,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend for reading back data.
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos.
camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
vcodec: Video codec for encoding.
image_writer_processes: Subprocesses for async image writing.
image_writer_threads: Threads for async image writing.
streaming_encoding: If ``True``, encode video in real-time during
capture.
encoder_queue_maxsize: Max buffered frames per camera for streaming.
encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns:
A :class:`LeRobotDataset` in write mode, ready to append episodes.
@@ -796,6 +770,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
"Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt "
"the shared cache. Please provide a local directory path."
)
vcodec = resolve_vcodec(vcodec)
obj = cls.__new__(cls)
obj.repo_id = repo_id
obj._requested_root = Path(root)
@@ -804,9 +779,11 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.image_transforms = None
obj.delta_timestamps = None
obj.episodes = None
obj._video_backend = video_backend if video_backend else get_safe_default_video_backend()
obj._video_backend = video_backend if video_backend else get_safe_default_codec()
obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size
obj._vcodec = vcodec
obj._encoder_threads = encoder_threads
if obj._requested_root is not None:
obj._requested_root.mkdir(exist_ok=True, parents=True)
@@ -815,22 +792,21 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.meta = LeRobotDatasetMetadata(
obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync
)
obj._encoder_threads = encoder_threads
obj.root = obj.meta.root
# Reader is lazily created on first access (write-only mode)
obj.reader = None
# Create writer for appending
streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder(
obj.meta.fps, camera_encoder, encoder_queue_maxsize, encoder_threads
obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads
)
obj.writer = DatasetWriter(
meta=obj.meta,
root=obj.root,
camera_encoder=camera_encoder,
vcodec=vcodec,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
+2 -2
View File
@@ -123,7 +123,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
NOTE: Fow now, this relies on a check in __init__ to make sure all sub-datasets have the same info.
"""
return self._datasets[0].meta.info.fps
return self._datasets[0].meta.info["fps"]
@property
def video(self) -> bool:
@@ -133,7 +133,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
NOTE: Fow now, this relies on a check in __init__ to make sure all sub-datasets have the same info.
"""
return len(self._datasets[0].meta.video_keys) > 0
return self._datasets[0].meta.info.get("video", False)
@property
def features(self) -> datasets.Features:

Some files were not shown because too many files have changed in this diff Show More