diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 87fcacf42..412386e2d 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -3,6 +3,8 @@
title: LeRobot
- local: installation
title: Installation
+ - local: cheat-sheet
+ title: Cheat sheet
title: Get started
- sections:
- local: il_robots
@@ -37,8 +39,12 @@
title: Porting Large Datasets
- local: using_dataset_tools
title: Using the Dataset Tools
- - local: dataset_subtask
- title: Using Subtasks in the Dataset
+ - local: language_and_recipes
+ title: Language Columns and Recipes
+ - local: tools
+ title: Tools
+ - local: video_encoding_parameters
+ title: Video encoding parameters
- local: streaming_video_encoding
title: Streaming Video Encoding
title: "Datasets"
@@ -139,6 +145,8 @@
title: OMX
- local: openarm
title: OpenArm
+ - local: rebot_b601
+ title: reBot B601-DM
title: "Robots"
- sections:
- local: phone_teleop
diff --git a/docs/source/cheat-sheet.mdx b/docs/source/cheat-sheet.mdx
new file mode 100644
index 000000000..a6afa14c2
--- /dev/null
+++ b/docs/source/cheat-sheet.mdx
@@ -0,0 +1,139 @@
+# Cheat sheet
+
+All of the LeRobot commands in one place. If you forgot how to use a specific command or want to learn about a new one you can do it here.
+
+> [!WARNING]
+> For all of the commands listed below remember to change the ports/names/ids to your own values!
+
+> [!TIP]
+> Another great way to look at all the commands and get them configured for your specific setup is to use this [Jupyter Notebook](https://github.com/huggingface/lerobot/blob/main/examples/notebooks/quickstart.ipynb).
+
+### Setup and installation
+
+For installation please look at [LeRobot Installation](https://huggingface.co/docs/lerobot/main/en/installation).
+
+### Useful tools
+
+###### Find port
+
+Use this to identify which serial ports your robots are connected to. Follow the instructions in your terminal: you will be asked to unplug the USB cable and press Enter. The script will then detect and print the correct serial port for that robot.
+
+```bash
+lerobot-find-port
+```
+
+###### Find cameras
+
+Quickly find camera indices and verify their output. This command prints camera information to the terminal and saves test frames from each detected camera to `lerobot/outputs/captured_images`.
+
+```bash
+lerobot-find-cameras
+```
+
+### Calibration
+
+In most cases you will need to perform calibration just once for each robot and teleoperation device. Before performing the calibration make sure that all the joints are roughly in the middle position.
+
+```bash
+lerobot-calibrate \
+ --robot.type=so101_follower \
+ --robot.port=/dev/ttyACM0 \
+ --robot.id=my_follower_arm
+```
+
+Make sure that you use the same IDs used during calibration later for the other scripts. That's how LeRobot finds the calibration files.
+
+### Teleoperation
+
+Teleoperating with two cameras and displaying the data with Rerun.
+
+```bash
+lerobot-teleoperate \
+ --robot.type=so101_follower \
+ --robot.port=/dev/ttyACM0 \
+ --robot.id=my_follower_arm \
+ --robot.cameras="{ top: {type: opencv, index_or_path: 1, width: 640, height: 480, fps: 30}, wrist: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30} }" \
+ --teleop.type=so101_leader \
+ --teleop.port=/dev/ttyACM1 \
+ --teleop.id=my_leader_arm \
+ --display_data=true
+```
+
+### Recording a dataset
+
+The dataset is automatically uploaded to the Hugging Face Hub and saved under `repo_id`. Make sure you are logged in to your HF account with the CLI:
+`hf auth login`
+
+You can get the token from: [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+
+```bash
+lerobot-record \
+ --robot.type=so101_follower \
+ --robot.port=/dev/ttyACM0 \
+ --robot.id=my_follower_arm \
+ --robot.cameras="{ top: {type: opencv, index_or_path: 1, width: 640, height: 480, fps: 30}, wrist: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30} }" \
+ --teleop.type=so101_leader \
+ --teleop.port=/dev/ttyACM1 \
+ --teleop.id=my_leader_arm \
+ --dataset.repo_id=${HF_USER}/so101_dataset_test \
+ --dataset.num_episodes=30 \
+ --dataset.single_task="put the red brick in a bowl" \
+ --dataset.streaming_encoding=true \
+ --display_data=true
+```
+
+While collecting the dataset, you can control the recording flow from your keyboard
+using the following shortcuts:
+
+- Press **Right Arrow (`→`)**: Save episode and move to the next.
+- Press **Left Arrow (`←`)**: Delete current episode and retry.
+- Press **Escape (`ESC`)**: Stop, encode videos, and upload.
+
+### Training
+
+Depending on your hardware, training the policy might take a few hours. This is how you train a simple `ACT` policy:
+
+```bash
+lerobot-train \
+ --dataset.repo_id=${HF_USER}/so101_dataset_test \
+ --policy.type=act \
+ --output_dir=outputs/train/act_so101_test \
+ --job_name=act_so101_test \
+ --policy.device=cuda \
+ --wandb.enable=true \
+ --policy.repo_id=${HF_USER}/policy_test \
+ --steps=20000
+```
+
+- Policy Types: `act`, `diffusion`, `smolvla`, `pi05`
+- Devices: `cuda` (NVIDIA), `mps` (Apple Silicon), `cpu`
+
+If you want to fine-tune a specific model, you can provide the path to the model. In this case `--policy.path` is enough and `--policy.type` can be omitted.
+
+```bash
+lerobot-train \
+ --dataset.repo_id=${HF_USER}/so101_dataset_test \
+ --policy.path=username/the_policy_to_finetune \
+ --policy.device=cuda \
+ --policy.repo_id=${HF_USER}/policy_test \
+ --output_dir=outputs/train/act_so101_test \
+ --steps=20000
+```
+
+### Inference
+
+Inference means running the trained policy/model on a robot. For that we use `lerobot-rollout`. You will need to provide a path to your policy. It can be a local path or a Hugging Face Hub repo ID, for example `lerobot/folding_latest`. Your camera configuration needs to match what was used when collecting the dataset. Duration is in seconds; if unspecified, the rollout runs forever.
+
+> [!TIP]
+> If you are using the previous release (v0.5.1), you need to use `lerobot-record` instead of `lerobot-rollout`. More information [here](https://huggingface.co/docs/lerobot/v0.5.1/en/il_robots#run-inference-and-evaluate-your-policy).
+
+```bash
+lerobot-rollout \
+ --strategy.type=base \
+ --policy.path=${HF_USER}/my_policy \
+ --robot.type=so101_follower \
+ --robot.port=/dev/ttyACM1 \
+ --robot.cameras="{ up: {type: opencv, index_or_path: /dev/video1, width: 640, height: 480, fps: 30}, side: {type: opencv, index_or_path: /dev/video5, width: 640, height: 480, fps: 30}}" \
+ --task="Put lego brick into the transparent box" \
+ --duration=60
+```
diff --git a/docs/source/dataset_subtask.mdx b/docs/source/dataset_subtask.mdx
deleted file mode 100644
index 6264aca22..000000000
--- a/docs/source/dataset_subtask.mdx
+++ /dev/null
@@ -1,277 +0,0 @@
-# Using Subtasks in LeRobot Datasets
-
-Subtask support in robotics datasets has proven effective in improving robot reasoning and understanding. Subtasks are particularly useful for:
-
-- **Hierarchical policies**: Building policies that include subtask predictions to visualize robot reasoning in real time
-- **Reward modeling**: Helping reward models understand task progression (e.g., SARM-style stage-aware reward models)
-- **Task decomposition**: Breaking down complex manipulation tasks into atomic, interpretable steps
-
-LeRobotDataset now supports subtasks as part of its dataset structure, alongside tasks.
-
-## What are Subtasks?
-
-While a **task** describes the overall goal (e.g., "Pick up the apple and place it in the basket"), **subtasks** break down the execution into finer-grained steps:
-
-1. "Approach the apple"
-2. "Grasp the apple"
-3. "Lift the apple"
-4. "Move to basket"
-5. "Release the apple"
-
-Each frame in the dataset can be annotated with its corresponding subtask, enabling models to learn and predict these intermediate stages.
-
-
-
-
- Figure: Overview of subtask annotation.
-
-
-**Reference:** _Subtask-learning based for robot self-assembly in flexible collaborative assembly in manufacturing_, Original Article, Published: 19 April 2022.
-
-## Dataset Structure
-
-Subtask information is stored in the dataset metadata:
-
-```
-my-dataset/
-├── data/
-│ └── ...
-├── meta/
-│ ├── info.json
-│ ├── stats.json
-│ ├── tasks.parquet
-│ ├── subtasks.parquet # Subtask index → subtask string mapping
-│ └── episodes/
-│ └── ...
-└── videos/
- └── ...
-```
-
-### Subtasks Parquet File
-
-The `meta/subtasks.parquet` file maps subtask indices to their natural language descriptions:
-
-| subtask_index | subtask (index column) |
-| ------------- | ---------------------- |
-| 0 | "Approach the apple" |
-| 1 | "Grasp the apple" |
-| 2 | "Lift the apple" |
-| ... | ... |
-
-### Frame-Level Annotations
-
-Each frame in the dataset can include a `subtask_index` field that references the subtasks parquet file:
-
-```python
-# Example frame data in the parquet file
-{
- "index": 42,
- "timestamp": 1.4,
- "episode_index": 0,
- "task_index": 0,
- "subtask_index": 2, # References "Lift the apple"
- "observation.state": [...],
- "action": [...],
-}
-```
-
-## Annotating Datasets with Subtasks
-
-We provide a HuggingFace Space for easily annotating any LeRobotDataset with subtasks:
-
-**[https://huggingface.co/spaces/lerobot/annotate](https://huggingface.co/spaces/lerobot/annotate)**
-
-After completing your annotation:
-
-1. Click "Push to Hub" to upload your annotated dataset
-2. You can also run the annotation space locally by following the instructions at [github.com/huggingface/lerobot-annotate](https://github.com/huggingface/lerobot-annotate)
-
-## Loading Datasets with Subtasks
-
-When you load a dataset with subtask annotations, the subtask information is automatically available:
-
-```python
-from lerobot.datasets import LeRobotDataset
-
-# Load a dataset with subtask annotations
-dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
-
-# Access a sample
-sample = dataset[100]
-
-# The sample includes both task and subtask information
-print(sample["task"]) # "Collect the fruit"
-print(sample["subtask"]) # "Grasp the apple"
-print(sample["task_index"]) # tensor(0)
-print(sample["subtask_index"]) # tensor(2)
-```
-
-### Checking for Subtask Support
-
-You can check if a dataset has subtask annotations:
-
-```python
-# Check if subtasks are available
-has_subtasks = (
- "subtask_index" in dataset.features
- and dataset.meta.subtasks is not None
-)
-
-if has_subtasks:
- print(f"Dataset has {len(dataset.meta.subtasks)} unique subtasks")
- print("Subtasks:", list(dataset.meta.subtasks.index))
-```
-
-## Using Subtasks for Training
-
-### With the Tokenizer Processor
-
-The `TokenizerProcessor` automatically handles subtask tokenization for Vision-Language Action (VLA) models:
-
-```python
-from lerobot.processor import TokenizerProcessorStep
-
-# Create a tokenizer processor step
-tokenizer_processor = TokenizerProcessorStep(
- tokenizer_name_or_path="google/paligemma-3b-pt-224",
- padding="max_length",
- max_length=64,
-)
-
-# The processor will automatically tokenize subtasks if present in the batch
-# and add them to the observation under:
-# - "observation.subtask.tokens"
-# - "observation.subtask.attention_mask"
-```
-
-When subtasks are available in the batch, the tokenizer processor adds:
-
-- `observation.subtask.tokens`: Tokenized subtask text
-- `observation.subtask.attention_mask`: Attention mask for the subtask tokens
-
-### DataLoader with Subtasks
-
-```python
-import torch
-from lerobot.datasets import LeRobotDataset
-
-dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
-
-dataloader = torch.utils.data.DataLoader(
- dataset,
- batch_size=16,
- shuffle=True,
-)
-
-for batch in dataloader:
- # Access subtask information in the batch
- subtasks = batch["subtask"] # List of subtask strings
- subtask_indices = batch["subtask_index"] # Tensor of subtask indices
-
- # Use for training hierarchical policies or reward models
- print(f"Batch subtasks: {set(subtasks)}")
-```
-
-## Example Datasets with Subtask Annotations
-
-Try loading a dataset with subtask annotations:
-
-```python
-from lerobot.datasets import LeRobotDataset
-
-# Example dataset with subtask annotations
-dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
-
-# Explore the subtasks
-print("Available subtasks:")
-for subtask_name in dataset.meta.subtasks.index:
- print(f" - {subtask_name}")
-
-# Get subtask distribution
-subtask_counts = {}
-for i in range(len(dataset)):
- sample = dataset[i]
- subtask = sample["subtask"]
- subtask_counts[subtask] = subtask_counts.get(subtask, 0) + 1
-
-print("\nSubtask distribution:")
-for subtask, count in sorted(subtask_counts.items(), key=lambda x: -x[1]):
- print(f" {subtask}: {count} frames")
-```
-
-## Use Cases
-
-### 1. Hierarchical Policy Training
-
-Train policies that predict both actions and current subtask:
-
-```python
-class HierarchicalPolicy(nn.Module):
- def __init__(self, num_subtasks):
- super().__init__()
- self.action_head = nn.Linear(hidden_dim, action_dim)
- self.subtask_head = nn.Linear(hidden_dim, num_subtasks)
-
- def forward(self, observations):
- features = self.encoder(observations)
- actions = self.action_head(features)
- subtask_logits = self.subtask_head(features)
- return actions, subtask_logits
-```
-
-### 2. Stage-Aware Reward Modeling (SARM)
-
-Build reward models that understand task progression:
-
-```python
-# SARM predicts:
-# - Stage: Which subtask is being executed (discrete)
-# - Progress: How far along the subtask (continuous 0-1)
-
-class SARMRewardModel(nn.Module):
- def forward(self, observations):
- features = self.encoder(observations)
- stage_logits = self.stage_classifier(features)
- progress = self.progress_regressor(features)
- return stage_logits, progress
-```
-
-### 3. Progress Visualization
-
-Monitor robot execution by tracking subtask progression:
-
-```python
-def visualize_execution(model, observations):
- for t, obs in enumerate(observations):
- action, subtask_logits = model(obs)
- predicted_subtask = subtask_names[subtask_logits.argmax()]
- print(f"t={t}: Executing '{predicted_subtask}'")
-```
-
-## API Reference
-
-### LeRobotDataset Properties
-
-| Property | Type | Description |
-| --------------------------- | ---------------------- | ------------------------------------------ |
-| `meta.subtasks` | `pd.DataFrame \| None` | DataFrame mapping subtask names to indices |
-| `features["subtask_index"]` | `dict` | Feature spec for subtask_index if present |
-
-### Sample Keys
-
-When subtasks are available, each sample includes:
-
-| Key | Type | Description |
-| --------------- | -------------- | ------------------------------------ |
-| `subtask_index` | `torch.Tensor` | Integer index of the current subtask |
-| `subtask` | `str` | Natural language subtask description |
-
-## Related Resources
-
-- [SARM Paper](https://arxiv.org/pdf/2509.25358) - Stage-Aware Reward Modeling for Long Horizon Robot Manipulation
-- [LeRobot Annotate Space](https://huggingface.co/spaces/lerobot/annotate) - Interactive annotation tool
-- [LeRobotDataset v3.0](./lerobot-dataset-v3) - Dataset format documentation
diff --git a/docs/source/earthrover_mini_plus.mdx b/docs/source/earthrover_mini_plus.mdx
index a87bd325b..508c0e3a9 100644
--- a/docs/source/earthrover_mini_plus.mdx
+++ b/docs/source/earthrover_mini_plus.mdx
@@ -194,7 +194,7 @@ lerobot-record \
--dataset.single_task="Navigate around obstacles" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--display_data=true
```
diff --git a/docs/source/groot.mdx b/docs/source/groot.mdx
index 69f114ca6..a10b5e369 100644
--- a/docs/source/groot.mdx
+++ b/docs/source/groot.mdx
@@ -124,7 +124,7 @@ lerobot-rollout\
--dataset.single_task="Grab and handover the red cube to the other arm" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--policy.path=/groot-bimanual \ # your trained model
--duration=600
```
diff --git a/docs/source/hope_jr.mdx b/docs/source/hope_jr.mdx
index 8826d9758..1f3b08fd7 100644
--- a/docs/source/hope_jr.mdx
+++ b/docs/source/hope_jr.mdx
@@ -232,7 +232,7 @@ lerobot-record \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--display_data=true
```
@@ -278,6 +278,6 @@ lerobot-record \
--dataset.num_episodes=10 \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model
```
diff --git a/docs/source/il_robots.mdx b/docs/source/il_robots.mdx
index c5ed5be5b..dc2e02737 100644
--- a/docs/source/il_robots.mdx
+++ b/docs/source/il_robots.mdx
@@ -207,7 +207,7 @@ lerobot-record \
--dataset.num_episodes=5 \
--dataset.single_task="Grab the black cube" \
--dataset.streaming_encoding=true \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--dataset.encoder_threads=2
```
diff --git a/docs/source/language_and_recipes.mdx b/docs/source/language_and_recipes.mdx
new file mode 100644
index 000000000..4181dbe34
--- /dev/null
+++ b/docs/source/language_and_recipes.mdx
@@ -0,0 +1,147 @@
+# Language columns and recipes
+
+Most LeRobot datasets ship with a single `task` string per episode — fine for
+short, single-instruction skills, but not enough for the longer-horizon,
+multi-modal robot policies the field is moving toward (high-level planning,
+memory, interjections, VQA, tool use). To support those policies without
+forking the dataset format, LeRobot extends `LeRobotDataset` with two optional
+language columns and a small recipe layer that turns those rows into
+chat-style training samples on the fly.
+
+The design splits cleanly into three layers:
+
+1. **Data in the dataset** — language annotations stored next to frames in
+ `data/chunk-*/file-*.parquet` as two optional columns (`language_persistent`
+ and `language_events`). Datasets without these columns keep their existing
+ behavior.
+2. **Recipe** — a YAML file that declares which annotation rows to bind and
+ how to lay them out as chat turns (`role`, `content`, optional images,
+ optional tool calls). Recipes are pure config; no Python required to add a
+ new one.
+3. **Training format** — at sample time, `RenderMessagesStep` resolves the
+ recipe against the per-frame annotations and emits HF-style `messages` plus
+ LeRobot-specific sidecars (`message_streams`, `target_message_indices`)
+ that policy processors consume.
+
+This page describes each layer in turn.
+
+## Layer 1 — language columns in the dataset
+
+The two optional columns live next to frame data in
+`data/chunk-*/file-*.parquet`:
+
+- `language_persistent`: a list of rows broadcast across every frame in an episode for state that remains active, such as `subtask`, `plan`, and `memory`.
+- `language_events`: a list of rows stored only on the exact frame where an event was emitted, such as `interjection`, `vqa`, and speech tool calls.
+
+Both columns share the same row shape (event rows omit `timestamp` because the
+frame the row sits on already provides it):
+
+```text
+role: string
+content: string | null
+style: string | null
+timestamp: float32 # persistent rows only
+camera: string | null # observation.images.* feature key, view-dependent rows only
+tool_calls: list[Json] | null
+```
+
+The `camera` field tags rows whose `content` is grounded in a specific camera
+view. Rows of view-dependent styles (`vqa` and `trace`) MUST set `camera` to
+the matching `observation.images.*` feature key. Rows of every other style —
+including `motion`, which describes robot-frame primitives in joint / Cartesian
+terms — MUST leave `camera` as `null`. Pipeline writers and the validator
+enforce this via `validate_camera_field(style, camera)`.
+
+`meta/tasks.parquet` remains the canonical source for the task. The special `${task}` recipe binding always reads that task string and does not depend on language annotations.
+
+### Architecture
+
+The language stack itself has three internal modules backing layer 1:
+
+1. `lerobot.datasets.language` defines the schema, style registry, and `column_for_style`.
+2. `lerobot.datasets.language_render` resolves rows and renders messages.
+3. `RenderMessagesStep` turns dataset samples into `messages`, `message_streams`, and `target_message_indices`.
+
+`LeRobotDataset` stays recipe-agnostic. It passes `language_persistent` and `language_events` through when present, and unannotated datasets keep their existing behavior.
+
+## Layer 2 — recipe anatomy
+
+Recipes are YAML files backed by `TrainingRecipe` and `MessageTurn`. They
+declare which annotation rows to pull (via `bindings`) and how to compose them
+into chat turns (`messages`).
+
+```yaml
+messages:
+ - { role: user, content: "${task}", stream: high_level }
+ - { role: assistant, content: "${subtask}", stream: low_level, target: true }
+```
+
+A recipe can also branch into a weighted **blend** of sub-recipes. At sample
+time, exactly one branch is selected deterministically from the sample index,
+so different frames train different objectives (e.g. memory updates vs.
+low-level execution vs. VQA) without any Python wiring.
+
+### Temporal semantics
+
+Persistent styles are active after emission until replaced:
+
+- `active_at(t, style=subtask)`
+- `nth_prev(style=memory, offset=1)`
+- `nth_next(style=subtask, offset=1)`
+
+Event styles only exist on their exact timestamp:
+
+- `emitted_at(t, style=interjection)`
+- `emitted_at(t, style=vqa, role=user, camera=observation.images.top)`
+- `emitted_at(t, role=assistant, tool_name=say)`
+
+Exact event matching has no tolerance window, so writers must stamp event rows with frame timestamps from the parquet data.
+
+### View-dependent resolution
+
+For view-dependent styles (`vqa` and `trace`), the resolver gains a
+`camera=` filter parallel to `role=` and `tool_name=`. Datasets with multiple
+cameras typically emit one (`vqa`, `user`) + (`vqa`, `assistant`) pair per
+camera at the same timestamp; without `camera=`, those resolvers see two
+matches and raise an ambiguity error. Recipes consume each camera through its
+own binding plus a matching image block, e.g.
+
+```yaml
+ask_vqa_top:
+ bindings:
+ vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
+ vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
+ messages:
+ - role: user
+ stream: high_level
+ if_present: vqa_query
+ content:
+ - { type: image, feature: observation.images.top }
+ - { type: text, text: "${vqa_query}" }
+ - {
+ role: assistant,
+ content: "${vqa}",
+ stream: high_level,
+ target: true,
+ if_present: vqa,
+ }
+```
+
+Add one such sub-recipe per camera the dataset records.
+
+## Layer 3 — training format
+
+Rendered samples use HF-style chat messages plus LeRobot sidecars:
+
+```python
+sample["messages"]
+sample["message_streams"]
+sample["target_message_indices"]
+```
+
+The renderer does not apply a tokenizer chat template. Policy processors decide how to serialize the messages for their backbone, which keeps the same dataset usable across SmolVLA, Pi0.5, and any future VLM that expects OpenAI-style chat messages.
+
+## Graceful absence
+
+If both language columns are missing, `None`, or empty, `RenderMessagesStep` is a no-op.
+If an event-scoped branch is selected on a frame without the required event row, rendering returns `None`, allowing a loader to retry another sample.
diff --git a/docs/source/lerobot-dataset-v3.mdx b/docs/source/lerobot-dataset-v3.mdx
index 8ab4a5d40..c23677d8c 100644
--- a/docs/source/lerobot-dataset-v3.mdx
+++ b/docs/source/lerobot-dataset-v3.mdx
@@ -10,6 +10,7 @@ This docs will guide you to:
- Stream datasets without downloading using `StreamingLeRobotDataset`
- Apply image transforms for data augmentation during training
- Migrate existing `v2.1` datasets to `v3.0`
+- Experiment with other `LeRobotDataset` formats and implementations like Lance
## What’s new in `v3`
@@ -43,7 +44,7 @@ lerobot-record \
--dataset.num_episodes=5 \
--dataset.single_task="Grab the black cube" \
--dataset.streaming_encoding=true \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--dataset.encoder_threads=2
```
@@ -315,3 +316,39 @@ Dataset v3.0 uses incremental parquet writing with buffered metadata for efficie
- Ensures the dataset is valid for loading
Without calling `finalize()`, your parquet files will be incomplete and the dataset won't load properly.
+
+## Other formats and implementations
+
+### Lance
+
+Lance is a useful format for multimodal AI datasets, especially for large-scale training that requires high-performance I/O and random access.
+
+The `lerobot-lancedb` package implements `LeRobotLanceDataset` (for JPEG images) and `LeRobotLanceVideoDataset` (for mp4 videos).
+Both storage layouts subclass `LeRobotDataset` and can provide data-loading speedups.
+
+`LeRobotLanceDataset` is a drop-in replacement for `LeRobotDataset`:
+
+```python
+from lerobot.datasets import LeRobotDatasetMetadata
+from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
+from lerobot_lancedb import LeRobotLanceDataset, LeRobotLanceVideoDataset
+
+cfg = DiffusionConfig(...)
+meta = LeRobotDatasetMetadata(root=local_dataset_path) # or use repo_id=... to load metadata from the Hub
+delta_timestamps = {...}
+
+# Use LeRobotLanceDataset for image datasets
+dataset = LeRobotLanceDataset(
+ root=local_dataset_path, # or use repo_id=... to stream from the Hub
+ delta_timestamps=delta_timestamps,
+ return_uint8=True,
+)
+# Or use LeRobotLanceVideoDataset for video datasets:
+dataset = LeRobotLanceVideoDataset(
+ root=local_dataset_path, # or use repo_id=... to stream from the Hub
+ delta_timestamps=delta_timestamps,
+ return_uint8=True,
+)
+```
+
+Join the discussion on [GitHub](https://github.com/huggingface/lerobot/issues/3608) and explore the `lerobot-lancedb` documentation [here](https://lancedb.github.io/lerobot-lancedb/).
diff --git a/docs/source/reachy2.mdx b/docs/source/reachy2.mdx
index 1b868711a..4b08569db 100644
--- a/docs/source/reachy2.mdx
+++ b/docs/source/reachy2.mdx
@@ -161,7 +161,7 @@ lerobot-record \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--display_data=true
```
@@ -203,7 +203,7 @@ lerobot-record \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
- # --dataset.vcodec=auto \
+ # --dataset.camera_encoder.vcodec=auto \
--display_data=true
```
diff --git a/docs/source/rebot_b601.mdx b/docs/source/rebot_b601.mdx
new file mode 100644
index 000000000..adb751560
--- /dev/null
+++ b/docs/source/rebot_b601.mdx
@@ -0,0 +1,186 @@
+# reBot B601-DM
+
+[reBot B601-DM](https://wiki.seeedstudio.com/rebot_arm_b601_dm_lerobot/) is an open-source, low-cost robot arm from Seeed Studio for embodied-AI and imitation learning. It comes as a **follower** arm (the `B601-DM`, a 6-DOF arm plus gripper driven by Damiao CAN motors) and a **leader** arm (the `StarArm102` / `reBot Arm 102`, driven by FashionStar UART smart servos) used to teleoperate it.
+
+This page covers **calibration** and **teleoperation** for both single-arm and bimanual (dual-arm) setups.
+
+
+
+
+
+
+_Left: the B601-DM follower at its zero position. Right: the reBot Arm 102 leader at its zero position. Images courtesy of [Seeed Studio](https://wiki.seeedstudio.com/rebot_arm_b601_dm_lerobot/)._
+
+## Install LeRobot 🤗
+
+Follow our [Installation Guide](./installation), then install the reBot support:
+
+```bash
+pip install -e ".[rebot]"
+```
+
+This pulls in `motorbridge` (CAN motor control for the B601-DM follower) and `motorbridge-smart-servo` (FashionStar UART servos for the reBot Arm 102 leader).
+
+## Registered device types
+
+| Type | Kind |
+| ------------------------ | -------------------------------------------- |
+| `rebot_b601_follower` | single-arm B601-DM follower robot |
+| `bi_rebot_b601_follower` | bimanual (dual-arm) follower robot |
+| `rebot_102_leader` | single-arm reBot Arm 102 leader teleoperator |
+| `bi_rebot_102_leader` | bimanual (dual-arm) leader teleoperator |
+
+The bimanual types compose two single-arm instances and namespace each arm's
+observation/action keys with a `left_` / `right_` prefix. Per-arm settings are
+passed through nested `left_arm_config.*` / `right_arm_config.*` arguments.
+
+## Find the USB ports
+
+For each device, find the USB port associated with its motor bus using:
+
+```bash
+lerobot-find-port
+```
+
+
+ On Linux, remove `brltty` (`sudo apt remove brltty`) so it does not hold the
+ leader's USB serial port. You may also need to grant access to the serial
+ devices: `sudo chmod 666 /dev/ttyACM* /dev/ttyUSB*`.
+
+
+## Calibration
+
+Neither arm stores a persistent hardware calibration: every time it connects, the motors are re-zeroed against the pose the arm is physically holding. Calibration simply records that zero pose. When prompted, **manually move the arm to its zero position** (the default sit-down pose shown above, gripper fully closed) and press ENTER.
+
+### Follower (B601-DM)
+
+
+
+
+```bash
+lerobot-calibrate \
+ --robot.type=rebot_b601_follower \
+ --robot.port=/dev/ttyACM0 \
+ --robot.id=follower \
+ --robot.can_adapter=damiao
+```
+
+
+
+
+Connect the bimanual follower; calibration runs for the left arm, then the right arm.
+
+```bash
+lerobot-calibrate \
+ --robot.type=bi_rebot_b601_follower \
+ --robot.id=bi_follower \
+ --robot.left_arm_config.port=/dev/ttyACM0 \
+ --robot.left_arm_config.can_adapter=damiao \
+ --robot.right_arm_config.port=/dev/ttyACM1 \
+ --robot.right_arm_config.can_adapter=damiao
+```
+
+Per-arm calibration files are saved with `_left` / `_right` suffixes on the id.
+
+
+
+
+### Leader (reBot Arm 102)
+
+
+
+
+```bash
+lerobot-calibrate \
+ --teleop.type=rebot_102_leader \
+ --teleop.port=/dev/ttyUSB0 \
+ --teleop.id=leader
+```
+
+
+
+
+```bash
+lerobot-calibrate \
+ --teleop.type=bi_rebot_102_leader \
+ --teleop.id=bi_leader \
+ --teleop.left_arm_config.port=/dev/ttyUSB0 \
+ --teleop.right_arm_config.port=/dev/ttyUSB1
+```
+
+
+
+
+## Teleoperation
+
+Once both arms are calibrated, drive the follower with the leader. The follower talks to its CAN bus through a Damiao serial bridge (`can_adapter=damiao`, the default) or a SocketCAN adapter (`can_adapter=socketcan`). See the [OpenArm page](./openarm) for more details on the SocketCAN adapter configuration.
+
+
+
+
+```bash
+lerobot-teleoperate \
+ --robot.type=rebot_b601_follower \
+ --robot.port=/dev/ttyACM0 \
+ --robot.id=follower \
+ --robot.can_adapter=damiao \
+ --teleop.type=rebot_102_leader \
+ --teleop.port=/dev/ttyUSB0 \
+ --teleop.id=leader
+```
+
+
+
+
+The bimanual leader and follower reuse the single-arm classes; each arm is
+configured through nested `left_arm_config.*` / `right_arm_config.*` arguments,
+so a bimanual reBot Arm 102 leader drives a bimanual B601-DM follower.
+
+```bash
+lerobot-teleoperate \
+ --robot.type=bi_rebot_b601_follower \
+ --robot.id=bi_follower \
+ --robot.left_arm_config.port=/dev/ttyACM0 \
+ --robot.left_arm_config.can_adapter=damiao \
+ --robot.right_arm_config.port=/dev/ttyACM1 \
+ --robot.right_arm_config.can_adapter=damiao \
+ --teleop.type=bi_rebot_102_leader \
+ --teleop.id=bi_leader \
+ --teleop.left_arm_config.port=/dev/ttyUSB0 \
+ --teleop.right_arm_config.port=/dev/ttyUSB1
+```
+
+
+
+
+
+ The leader and follower share the same joint names (`shoulder_pan,
+ shoulder_lift, elbow_flex, wrist_flex, wrist_yaw, wrist_roll, gripper`), so
+ leader actions map directly onto the follower.
+
+
+If a joint moves in the wrong direction, flip its sign in the leader's `joint_directions`. The gripper entry additionally carries a scale factor (e.g. `-6` in the example below) that widens the leader's gripper range to match the follower's:
+
+```bash
+lerobot-teleoperate \
+ --robot.type=rebot_b601_follower \
+ --robot.port=/dev/ttyACM0 \
+ --robot.can_adapter=damiao \
+ --teleop.type=rebot_102_leader \
+ --teleop.port=/dev/ttyUSB0 \
+ --teleop.joint_directions='{"shoulder_pan":-1,"shoulder_lift":-1,"elbow_flex":1,"wrist_flex":1,"wrist_yaw":1,"wrist_roll":-1,"gripper":-6}'
+```
+
+## Recording datasets
+
+Swap `lerobot-teleoperate` for `lerobot-record` (with the same `--robot.*` / `--teleop.*` arguments, plus `--dataset.*`) to record demonstrations for training. See [Imitation Learning for Robots](./il_robots) for the full workflow.
+
+For hardware assembly and wiring, see the [Seeed Studio reBot wiki](https://wiki.seeedstudio.com/rebot_arm_b601_dm_lerobot/).
diff --git a/docs/source/streaming_video_encoding.mdx b/docs/source/streaming_video_encoding.mdx
index 40004200e..96e049eb3 100644
--- a/docs/source/streaming_video_encoding.mdx
+++ b/docs/source/streaming_video_encoding.mdx
@@ -17,9 +17,9 @@ This makes `save_episode()` near-instant (the video is already encoded by the ti
| Parameter | CLI Flag | Type | Default | Description |
| ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- |
| `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture |
-| `vcodec` | `--dataset.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder |
+| `vcodec` | `--dataset.camera_encoder.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder |
| `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` will leave the vcoded decide |
-| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM |
+| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `30` | Max buffered frames per camera (~1s at 30fps). Consumes RAM |
## 3. Performance Considerations
@@ -48,7 +48,7 @@ This parameter controls how many threads each encoder instance uses internally:
### Backpressure and Frame Dropping
-Each camera has a bounded queue (`encoder_queue_maxsize`, default 60 frames). When the encoder can't keep up:
+Each camera has a bounded queue (`encoder_queue_maxsize`, default 30 frames). When the encoder can't keep up:
1. The queue fills up (consuming RAM)
2. New frames are **dropped** (not blocked) — the capture loop continues uninterrupted
@@ -82,15 +82,15 @@ Use HW encoding when:
### Available HW Encoders
-| Encoder | Platform | Hardware | CLI Value |
-| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ |
-| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=h264_videotoolbox` |
-| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=hevc_videotoolbox` |
-| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=h264_nvenc` |
-| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=hevc_nvenc` |
-| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.vcodec=h264_vaapi` |
-| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.vcodec=h264_qsv` |
-| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto` |
+| Encoder | Platform | Hardware | CLI Value |
+| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | --------------------------------------------------- |
+| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=h264_videotoolbox` |
+| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.camera_encoder.vcodec=hevc_videotoolbox` |
+| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=h264_nvenc` |
+| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.camera_encoder.vcodec=hevc_nvenc` |
+| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.camera_encoder.vcodec=h264_vaapi` |
+| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.camera_encoder.vcodec=h264_qsv` |
+| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.camera_encoder.vcodec=auto` |
> [!NOTE]
> In order to use the HW accelerated encoders you might need to upgrade your GPU drivers.
@@ -100,15 +100,15 @@ Use HW encoding when:
## 5. Troubleshooting
-| Symptom | Likely Cause | Fix |
-| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) |
-| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`). |
-| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding |
-| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows |
-| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` |
-| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.vcodec=auto` |
-| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. |
+| Symptom | Likely Cause | Fix |
+| ------------------------------------------------------------------ | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.camera_encoder.vcodec=auto`) |
+| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.camera_encoder.vcodec=auto`). |
+| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding |
+| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows |
+| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` |
+| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.camera_encoder.vcodec=auto` |
+| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. |
## 6. Recommended Configurations
@@ -146,7 +146,7 @@ On very constrained systems, streaming encoding may compete too heavily with the
# 2camsx 640x480x3 @30fps: Requires some tuning.
# Use H.264, disable streaming, consider batching encoding
-lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ...
+lerobot-record --dataset.camera_encoder.vcodec=h264 --dataset.streaming_encoding=false ...
```
## 7. Closing note
diff --git a/docs/source/tools.mdx b/docs/source/tools.mdx
new file mode 100644
index 000000000..d88881184
--- /dev/null
+++ b/docs/source/tools.mdx
@@ -0,0 +1,210 @@
+# Tools
+
+LeRobot v3.1 supports **tool calls** in policies — assistant messages can
+emit structured invocations like `say(text="OK, starting now")` that the
+runtime dispatches to a real implementation (TTS, controller, logger, …).
+
+This page covers:
+
+1. Where the tool catalog lives.
+2. How the annotation pipeline produces tool-call atoms.
+3. How to add your own tool.
+
+## Where tools are declared
+
+Two layers.
+
+**The catalog** — a list of OpenAI-style function schemas — lives at
+`meta/info.json["tools"]` on each dataset. Example:
+
+```json
+{
+ "features": { "...": "..." },
+ "tools": [
+ {
+ "type": "function",
+ "function": {
+ "name": "say",
+ "description": "Speak a short utterance to the user via the TTS executor.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "text": {
+ "type": "string",
+ "description": "The verbatim text to speak."
+ }
+ },
+ "required": ["text"]
+ }
+ }
+ }
+ ]
+}
+```
+
+Read it via the dataset metadata accessor:
+
+```python
+from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
+
+meta = LeRobotDatasetMetadata(repo_id="pepijn/super_poulain_final_annotations")
+tools = meta.tools # list[dict] — OpenAI tool schemas
+```
+
+If the dataset's `info.json` doesn't declare any tools, `meta.tools`
+returns `DEFAULT_TOOLS` from `lerobot.datasets.language` — currently a
+single-entry list with the canonical `say` schema. So unannotated
+datasets and chat-template consumers keep working without any
+configuration:
+
+```python
+prompt_str = tokenizer.apply_chat_template(
+ sample["messages"],
+ tools=meta.tools, # works either way
+ add_generation_prompt=False,
+ tokenize=False,
+)
+```
+
+**The implementations** — runnable Python — will live under
+`src/lerobot/tools/`, one file per tool. The runtime dispatcher and
+the canonical `say` implementation (wrapping Kyutai's pocket-tts) are
+not part of the catalog layer described here; today this layer ships
+only the schema storage and the `DEFAULT_TOOLS` fallback constant.
+
+## Per-row tool _invocations_
+
+The catalog above describes _what can be called_. The actual _call_ — the
+function name plus the argument values — is stored per-row, on the
+assistant atoms in `language_events`:
+
+```json
+{
+ "role": "assistant",
+ "content": null,
+ "style": null,
+ "timestamp": 12.4,
+ "camera": null,
+ "tool_calls": [
+ { "type": "function",
+ "function": { "name": "say", "arguments": { "text": "On it." } } }
+ ]
+}
+```
+
+Recipes splice these into rendered messages via `tool_calls_from`:
+
+```yaml
+user_interjection_response:
+ bindings:
+ speech: "emitted_at(t, role=assistant, tool_name=say)"
+ messages:
+ - { role: user, content: "${task}", stream: high_level }
+ - {
+ role: assistant,
+ content: "${current_plan}",
+ stream: high_level,
+ target: true,
+ tool_calls_from: speech,
+ }
+```
+
+The model's training target is one assistant turn that carries both the
+plan text _and_ the `say` tool call. At inference, the runtime parses
+the generated text back into structured `tool_calls` and dispatches to
+the matching implementation.
+
+## How to add your own tool
+
+> **Note:** Steps 2 and 3 below describe the runtime layer
+> (`src/lerobot/tools/`, the `Tool` protocol, `TOOL_REGISTRY`,
+> `get_tools(meta)`) which is not part of the catalog layer shipped
+> today — those modules don't yet exist in the tree. Step 1 alone is
+> enough to make the tool visible to the chat template via
+> `meta.tools` so the model can learn to _generate_ the call;
+> executing the call at inference requires the runtime layer.
+
+Three steps. Concrete example: a `record_observation` tool the policy
+can call to capture an extra observation outside the regular control
+loop.
+
+### Step 1 — declare the schema
+
+Add an entry under `meta/info.json["tools"]`. Either edit the file
+directly on disk _before_ running the annotation pipeline (it'll be
+preserved) or hand it to `lerobot-annotate` via a config flag.
+
+```json
+{
+ "tools": [
+ { "type": "function", "function": { "name": "say", "...": "..." } },
+ {
+ "type": "function",
+ "function": {
+ "name": "record_observation",
+ "description": "Capture a high-resolution still image for the user.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "label": {
+ "type": "string",
+ "description": "Short label for the saved image."
+ }
+ },
+ "required": ["label"]
+ }
+ }
+ }
+ ]
+}
+```
+
+The schema follows OpenAI's function-calling convention exactly, so the
+chat template can render it natively.
+
+### Step 2 — implement the call
+
+Create `src/lerobot/tools/record_observation.py`:
+
+```python
+from .base import Tool
+from typing import Any
+
+RECORD_OBSERVATION_SCHEMA: dict[str, Any] = { "...": "..." } # mirrors the JSON above
+
+
+class RecordObservationTool:
+ name = "record_observation"
+ schema = RECORD_OBSERVATION_SCHEMA
+
+ def __init__(self, schema: dict | None = None, output_dir: str = "."):
+ self.output_dir = output_dir
+
+ def call(self, arguments: dict) -> str:
+ label = arguments["label"]
+ # ... save the latest camera frame to <output_dir>/<label>.png ...
+ return f"saved {label}.png"
+```
+
+One file per tool keeps dependencies isolated — `record_observation`
+might pull `pillow`, while `say` pulls `pocket-tts`. Users installing
+only the tools they need avoid heavy transitive deps.
+
+### Step 3 — register it
+
+Add to `src/lerobot/tools/registry.py`:
+
+```python
+from .record_observation import RecordObservationTool
+
+TOOL_REGISTRY["record_observation"] = RecordObservationTool
+```
+
+That's it. At runtime `get_tools(meta)` looks up each schema in
+`meta.tools`, instantiates the matching registered class, and returns
+a name → instance dict the dispatcher can route into.
+
+If you want to use a tool _without_ writing an implementation (e.g. for
+training-time chat-template formatting only), step 1 alone is enough —
+the model still learns to _generate_ the call. Steps 2 and 3 are only
+needed to actually _execute_ it at inference.
diff --git a/docs/source/using_dataset_tools.mdx b/docs/source/using_dataset_tools.mdx
index f7fc9be20..49247a6c1 100644
--- a/docs/source/using_dataset_tools.mdx
+++ b/docs/source/using_dataset_tools.mdx
@@ -117,10 +117,10 @@ lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
--operation.type convert_image_to_video \
--operation.output_dir outputs/pusht_video \
- --operation.vcodec libsvtav1 \
- --operation.pix_fmt yuv420p \
- --operation.g 2 \
- --operation.crf 30
+ --operation.camera_encoder.vcodec libsvtav1 \
+ --operation.camera_encoder.pix_fmt yuv420p \
+ --operation.camera_encoder.g 2 \
+ --operation.camera_encoder.crf 30
# Convert only specific episodes
lerobot-edit-dataset \
@@ -147,11 +147,7 @@ lerobot-edit-dataset \
**Parameters:**
- `output_dir`: Custom output directory (optional - by default uses `new_repo_id` or `{repo_id}_video`)
-- `vcodec`: Video codec to use - options: `h264`, `hevc`, `libsvtav1` (default: `libsvtav1`)
-- `pix_fmt`: Pixel format - options: `yuv420p`, `yuv444p` (default: `yuv420p`)
-- `g`: Group of pictures (GOP) size - lower values give better quality but larger files (default: 2)
-- `crf`: Constant rate factor - lower values give better quality but larger files, 0 is lossless (default: 30)
-- `fast_decode`: Fast decode tuning option (default: 0)
+- `camera_encoder`: Video encoder settings — all sub-fields are accessible via `--operation.camera_encoder.<field>`. See [Video Encoding Parameters](./video_encoding_parameters) for more details.
- `episode_indices`: List of specific episodes to convert (default: all episodes)
- `num_workers`: Number of parallel workers for processing (default: 4)
diff --git a/docs/source/video_encoding_parameters.mdx b/docs/source/video_encoding_parameters.mdx
new file mode 100644
index 000000000..0b5b99b2b
--- /dev/null
+++ b/docs/source/video_encoding_parameters.mdx
@@ -0,0 +1,117 @@
+# Video encoding parameters
+
+When video storage is enabled, LeRobot stores each camera stream as an **MP4** file instead of saving one image file per timestep. Video encoding compresses across time, which usually reduces dataset size and I/O compared to storing one PNG file per frame, while MP4 remains a format that virtually every player and loader understands.
+
+Encoding frames into an MP4 is a full FFmpeg pipeline: choice of encoder, pixel format, GOP/keyframes, quality vs. speed, and optional extra encoder flags. Most of these knobs are user-tunable through `camera_encoder`, a nested `VideoEncoderConfig` (`lerobot.configs.video.VideoEncoderConfig`) passed through PyAV.
+
+You can set these parameters from the CLI with `--dataset.camera_encoder.<field>=<value>` (e.g. with `lerobot-record` or `lerobot-rollout`). The same block applies to every camera video stream in that run.
+
+<Tip>
+ Video storage must be on for `camera_encoder` to have any effect —
+ `use_videos=True` in Python APIs, or `--dataset.video=true` on the CLI (the
+ recording default). With video off, inputs stay as images and `camera_encoder`
+ is ignored.
+</Tip>
+
+For details on **when** frames are written vs. encoded (streaming vs. post-episode), queues, and other top-level `--dataset.*` switches, see [Streaming Video Encoding](./streaming_video_encoding). For an encoding-parameter comparison and experiments, see the [video-benchmark Space](https://huggingface.co/spaces/lerobot/video-benchmark).
+
+---
+
+## Example
+
+```bash
+lerobot-record \
+ --robot.type=so100_follower \
+ --robot.port=/dev/tty.usbmodem58760431541 \
+ --robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
+ --robot.id=black \
+ --teleop.type=so100_leader \
+ --teleop.port=/dev/tty.usbmodem58760431551 \
+ --teleop.id=blue \
+ --dataset.repo_id=<hf_username>/<dataset_repo_id> \
+ --dataset.num_episodes=2 \
+ --dataset.single_task="Grab the cube" \
+ --dataset.streaming_encoding=true \
+ --dataset.encoder_threads=2 \
+ --dataset.camera_encoder.vcodec=h264 \
+ --dataset.camera_encoder.preset=fast \
+ --dataset.camera_encoder.extra_options='{"tune": "film", "profile:v": "high", "bf": 2}' \
+ --display_data=true
+```
+
+---
+
+## Tuning parameters
+
+<Tip warning={true}>
+The defaults are tuned to balance **compression ratio**, **visual quality**, and **decoding/seek speed** for typical robotics datasets. Changing them can affect both recording (CPU load, frame drops) and training (decoding throughput, image quality).
+
+Only override these parameters if you have a specific reason to, and measure the impact on your pipeline before relying on the new settings.
+
+</Tip>
+
+All flags below are prefixed with `--dataset.camera_encoder.` on the CLI.
+
+| Parameter | Type | Default | Description |
+| --------------- | ---------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vcodec` | `str` | `"libsvtav1"` | Video codec name. `"auto"` picks the first available hardware encoder from a fixed preference list, falling back to `libsvtav1`. |
+| `pix_fmt` | `str` | `"yuv420p"` | Output pixel format. Must be supported by the chosen codec in your FFmpeg build. |
+| `g` | `int` | `2` | GOP size — a keyframe every `g` frames. Emitted as FFmpeg option `g`. |
+| `crf` | `int` or `float` | `30` | Abstract quality value, mapped per codec (see the [mapping](#mapping-videoencoderconfig--ffmpeg-options) below). Lower → higher quality / larger output where the mapping is monotone. |
+| `preset` | `int` or `str` | `12` \* | Encoder speed preset; meaning depends on the codec. \* When unset and `vcodec=libsvtav1`, LeRobot defaults to `12`. |
+| `fast_decode` | `int` | `0` | `libsvtav1`: `0–2`, passed via `svtav1-params`. `h264` / `hevc` (software): if `>0`, sets `tune=fastdecode`. Other codecs: usually unused. |
+| `video_backend` | `str` | `"pyav"` | Only `"pyav"` is currently implemented for video encoding. |
+| `extra_options` | `dict` | `{}` | Extra FFmpeg or codec specific options merged after the structured fields above. Cannot override keys already set by those fields. |
+
+---
+
+## Persistence in dataset metadata
+
+After the first episode of a video stream is encoded, the encoder configuration is **persisted into the dataset metadata** (`meta/info.json`) under each video feature, alongside the values probed from the file itself. For a video feature `observation.images.<camera_name>`, the layout in `info.json` is:
+
+```json
+{
+ "features": {
+ "observation.images.laptop": {
+ "dtype": "video",
+ "shape": [480, 640, 3],
+ "info": {
+ "video.height": 480,
+ "video.width": 640,
+ "video.codec": "h264",
+ "video.pix_fmt": "yuv420p",
+ "video.fps": 30,
+ "video.channels": 3,
+ "video.is_depth_map": false,
+ "video.g": 2,
+ "video.crf": 30,
+ "video.preset": "fast",
+ "video.fast_decode": 0,
+ "video.video_backend": "pyav",
+ "video.extra_options": { "tune": "film", "profile:v": "high", "bf": 2 }
+ }
+ }
+ }
+}
+```
+
+Two sources contribute to the `info` block:
+
+- **Stream-derived** (read back from the encoded MP4 with PyAV): `video.height`, `video.width`, `video.codec`, `video.pix_fmt`, `video.fps`, `video.channels`, `video.is_depth_map`, plus `audio.*` if an audio stream is present.
+- **Encoder-derived** (taken from `VideoEncoderConfig`): `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.video_backend`, `video.extra_options`.
+
+<Tip>
+ This block is populated **once**, from the **first** episode. It assumes every
+ episode in the dataset was encoded with the same `camera_encoder`. Changing
+ encoder settings partway through a recording is not supported — the
+ `info.json` will only reflect the parameters used for the first episode.
+</Tip>
+
+---
+
+## Merging datasets
+
+When aggregating datasets with `merge_datasets`, video files are concatenated as-is (no re-encoding), and encoder fields in `info.json` are merged per-key:
+
+- **Stream-derived fields must match** across sources: `video.codec`, `video.pix_fmt`, `video.height`, `video.width`, `video.fps`. Otherwise FFmpeg's concat demuxer fails.
+- **Encoder-tuning fields are merged loosely**: `video.g`, `video.crf`, `video.preset`, `video.fast_decode`, `video.extra_options`. If every source agrees, the value is kept; if not, it's set to `null` (or `{}` for `video.extra_options`) and a warning is logged.
diff --git a/examples/dataset/create_progress_videos.py b/examples/dataset/create_progress_videos.py
index 5f98d2cea..cb85a9d3a 100644
--- a/examples/dataset/create_progress_videos.py
+++ b/examples/dataset/create_progress_videos.py
@@ -15,10 +15,12 @@
# limitations under the License.
"""
-Create MP4 (or GIF) videos with sarm_progress overlay for specified episodes.
+Create MP4 (or GIF) videos with per-frame progress overlay for specified episodes.
Downloads datasets from HuggingFace, seeks directly into the episode segment
of the source video, draws a progress line on each frame, and writes the result.
+The progress data is read from a parquet file that lives alongside the dataset
+(configurable via ``--progress-file``).
Usage:
python examples/dataset/create_progress_videos.py \
@@ -56,22 +58,26 @@ SCORE_FONT_SCALE = 0.8
TASK_FONT_SCALE = 0.55
-def download_episode_metadata(repo_id: str, episode: int) -> Path:
- """Download only the metadata and sarm_progress files for a dataset.
+def download_episode_metadata(
+ repo_id: str, episode: int, progress_file: str = "sarm_progress.parquet"
+) -> Path:
+ """Download only the metadata and per-frame progress file for a dataset.
Args:
repo_id: HuggingFace dataset repository ID.
episode: Episode index (used for logging only; all meta is fetched).
+ progress_file: Filename of the per-frame progress parquet inside the
+ dataset repo.
Returns:
Local cache path for the downloaded snapshot.
"""
- logging.info("[1/4] Downloading metadata for %s (episode %d) ...", repo_id, episode)
+ logging.info("[1/4] Downloading metadata + %s for %s (episode %d) ...", progress_file, repo_id, episode)
local_path = Path(
snapshot_download(
repo_id=repo_id,
repo_type="dataset",
- allow_patterns=["meta/**", "sarm_progress.parquet"],
+ allow_patterns=["meta/**", progress_file],
ignore_patterns=["*.mp4"],
)
)
@@ -215,25 +221,28 @@ def download_video_file(repo_id: str, local_path: Path, video_rel: str) -> Path:
return video_path
-def load_progress_data(local_path: Path, episode: int) -> np.ndarray | None:
- """Load sarm_progress values for an episode.
+def load_progress_data(
+ local_path: Path, episode: int, progress_file: str = "sarm_progress.parquet"
+) -> np.ndarray | None:
+ """Load per-frame progress values for an episode.
Args:
local_path: Dataset cache root.
episode: Episode index.
+ progress_file: Filename of the per-frame progress parquet.
Returns:
Sorted (N, 2) array of (frame_index, progress), or None if unavailable.
"""
- parquet_path = local_path / "sarm_progress.parquet"
+ parquet_path = local_path / progress_file
if not parquet_path.exists():
- logging.warning("sarm_progress.parquet not found")
+ logging.warning("%s not found", progress_file)
return None
df = pd.read_parquet(parquet_path)
- logging.info(" sarm_progress.parquet columns: %s", list(df.columns))
+ logging.info(" %s columns: %s", progress_file, list(df.columns))
episode_df = df[df["episode_index"] == episode].copy()
if episode_df.empty:
- logging.warning("No sarm_progress rows for episode %d", episode)
+ logging.warning("No progress rows for episode %d in %s", episode, progress_file)
return None
episode_df = episode_df.sort_values("frame_index")
@@ -576,6 +585,7 @@ def process_dataset(
camera_key: str | None,
output_dir: Path,
create_gif: bool = False,
+ progress_file: str = "sarm_progress.parquet",
) -> Path | None:
"""Full pipeline: download, extract metadata, composite progress, write output.
@@ -585,6 +595,8 @@ def process_dataset(
camera_key: Camera key to use, or None for auto-selection.
output_dir: Directory to write output files.
create_gif: If True, also generate a GIF from the MP4.
+ progress_file: Filename of the per-frame progress parquet inside the
+ dataset repo.
Returns:
Path to the final output file, or None on failure.
@@ -592,7 +604,7 @@ def process_dataset(
safe_name = repo_id.replace("/", "_")
logging.info("Processing: %s | episode %d", repo_id, episode)
- local_path = download_episode_metadata(repo_id, episode)
+ local_path = download_episode_metadata(repo_id, episode, progress_file)
logging.info(" Local cache: %s", local_path)
episode_meta = load_episode_meta(local_path, episode, camera_key)
@@ -600,9 +612,9 @@ def process_dataset(
video_path = download_video_file(repo_id, local_path, episode_meta["video_rel"])
- progress_data = load_progress_data(local_path, episode)
+ progress_data = load_progress_data(local_path, episode, progress_file)
if progress_data is None:
- logging.error("Could not load sarm_progress data. Skipping overlay.")
+ logging.error("Could not load progress data from %s. Skipping overlay.", progress_file)
return None
logging.info(" Progress frames: %d", len(progress_data))
@@ -627,7 +639,7 @@ def process_dataset(
def main() -> None:
parser = argparse.ArgumentParser(
- description="Create MP4/GIF videos with sarm_progress overlay for dataset episodes."
+ description="Create MP4/GIF videos with per-frame progress overlay for dataset episodes."
)
parser.add_argument(
"--repo-id",
@@ -658,6 +670,15 @@ def main() -> None:
action="store_true",
help="Also generate a GIF from the MP4 output.",
)
+ parser.add_argument(
+ "--progress-file",
+ type=str,
+ default="sarm_progress.parquet",
+ help=(
+ "Filename of the per-frame progress parquet inside the dataset repo "
+ "(default: 'sarm_progress.parquet')."
+ ),
+ )
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
@@ -670,6 +691,7 @@ def main() -> None:
camera_key=args.camera_key,
output_dir=args.output_dir,
create_gif=args.gif,
+ progress_file=args.progress_file,
)
if result:
diff --git a/examples/notebooks/quickstart.ipynb b/examples/notebooks/quickstart.ipynb
index 647b79506..16034a687 100644
--- a/examples/notebooks/quickstart.ipynb
+++ b/examples/notebooks/quickstart.ipynb
@@ -80,7 +80,7 @@
"}\n",
"\n",
"# Dataset\n",
- "HF_USER = \"your_hf_username\" # `huggingface-cli whoami` to find your username\n",
+ "HF_USER = \"your_hf_username\" # `hf auth whoami` to find your username\n",
"DATASET_NAME = \"my_so101_dataset\"\n",
"TASK_DESCRIPTION = \"pick and place the block\"\n",
"NUM_EPISODES = 10\n",
@@ -291,7 +291,34 @@
"\n",
"Uses `POLICY_PATH` from the Configuration cell (defaults to the Hub repo ID). You can also put there the `LAST_CHECKPOINT_PATH`.\n",
"\n",
- "See the [inference docs](https://huggingface.co/docs/lerobot/il_robots#run-inference-and-evaluate-your-policy) for details."
+ "See the [inference docs](https://huggingface.co/docs/lerobot/il_robots#run-inference-and-evaluate-your-policy) for details.\n",
+ "\n",
+ "Recently, `lerobot-rollout` was introduced; you can [read more about it here](https://huggingface.co/docs/lerobot/main/en/il_robots?eval=Base+mode+%28no+recording%29#run-inference-and-evaluate-your-policy)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print_cmd(\n",
+ " \"lerobot-rollout\",\n",
+ " \"--strategy.type=base\",\n",
+ " f\"--policy.path={POLICY_PATH}\",\n",
+ " f\"--robot.type={ROBOT_TYPE}\",\n",
+ " f\"--robot.port={ROBOT_PORT}\",\n",
+ " CAMERAS_FLAG,\n",
+ " f'--task=\"{TASK_DESCRIPTION}\"',\n",
+ " \"--duration=60\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you are using the v0.5.1 release, use `lerobot-record` instead of `lerobot-rollout`."
]
},
{
diff --git a/pyproject.toml b/pyproject.toml
index f983134ab..ca6248c95 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,7 +95,7 @@ dependencies = [
# ── Feature-scoped extras ──────────────────────────────────
dataset = [
- "datasets>=4.0.0,<5.0.0",
+ "datasets>=4.7.0,<5.0.0",
"pandas>=2.0.0,<3.0.0", # NOTE: Transitive dependency of datasets
"pyarrow>=21.0.0,<30.0.0", # NOTE: Transitive dependency of datasets
"lerobot[av-dep]",
@@ -151,6 +151,8 @@ pyserial-dep = ["pyserial>=3.5,<4.0"]
deepdiff-dep = ["deepdiff>=7.0.1,<9.0.0"]
pynput-dep = ["pynput>=1.7.8,<1.9.0"]
pyzmq-dep = ["pyzmq>=26.2.1,<28.0.0"]
+motorbridge-dep = ["motorbridge>=0.3.2,<0.4.0"]
+motorbridge-smart-servo-dep = ["motorbridge-smart-servo>=0.0.4,<0.1.0"]
# Motors
feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0", "lerobot[pyserial-dep]", "lerobot[deepdiff-dep]"]
@@ -174,6 +176,9 @@ unitree_g1 = [
"lerobot[pygame-dep]",
]
reachy2 = ["reachy2_sdk>=1.0.15,<1.1.0"]
+# Seeed Studio reBot B601-DM follower (motorbridge / CAN) + StarArm102 / reBot Arm 102
+# leader (motorbridge-smart-servo / FashionStar UART servos).
+rebot = ["lerobot[motorbridge-dep]", "lerobot[motorbridge-smart-servo-dep]"]
kinematics = ["lerobot[placo-dep]"]
intelrealsense = [
"pyrealsense2>=2.55.1.6486,<2.57.0 ; sys_platform != 'darwin'",
@@ -260,6 +265,7 @@ all = [
"lerobot[lekiwi]",
"lerobot[openarms]",
"lerobot[reachy2]",
+ "lerobot[rebot]",
"lerobot[kinematics]",
"lerobot[intelrealsense]",
"lerobot[diffusion]",
diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py
index f3289ddc7..3e92eaf06 100644
--- a/src/lerobot/cameras/opencv/camera_opencv.py
+++ b/src/lerobot/cameras/opencv/camera_opencv.py
@@ -199,12 +199,13 @@ class OpenCVCamera(Camera):
DeviceNotConnectedError: If the camera is not connected.
"""
- # Set FOURCC first (if specified) as it can affect available FPS/resolution options
- if self.config.fourcc is not None:
- self._validate_fourcc()
if self.videocapture is None:
raise DeviceNotConnectedError(f"{self} videocapture is not initialized")
+ set_fourcc_after_size_and_fps = platform.system() == "Windows"
+ if self.config.fourcc is not None and not set_fourcc_after_size_and_fps:
+ self._validate_fourcc()
+
default_width = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_WIDTH)))
default_height = int(round(self.videocapture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
@@ -222,6 +223,11 @@ class OpenCVCamera(Camera):
else:
self._validate_fps()
+ if self.config.fourcc is not None and set_fourcc_after_size_and_fps:
+ # On Windows with DSHOW, changing the resolution can silently override the FOURCC setting.
+ # Set FOURCC last to make sure the requested pixel format is actually enforced.
+ self._validate_fourcc()
+
def _validate_fps(self) -> None:
"""Validates and sets the camera's frames per second (FPS)."""
diff --git a/src/lerobot/configs/__init__.py b/src/lerobot/configs/__init__.py
index ab74c3cd3..be4491811 100644
--- a/src/lerobot/configs/__init__.py
+++ b/src/lerobot/configs/__init__.py
@@ -24,6 +24,7 @@ Import them directly: ``from lerobot.configs.train import TrainPipelineConfig``
from .dataset import DatasetRecordConfig
from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig
from .policies import PreTrainedConfig
+from .recipe import MessageTurn, TrainingRecipe, load_recipe
from .types import (
FeatureType,
NormalizationMode,
@@ -31,6 +32,12 @@ from .types import (
PolicyFeature,
RTCAttentionSchedule,
)
+from .video import (
+ VALID_VIDEO_CODECS,
+ VIDEO_ENCODER_INFO_KEYS,
+ VideoEncoderConfig,
+ camera_encoder_defaults,
+)
__all__ = [
# Types
@@ -43,7 +50,16 @@ __all__ = [
"DatasetRecordConfig",
"DatasetConfig",
"EvalConfig",
+ "MessageTurn",
"PeftConfig",
"PreTrainedConfig",
+ "TrainingRecipe",
"WandBConfig",
+ "load_recipe",
+ "VideoEncoderConfig",
+ # Defaults
+ "camera_encoder_defaults",
+ # Constants
+ "VALID_VIDEO_CODECS",
+ "VIDEO_ENCODER_INFO_KEYS",
]
diff --git a/src/lerobot/configs/dataset.py b/src/lerobot/configs/dataset.py
index e3e17e62b..d5c6fa312 100644
--- a/src/lerobot/configs/dataset.py
+++ b/src/lerobot/configs/dataset.py
@@ -14,10 +14,12 @@
"""Shared dataset recording configuration used by both ``lerobot-record`` and ``lerobot-rollout``."""
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
+from .video import VideoEncoderConfig, camera_encoder_defaults
+
@dataclass
class DatasetRecordConfig:
@@ -55,10 +57,9 @@ class DatasetRecordConfig:
# Number of episodes to record before batch encoding videos
# Set to 1 for immediate encoding (default behavior), or higher for batched encoding
video_encoding_batch_size: int = 1
- # Video codec for encoding videos. Options: 'h264', 'hevc', 'libsvtav1', 'auto',
- # or hardware-specific: 'h264_videotoolbox', 'h264_nvenc', 'h264_vaapi', 'h264_qsv'.
- # Use 'auto' to auto-detect the best available hardware encoder.
- vcodec: str = "libsvtav1"
+ # Video encoder settings for camera MP4s (codec, quality, GOP, etc.). Tuned via CLI nested keys,
+ # e.g. ``--dataset.camera_encoder.vcodec=h264`` (see ``VideoEncoderConfig``).
+ camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
# Enable streaming video encoding: encode frames in real-time during capture instead
# of writing PNG images first. Makes save_episode() near-instant. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding
streaming_encoding: bool = False
diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py
index b1eebba94..b809e71d9 100644
--- a/src/lerobot/configs/default.py
+++ b/src/lerobot/configs/default.py
@@ -17,7 +17,7 @@
from dataclasses import dataclass, field
from lerobot.transforms import ImageTransformsConfig
-from lerobot.utils.import_utils import get_safe_default_codec
+from lerobot.utils.import_utils import get_safe_default_video_backend
@dataclass
@@ -34,7 +34,7 @@ class DatasetConfig:
image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
revision: str | None = None
use_imagenet_stats: bool = True
- video_backend: str = field(default_factory=get_safe_default_codec)
+ video_backend: str = field(default_factory=get_safe_default_video_backend)
# When True, video frames are returned as uint8 tensors (0-255) instead of float32 (0.0-1.0).
# This reduces memory and speeds up DataLoader IPC. The training pipeline handles the conversion.
return_uint8: bool = False
diff --git a/src/lerobot/configs/eval.py b/src/lerobot/configs/eval.py
index f2a1d3065..c285025ad 100644
--- a/src/lerobot/configs/eval.py
+++ b/src/lerobot/configs/eval.py
@@ -18,8 +18,8 @@ from logging import getLogger
from pathlib import Path
from lerobot import envs, policies # noqa: F401
-from lerobot.configs import parser
+from . import parser
from .default import EvalConfig
from .policies import PreTrainedConfig
diff --git a/src/lerobot/configs/recipe.py b/src/lerobot/configs/recipe.py
new file mode 100644
index 000000000..28e5a0db3
--- /dev/null
+++ b/src/lerobot/configs/recipe.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Literal, get_args
+
+MessageRole = Literal["user", "assistant", "system", "tool"]
+MessageStream = Literal["high_level", "low_level"]
+
+DEFAULT_BINDINGS = {
+ "subtask": "active_at(t, style=subtask)",
+ "memory": "active_at(t, style=memory)",
+ "plan": "active_at(t, style=plan)",
+ "speech": "emitted_at(t, role=assistant, tool_name=say)",
+ "interjection": "emitted_at(t, style=interjection)",
+ "vqa": "emitted_at(t, style=vqa, role=assistant)",
+ "vqa_query": "emitted_at(t, style=vqa, role=user)",
+}
+
+PLACEHOLDER_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
+"""``${name}`` placeholder pattern used by both recipe binding-reference
+discovery (here) and rendered-message substitution (in ``language_render``)."""
+
+_VALID_ROLES = frozenset(get_args(MessageRole))
+_VALID_STREAMS = frozenset(get_args(MessageStream))
+
+
+@dataclass
+class MessageTurn:
+    """A single chat-style turn in a recipe template.
+
+    ``content`` may be a plain string, a list of HF-style multimodal blocks, or
+    ``None`` when ``tool_calls_from`` supplies tool-call payloads instead.
+    ``stream`` tags the turn for downstream filtering, ``target`` flags it as a
+    training target, and ``if_present`` skips the turn when the named binding
+    resolves to ``None``.
+    """
+
+    role: MessageRole
+    content: str | list[dict[str, Any]] | None = None
+    stream: MessageStream | None = None
+    target: bool = False
+    if_present: str | None = None
+    tool_calls_from: str | None = None
+
+    def __post_init__(self) -> None:
+        """Validate role, stream and content; raises ``ValueError``/``TypeError`` on a malformed turn."""
+        if self.role not in _VALID_ROLES:
+            raise ValueError(f"Unsupported message role: {self.role!r}")
+        # ``stream`` is typed Optional only so the dataclass can keep its
+        # field ordering, but recipes must always tag every turn with a
+        # stream — the renderer's ``_validate_rendered`` would reject
+        # ``None`` later on. Fail at construction so the bad recipe is
+        # caught at YAML load time rather than at the first sample.
+        if self.stream is None:
+            raise ValueError(
+                f"MessageTurn(role={self.role!r}) is missing a stream — "
+                f"every turn must declare one of {sorted(_VALID_STREAMS)}."
+            )
+        if self.stream not in _VALID_STREAMS:
+            raise ValueError(f"Unsupported message stream: {self.stream!r}")
+        if self.content is None and self.tool_calls_from is None:
+            raise ValueError("MessageTurn.content is required unless tool_calls_from is set.")
+        if self.content is not None and not isinstance(self.content, (str, list)):
+            raise TypeError("MessageTurn.content must be a string, a list of HF-style blocks, or None.")
+        if isinstance(self.content, list):
+            for block in self.content:
+                if not isinstance(block, dict) or "type" not in block:
+                    raise ValueError(
+                        "Multimodal content blocks must be HF-style dictionaries with a type key."
+                    )
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> MessageTurn:
+        """Construct a :class:`MessageTurn` from a dictionary whose keys are the dataclass field names."""
+        return cls(**data)
+
+
+@dataclass
+class TrainingRecipe:
+    """A recipe describing how to render training samples from language rows.
+
+    A recipe is either a *message recipe* (``messages`` plus optional
+    ``bindings``) or a *blend recipe* (``blend`` mapping names to weighted
+    sub-recipes). ``weight`` is only meaningful inside a blend.
+    """
+
+    messages: list[MessageTurn] | None = None
+    bindings: dict[str, str] | None = None
+    blend: dict[str, TrainingRecipe] | None = None
+    weight: float | None = None
+
+    def __post_init__(self) -> None:
+        """Validate that exactly one of ``messages`` or ``blend`` is set, then validate that variant."""
+        if self.messages is not None and self.blend is not None:
+            raise ValueError("TrainingRecipe must set only one of messages or blend.")
+        if self.messages is None and self.blend is None:
+            raise ValueError("TrainingRecipe must set one of messages or blend.")
+
+        if self.messages is not None:
+            self._validate_message_recipe()
+        if self.blend is not None:
+            self._validate_blend_recipe()
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> TrainingRecipe:
+        """Construct a :class:`TrainingRecipe` from a nested dictionary (the input is not mutated)."""
+        data = dict(data)
+        if data.get("messages") is not None:
+            data["messages"] = [
+                turn if isinstance(turn, MessageTurn) else MessageTurn.from_dict(turn)
+                for turn in data["messages"]
+            ]
+        if data.get("blend") is not None:
+            data["blend"] = {
+                name: recipe if isinstance(recipe, TrainingRecipe) else cls.from_dict(recipe)
+                for name, recipe in data["blend"].items()
+            }
+        return cls(**data)
+
+    @classmethod
+    def from_yaml(cls, path: str | Path) -> TrainingRecipe:
+        """Load a :class:`TrainingRecipe` from a UTF-8 YAML file at ``path``; raises ``ValueError`` if not a mapping."""
+        import yaml  # type: ignore[import-untyped]
+
+        with open(path, encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+        if not isinstance(data, dict):
+            raise ValueError(f"Recipe YAML must contain a mapping at the top level: {path}")
+        return cls.from_dict(data)
+
+    def _validate_message_recipe(self) -> None:
+        """Ensure every templated binding is known and at least one turn is a target."""
+        assert self.messages is not None
+        known_bindings = set(DEFAULT_BINDINGS) | set(self.bindings or {}) | {"task"}
+
+        for turn in self.messages:
+            missing = self._referenced_bindings(turn) - known_bindings
+            if missing:
+                raise ValueError(f"MessageTurn references unknown binding(s): {sorted(missing)}")
+
+        if not any(turn.target for turn in self.messages):
+            raise ValueError("Message recipes must contain at least one target turn.")
+
+    def _validate_blend_recipe(self) -> None:
+        """Ensure each blend component is a non-empty, weighted message recipe."""
+        assert self.blend is not None
+        if not self.blend:
+            raise ValueError("Blend recipes must contain at least one component.")
+
+        for name, recipe in self.blend.items():
+            if recipe.blend is not None:
+                raise ValueError(f"Blend component {name!r} cannot itself define a blend.")
+            if recipe.messages is None:
+                raise ValueError(f"Blend component {name!r} must define messages.")
+            if recipe.weight is None:
+                raise ValueError(f"Blend component {name!r} must define weight.")
+            if not recipe.weight > 0:  # negated ``>`` so that NaN (which passes ``<= 0``) is rejected too
+                raise ValueError(f"Blend component {name!r} must have a positive weight.")
+
+    def _referenced_bindings(self, turn: MessageTurn) -> set[str]:
+        """Return the binding names that ``turn`` references via placeholders or attributes."""
+        names: set[str] = set()
+        if turn.if_present is not None:
+            names.add(turn.if_present)
+        if turn.tool_calls_from is not None:
+            names.add(turn.tool_calls_from)
+        names.update(_placeholders_in_content(turn.content))
+        return names
+
+
+def _placeholders_in_content(content: str | list[dict[str, Any]] | None) -> set[str]:
+    """Collect the names of all ``${name}`` placeholders that appear in ``content``."""
+    if isinstance(content, str):
+        return set(PLACEHOLDER_RE.findall(content))
+    if content is None:
+        return set()
+
+    # Multimodal content: only the string-valued entries of each block are scanned.
+    found: set[str] = set()
+    for item in content:
+        for text in (entry for entry in item.values() if isinstance(entry, str)):
+            found.update(PLACEHOLDER_RE.findall(text))
+    return found
+
+
+def load_recipe(path: str | Path) -> TrainingRecipe:
+    """Load a :class:`TrainingRecipe` from the YAML file at ``path`` (delegates to ``TrainingRecipe.from_yaml``)."""
+    return TrainingRecipe.from_yaml(path)
diff --git a/src/lerobot/configs/rewards.py b/src/lerobot/configs/rewards.py
index d495160bf..7e99e7f71 100644
--- a/src/lerobot/configs/rewards.py
+++ b/src/lerobot/configs/rewards.py
@@ -27,12 +27,13 @@ from huggingface_hub import hf_hub_download
from huggingface_hub.constants import CONFIG_NAME
from huggingface_hub.errors import HfHubHTTPError
-from lerobot.configs.types import PolicyFeature
from lerobot.optim.optimizers import OptimizerConfig
from lerobot.optim.schedulers import LRSchedulerConfig
from lerobot.utils.device_utils import auto_select_torch_device, is_torch_device_available
from lerobot.utils.hub import HubMixin
+from .types import PolicyFeature
+
T = TypeVar("T", bound="RewardModelConfig")
logger = logging.getLogger(__name__)
@@ -89,9 +90,9 @@ class RewardModelConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
def reward_delta_indices(self) -> list | None: # type: ignore[type-arg]
return None
- @abc.abstractmethod
- def get_optimizer_preset(self) -> OptimizerConfig:
- raise NotImplementedError
+ def get_optimizer_preset(self) -> OptimizerConfig | None:
+ """Default optimizer for this reward model, or ``None`` for zero-shot models."""
+ return None
def get_scheduler_preset(self) -> LRSchedulerConfig | None:
return None
diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py
index c5b4ff5f5..55498d3ac 100644
--- a/src/lerobot/configs/train.py
+++ b/src/lerobot/configs/train.py
@@ -25,11 +25,11 @@ from huggingface_hub import hf_hub_download
from huggingface_hub.errors import HfHubHTTPError
from lerobot import envs
-from lerobot.configs import parser
from lerobot.optim import LRSchedulerConfig, OptimizerConfig
from lerobot.utils.hub import HubMixin
from lerobot.utils.sample_weighting import SampleWeightingConfig
+from . import parser
from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig
from .policies import PreTrainedConfig
from .rewards import RewardModelConfig
diff --git a/src/lerobot/configs/video.py b/src/lerobot/configs/video.py
new file mode 100644
index 000000000..bf2471453
--- /dev/null
+++ b/src/lerobot/configs/video.py
@@ -0,0 +1,235 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Note: We subclass str so that serialization is straightforward
+# https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json
+
+"""Video encoder configurations."""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+from lerobot.utils.import_utils import require_package
+
+logger = logging.getLogger(__name__)
+
+# List of hardware encoders to probe for auto-selection. Availability depends on the platform and the chosen video backend.
+# Determines the order of preference for auto-selection when vcodec="auto" is used.
+HW_VIDEO_CODECS = [
+ "h264_videotoolbox", # macOS
+ "hevc_videotoolbox", # macOS
+ "h264_nvenc", # NVIDIA GPU
+ "hevc_nvenc", # NVIDIA GPU
+ "h264_vaapi", # Linux Intel/AMD
+ "h264_qsv", # Intel Quick Sync
+]
+VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
+# Aliases for legacy video codec names.
+VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"}
+
+
+LIBSVTAV1_DEFAULT_PRESET: int = 12
+
+# Keys persisted under ``features[*]["info"]`` as ``video.<field>`` (from :class:`VideoEncoderConfig`).
+# ``vcodec`` and ``pix_fmt`` are derived from the video stream directly.
+VIDEO_ENCODER_INFO_FIELD_NAMES: frozenset[str] = frozenset(
+ {"g", "crf", "preset", "fast_decode", "extra_options", "video_backend"}
+)
+VIDEO_ENCODER_INFO_KEYS: frozenset[str] = frozenset(
+ f"video.{name}" for name in VIDEO_ENCODER_INFO_FIELD_NAMES
+)
+
+
+@dataclass
+class VideoEncoderConfig:
+    """Video encoder configuration.
+
+    Attributes:
+        vcodec: Video encoder name. ``"auto"`` is resolved during
+            construction (HW encoder if available, else ``libsvtav1``).
+        pix_fmt: Pixel format (e.g. ``"yuv420p"``).
+        g: GOP size (keyframe interval).
+        crf: Quality level — mapped to the native quality parameter of the
+            codec (``crf`` for software, ``qp`` for NVENC/VAAPI,
+            ``q:v`` for VideoToolbox, ``global_quality`` for QSV).
+        preset: Speed/quality preset. Accepted type is per-codec.
+        fast_decode: Fast-decode tuning. For ``libsvtav1`` this is a level (0-2)
+            embedded in ``svtav1-params``. For ``h264`` and ``hevc`` non-zero values
+            set ``tune=fastdecode``. Ignored for other codecs.
+        video_backend: Python library used for encoding. Only ``"pyav"``
+            is currently supported.
+        extra_options: Free-form dictionary of additional video encoder options
+            (e.g. ``{"tune": "film", "profile:v": "high", "bf": 2}``).
+    """
+
+    vcodec: str = "libsvtav1"  # TODO(CarolinePascal): rename to codec ?
+    pix_fmt: str = "yuv420p"
+    g: int | None = 2
+    crf: int | float | None = 30
+    preset: int | str | None = None
+    fast_decode: int = 0
+    # TODO(CarolinePascal): add torchcodec support + find a way to unify the
+    # two backends (encoding and decoding).
+    video_backend: str = "pyav"
+    extra_options: dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        self.resolve_vcodec()
+        # Empty-constructor ergonomics: ``VideoEncoderConfig()`` must "just work".
+        if self.preset is None and self.vcodec == "libsvtav1":
+            self.preset = LIBSVTAV1_DEFAULT_PRESET
+        self.validate()
+
+    @classmethod
+    def from_video_info(cls, video_info: dict | None) -> VideoEncoderConfig:
+        """Reconstruct a :class:`VideoEncoderConfig` from a video feature's ``info`` block.
+        Missing or ``None`` values fall back to the class defaults.
+        """
+        video_info = video_info or {}
+        kwargs: dict[str, Any] = {}
+
+        for src_key, dst_field in (("video.codec", "vcodec"), ("video.pix_fmt", "pix_fmt")):
+            value = video_info.get(src_key)
+            if value is not None:
+                kwargs[dst_field] = value
+
+        for field_name in VIDEO_ENCODER_INFO_FIELD_NAMES:
+            value = video_info.get(f"video.{field_name}")
+            if value is None:
+                continue
+            # Persisted as ``{}`` after merges with disagreeing sources — treat as default.
+            if field_name == "extra_options" and not value:
+                continue
+            kwargs[field_name] = value
+
+        return cls(**kwargs)
+
+    def detect_available_encoders(self, encoders: list[str] | str) -> list[str]:
+        """Return the subset of available encoders based on the specified video backend.
+
+        Args:
+            encoders: List of encoder names to detect. If a string, it is converted to a list.
+        Returns:
+            List of available encoder names. If the video backend is not "pyav", returns an empty list.
+        """
+        if self.video_backend == "pyav":
+            require_package("av", extra="dataset")
+            from lerobot.datasets import detect_available_encoders_pyav
+
+            return detect_available_encoders_pyav(encoders)
+        return []
+
+    def validate(self) -> None:
+        """Validate the video encoder configuration."""
+        if self.video_backend == "pyav":
+            require_package("av", extra="dataset")
+            from lerobot.datasets import check_video_encoder_parameters_pyav
+
+            check_video_encoder_parameters_pyav(self.vcodec, self.pix_fmt, self.get_codec_options())
+
+    def resolve_vcodec(self) -> None:
+        """Check ``vcodec`` and, when it is ``"auto"``, pick a concrete encoder.
+
+        For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the
+        resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``.
+
+        Stream-derived canonical codec names listed in :data:`VIDEO_CODECS_ALIASES` are
+        rewritten to their corresponding encoder name (e.g. ``"av1"`` → ``"libsvtav1"``).
+        """
+        self.vcodec = VIDEO_CODECS_ALIASES.get(self.vcodec, self.vcodec)
+        if self.vcodec not in VALID_VIDEO_CODECS:
+            raise ValueError(f"Invalid vcodec '{self.vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
+        if self.vcodec == "auto":
+            available = self.detect_available_encoders(HW_VIDEO_CODECS)
+            for encoder in HW_VIDEO_CODECS:
+                if encoder in available:
+                    logger.info(f"Auto-selected video codec: {encoder}")
+                    self.vcodec = encoder
+                    return
+            logger.warning("No hardware encoder available, falling back to software encoder 'libsvtav1'")
+            self.vcodec = "libsvtav1"
+
+        if self.detect_available_encoders(self.vcodec):
+            logger.info(f"Using video codec: {self.vcodec}")
+            return
+        raise ValueError(f"Unsupported video codec: {self.vcodec} with video backend {self.video_backend}")
+
+    def get_codec_options(
+        self, encoder_threads: int | None = None, as_strings: bool = False
+    ) -> dict[str, Any]:
+        """Translate the tuning fields to codec-specific options.
+
+        ``VideoEncoderConfig.extra_options`` are merged last but never override a structured field.
+
+        Args:
+            encoder_threads: Number of encoder threads set globally for all VideoEncoderConfigs.
+                For libsvtav1, this is mapped to ``lp`` via ``svtav1-params``.
+                For h264/hevc, this is mapped to ``threads``.
+                Hardware encoders ignore this parameter.
+            as_strings: If ``True``, casts every value (including ``rc`` and ``q:v``) to a string.
+        """
+        opts: dict[str, Any] = {}
+
+        def set_if(key: str, value: Any) -> None:
+            if value is not None:
+                opts[key] = value if not as_strings else str(value)
+
+        # GOP size is not a codec-specific option, so it is always set.
+        set_if("g", self.g)
+
+        if self.vcodec == "libsvtav1":
+            set_if("crf", self.crf)
+            set_if("preset", self.preset)
+            svtav1_parts: list[str] = []
+            if self.fast_decode is not None:
+                svtav1_parts.append(f"fast-decode={max(0, min(2, self.fast_decode))}")
+            if encoder_threads is not None:
+                svtav1_parts.append(f"lp={encoder_threads}")
+            if svtav1_parts:
+                opts["svtav1-params"] = ":".join(svtav1_parts)
+        elif self.vcodec in ("h264", "hevc"):
+            set_if("crf", self.crf)
+            set_if("preset", self.preset)
+            if self.fast_decode:
+                opts["tune"] = "fastdecode"
+            set_if("threads", encoder_threads)
+        elif self.vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
+            if self.crf is not None:
+                set_if("q:v", max(1, min(100, 100 - self.crf * 2)))
+        elif self.vcodec in ("h264_nvenc", "hevc_nvenc"):
+            set_if("rc", 0)
+            set_if("qp", self.crf)
+            set_if("preset", self.preset)
+        elif self.vcodec == "h264_vaapi":
+            set_if("qp", self.crf)
+        elif self.vcodec == "h264_qsv":
+            set_if("global_quality", self.crf)
+            set_if("preset", self.preset)
+        else:
+            set_if("crf", self.crf)
+            set_if("preset", self.preset)
+
+        # Extra options are merged last but never override structured fields (cast to str only if as_strings).
+        for k, v in self.extra_options.items():
+            if k not in opts:
+                set_if(k, v)
+
+        return opts
+
+
+def camera_encoder_defaults() -> VideoEncoderConfig:
+    """Return a :class:`VideoEncoderConfig` with RGB-camera defaults (currently the plain class defaults)."""
+    return VideoEncoderConfig()
diff --git a/src/lerobot/datasets/__init__.py b/src/lerobot/datasets/__init__.py
index 6c42959a5..2a67858d2 100644
--- a/src/lerobot/datasets/__init__.py
+++ b/src/lerobot/datasets/__init__.py
@@ -31,15 +31,25 @@ from .dataset_tools import (
modify_features,
modify_tasks,
recompute_stats,
+ reencode_dataset,
remove_feature,
split_dataset,
)
from .factory import make_dataset, resolve_delta_timestamps
from .image_writer import safe_stop_image_writer
from .io_utils import load_episodes, write_stats
+from .language import (
+ EVENT_ONLY_STYLES,
+ LANGUAGE_EVENTS,
+ LANGUAGE_PERSISTENT,
+ PERSISTENT_STYLES,
+ STYLE_REGISTRY,
+ column_for_style,
+)
from .lerobot_dataset import LeRobotDataset
from .multi_dataset import MultiLeRobotDataset
from .pipeline_features import aggregate_pipeline_dataset_features, create_initial_features
+from .pyav_utils import check_video_encoder_parameters_pyav, detect_available_encoders_pyav
from .sampler import EpisodeAwareSampler
from .streaming_dataset import StreamingLeRobotDataset
from .utils import DEFAULT_EPISODES_PATH, create_lerobot_dataset_card
@@ -53,12 +63,19 @@ __all__ = [
"CODEBASE_VERSION",
"DEFAULT_EPISODES_PATH",
"DEFAULT_QUANTILES",
+ "EVENT_ONLY_STYLES",
"EpisodeAwareSampler",
+ "LANGUAGE_EVENTS",
+ "LANGUAGE_PERSISTENT",
"LeRobotDataset",
"LeRobotDatasetMetadata",
"MultiLeRobotDataset",
+ "PERSISTENT_STYLES",
+ "STYLE_REGISTRY",
"StreamingLeRobotDataset",
"VideoEncodingManager",
+ "check_video_encoder_parameters_pyav",
+ "detect_available_encoders_pyav",
"add_features",
"aggregate_datasets",
"aggregate_pipeline_dataset_features",
@@ -66,6 +83,7 @@ __all__ = [
"convert_image_to_video_dataset",
"create_initial_features",
"create_lerobot_dataset_card",
+ "column_for_style",
"delete_episodes",
"get_feature_stats",
"load_episodes",
@@ -74,6 +92,7 @@ __all__ = [
"modify_features",
"modify_tasks",
"recompute_stats",
+ "reencode_dataset",
"remove_feature",
"resolve_delta_timestamps",
"safe_stop_image_writer",
diff --git a/src/lerobot/datasets/aggregate.py b/src/lerobot/datasets/aggregate.py
index 90fc8f583..5db3f934d 100644
--- a/src/lerobot/datasets/aggregate.py
+++ b/src/lerobot/datasets/aggregate.py
@@ -15,6 +15,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import copy
import logging
import shutil
from pathlib import Path
@@ -23,9 +24,11 @@ import datasets
import pandas as pd
import tqdm
+from lerobot.configs import VIDEO_ENCODER_INFO_KEYS
+
from .compute_stats import aggregate_stats
from .dataset_metadata import LeRobotDatasetMetadata
-from .feature_utils import get_hf_features_from_features
+from .feature_utils import features_equal_for_merge, get_hf_features_from_features
from .io_utils import (
get_file_size_in_mb,
get_parquet_file_size_in_mb,
@@ -46,11 +49,54 @@ from .utils import (
from .video_utils import concatenate_video_files, get_video_duration_in_s
+def merge_video_feature_info_for_aggregate(all_metadata: list[LeRobotDatasetMetadata]) -> dict[str, dict]:
+    """Create a merged video feature info dictionary for aggregation. The video encoder info is merged field-by-field: each key is kept only when every source agrees; otherwise that key is set to ``null`` (or ``{}`` for ``video.extra_options``) and a warning is logged.
+
+    Args:
+        all_metadata: List of LeRobotDatasetMetadata objects to merge (the first one provides the base features).
+
+    Returns:
+        dict: A deep copy of the first dataset's features whose video ``info`` blocks carry the merged encoder keys.
+    """
+    merged_info = copy.deepcopy(all_metadata[0].features)
+    video_keys = [k for k in merged_info if merged_info[k].get("dtype") == "video"]
+
+    for vk in video_keys:
+        video_infos = [m.features.get(vk, {}).get("info") or {} for m in all_metadata]
+        base_video_info = video_infos[0]
+
+        merged_encoder_info: dict = {}
+        fallback_keys: list[str] = []
+        for info_key in sorted(VIDEO_ENCODER_INFO_KEYS):  # sorted: frozenset order is not stable across runs
+            values = [info.get(info_key, None) for info in video_infos]
+            first_value = values[0]
+            all_match = all(v == first_value for v in values[1:])
+
+            if all_match:
+                merged_encoder_info[info_key] = first_value
+            else:
+                fallback_keys.append(info_key)
+                merged_encoder_info[info_key] = {} if info_key == "video.extra_options" else None
+
+        if fallback_keys:
+            logging.warning(
+                f"Merging heterogeneous or incomplete video encoder metadata for feature {vk}. "
+                f"Setting these keys to null: {fallback_keys}.",
+            )
+
+        merged_info[vk]["info"] = {**base_video_info, **merged_encoder_info}
+        # TODO(CarolinePascal): make this variable once we have support for other video backends.
+        merged_info[vk]["info"]["video.video_backend"] = "pyav"
+
+    return merged_info
+
+
def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]):
"""Validates that all dataset metadata have consistent properties.
Ensures all datasets have the same fps, robot_type, and features to guarantee
compatibility when aggregating them into a single dataset.
+ Video encoder info is not considered for validation but is merged during aggregation in ``merge_video_feature_info_for_aggregate``.
Args:
all_metadata: List of LeRobotDatasetMetadata objects to validate.
@@ -74,7 +120,7 @@ def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]):
raise ValueError(
f"Same robot_type is expected, but got robot_type={meta.robot_type} instead of {robot_type}."
)
- if features != meta.features:
+ if not features_equal_for_merge(features, meta.features):
raise ValueError(
f"Same features is expected, but got features={meta.features} instead of {features}."
)
@@ -274,7 +320,8 @@ def aggregate_datasets(
LeRobotDatasetMetadata(repo_id, root=root) for repo_id, root in zip(repo_ids, roots, strict=False)
]
)
- fps, robot_type, features = validate_all_metadata(all_metadata)
+ fps, robot_type, _ = validate_all_metadata(all_metadata)
+ features = merge_video_feature_info_for_aggregate(all_metadata)
video_keys = [key for key in features if features[key]["dtype"] == "video"]
dst_meta = LeRobotDatasetMetadata.create(
@@ -332,7 +379,6 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
videos_idx: Dictionary tracking video chunk and file indices.
video_files_size_in_mb: Maximum size for video files in MB (defaults to DEFAULT_VIDEO_FILE_SIZE_IN_MB)
chunk_size: Maximum number of files per chunk (defaults to DEFAULT_CHUNK_SIZE)
-
Returns:
dict: Updated videos_idx with current chunk and file indices.
"""
@@ -414,9 +460,11 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
current_dst_duration = dst_file_durations.get(dst_key, 0)
videos_idx[key]["src_to_offset"][(src_chunk_idx, src_file_idx)] = current_dst_duration
videos_idx[key]["src_to_dst"][(src_chunk_idx, src_file_idx)] = dst_key
+ # TODO(CarolinePascal): Move the check before the loop to avoid failing in the middle + add possibility to re-encode the video if the check fails
concatenate_video_files(
[dst_path, src_path],
dst_path,
+ compatibility_check=True,
)
# Update duration of this destination file
dst_file_durations[dst_key] = current_dst_duration + src_duration
diff --git a/src/lerobot/datasets/compute_stats.py b/src/lerobot/datasets/compute_stats.py
index f489c84a7..438ac7fba 100644
--- a/src/lerobot/datasets/compute_stats.py
+++ b/src/lerobot/datasets/compute_stats.py
@@ -512,7 +512,7 @@ def compute_episode_stats(
ep_stats = {}
for key, data in episode_data.items():
- if features[key]["dtype"] == "string":
+ if features[key]["dtype"] in {"string", "language"}:
continue
if features[key]["dtype"] in ["image", "video"]:
diff --git a/src/lerobot/datasets/dataset_metadata.py b/src/lerobot/datasets/dataset_metadata.py
index b404ddb18..39a1b6d2b 100644
--- a/src/lerobot/datasets/dataset_metadata.py
+++ b/src/lerobot/datasets/dataset_metadata.py
@@ -24,6 +24,7 @@ import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub import snapshot_download
+from lerobot.configs import VideoEncoderConfig
from lerobot.utils.constants import DEFAULT_FEATURES, HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE
from lerobot.utils.feature_utils import _validate_feature_names
from lerobot.utils.utils import flatten_dict
@@ -35,12 +36,12 @@ from .io_utils import (
load_episodes,
load_info,
load_stats,
- load_subtasks,
load_tasks,
write_info,
write_stats,
write_tasks,
)
+from .language import DEFAULT_TOOLS, LANGUAGE_COLUMNS
from .utils import (
DEFAULT_EPISODES_PATH,
check_version_compatibility,
@@ -176,7 +177,6 @@ class LeRobotDatasetMetadata:
self.info = load_info(self.root)
check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
self.tasks = load_tasks(self.root)
- self.subtasks = load_subtasks(self.root)
self.episodes = load_episodes(self.root)
self.stats = load_stats(self.root)
@@ -342,6 +342,49 @@ class LeRobotDatasetMetadata:
"""Keys to access visual modalities (regardless of their storage method)."""
return [key for key, ft in self.features.items() if ft["dtype"] in ["video", "image"]]
+    @property
+    def has_language_columns(self) -> bool:
+        """Whether the dataset declares at least one language column.
+
+        Gates the language-aware code paths (collate, render step) so that
+        unannotated datasets keep PyTorch's default collate behavior.
+        """
+        return bool({*LANGUAGE_COLUMNS} & self.features.keys())
+
+    @property
+    def tools(self) -> list[dict]:
+        """OpenAI-style tool schemas declared by this dataset.
+
+        Read from ``meta/info.json["tools"]``. Falls back to
+        :data:`lerobot.datasets.language.DEFAULT_TOOLS` (the canonical
+        ``say`` schema) when the dataset doesn't declare any — that way
+        unannotated datasets and chat-template consumers
+        (``apply_chat_template(messages, tools=meta.tools)``) keep
+        working out of the box. Returns a deep copy: tool schemas are
+        nested dicts, so a shallow ``dict(t)`` would still alias the
+        inner mappings of ``info.tools`` / ``DEFAULT_TOOLS``.
+
+        Implementations live under :mod:`lerobot.tools` (one file per
+        tool); see ``docs/source/tools.mdx`` for the authoring guide.
+        """
+        import copy  # function-scope import keeps this change self-contained
+
+        return [copy.deepcopy(dict(t)) for t in (self.info.tools or DEFAULT_TOOLS)]
+
+    @tools.setter
+    def tools(self, value: list[dict] | None) -> None:
+        """Store a tool catalog in ``meta/info.json`` and refresh ``self.info``.
+
+        A non-empty ``value`` is copied entry by entry into ``info.tools``;
+        ``None`` or an empty list resets ``info.tools`` to ``None``. The info
+        is then written to disk and read back, so the in-memory metadata
+        mirrors the file without re-instantiating the metadata object.
+        """
+        catalog = None if not value else [dict(entry) for entry in value]
+        self.info.tools = catalog
+        write_info(self.info, self.root)
+        self.info = load_info(self.root)
+
@property
def names(self) -> dict[str, list | dict]:
"""Names of the various dimensions of vector modalities."""
@@ -534,10 +577,23 @@ class LeRobotDatasetMetadata:
self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats
write_stats(self.stats, self.root)
- def update_video_info(self, video_key: str | None = None) -> None:
- """
+ def update_video_info(
+ self,
+ video_key: str | None = None,
+ camera_encoder: VideoEncoderConfig | None = None,
+ ) -> None:
+ """Populate per-feature video info in ``info.json``.
+
Warning: this function writes info from first episode videos, implicitly assuming that all videos have
been encoded the same way. Also, this means it assumes the first episode exists.
+
+ Args:
+ video_key: If provided, only update this video key. Otherwise update
+ all video keys in the dataset.
+ camera_encoder: Encoder configuration used to produce the
+ videos. When provided, its fields are recorded as
+                ``video.<field>`` entries alongside the stream-derived
+ ``video.*`` entries (see :func:`get_video_info`).
"""
if video_key is not None and video_key not in self.video_keys:
raise ValueError(f"Video key {video_key} not found in dataset")
@@ -546,7 +602,7 @@ class LeRobotDatasetMetadata:
for key in video_keys:
if not self.features[key].get("info", None):
video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0)
- self.info.features[key]["info"] = get_video_info(video_path)
+ self.info.features[key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder)
def update_chunk_settings(
self,
@@ -657,7 +713,6 @@ class LeRobotDatasetMetadata:
_validate_feature_names(features)
obj.tasks = None
- obj.subtasks = None
obj.episodes = None
obj.stats = None
obj.info = create_empty_dataset_info(
diff --git a/src/lerobot/datasets/dataset_reader.py b/src/lerobot/datasets/dataset_reader.py
index bd1298590..59aaa40e5 100644
--- a/src/lerobot/datasets/dataset_reader.py
+++ b/src/lerobot/datasets/dataset_reader.py
@@ -295,9 +295,4 @@ class DatasetReader:
task_idx = item["task_index"].item()
item["task"] = self._meta.tasks.iloc[task_idx].name
- # add subtask information if available
- if "subtask_index" in self._meta.features and self._meta.subtasks is not None:
- subtask_idx = item["subtask_index"].item()
- item["subtask"] = self._meta.subtasks.iloc[subtask_idx].name
-
return item
diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py
index 46dd9bff2..adbb841c4 100644
--- a/src/lerobot/datasets/dataset_tools.py
+++ b/src/lerobot/datasets/dataset_tools.py
@@ -26,7 +26,7 @@ This module provides utilities for:
import logging
import shutil
from collections.abc import Callable
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path
import datasets
@@ -36,6 +36,7 @@ import pyarrow.parquet as pq
import torch
from tqdm import tqdm
+from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults
from lerobot.utils.constants import ACTION, HF_LEROBOT_HOME, OBS_IMAGE, OBS_STATE
from lerobot.utils.utils import flatten_dict
@@ -60,9 +61,14 @@ from .utils import (
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_DATA_PATH,
DEFAULT_EPISODES_PATH,
+ VIDEO_DIR,
update_chunk_file_indices,
)
-from .video_utils import encode_video_frames, get_video_info
+from .video_utils import (
+ encode_video_frames,
+ get_video_info,
+ reencode_video,
+)
def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict:
@@ -95,6 +101,11 @@ def delete_episodes(
) -> LeRobotDataset:
"""Delete episodes from a LeRobotDataset and create a new dataset.
+ Video segments that need re-encoding (because the source file mixes kept and
+ deleted episodes) are re-encoded with the source dataset's existing encoder
+ settings — read back from ``meta/info.json`` — so the output dataset stays
+ consistent with its own metadata.
+
Args:
dataset: The source LeRobotDataset.
episode_indices: List of episode indices to delete.
@@ -157,6 +168,11 @@ def split_dataset(
) -> dict[str, LeRobotDataset]:
"""Split a LeRobotDataset into multiple smaller datasets.
+ Video segments that need re-encoding (because the source file mixes episodes
+ that fall into different splits) are re-encoded with the source dataset's
+ existing encoder settings — read back from ``meta/info.json`` — so each
+ output split stays consistent with its own metadata.
+
Args:
dataset: The source LeRobotDataset to split.
splits: Either a dict mapping split names to episode indices, or a dict mapping
@@ -578,8 +594,7 @@ def _keep_episodes_from_video_with_av(
output_path: Path,
episodes_to_keep: list[tuple[int, int]],
fps: float,
- vcodec: str = "libsvtav1",
- pix_fmt: str = "yuv420p",
+ camera_encoder: VideoEncoderConfig,
) -> None:
"""Keep only specified episodes from a video file using PyAV.
@@ -593,8 +608,7 @@ def _keep_episodes_from_video_with_av(
Ranges are half-open intervals: [start_frame, end_frame), where start_frame
is inclusive and end_frame is exclusive.
fps: Frame rate of the video.
- vcodec: Video codec to use for encoding.
- pix_fmt: Pixel format for output video.
+ camera_encoder: Video encoder settings used to re-encode the kept frames.
"""
from fractions import Fraction
@@ -619,12 +633,13 @@ def _keep_episodes_from_video_with_av(
# Convert fps to Fraction for PyAV compatibility.
fps_fraction = Fraction(fps).limit_denominator(1000)
- v_out = out.add_stream(vcodec, rate=fps_fraction)
+ codec_options = camera_encoder.get_codec_options(as_strings=True)
+ v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options)
# PyAV type stubs don't distinguish video streams from audio/subtitle streams.
v_out.width = v_in.codec_context.width
v_out.height = v_in.codec_context.height
- v_out.pix_fmt = pix_fmt
+ v_out.pix_fmt = camera_encoder.pix_fmt
# Set time_base to match the frame rate for proper timestamp handling.
v_out.time_base = Fraction(1, int(fps))
@@ -687,14 +702,14 @@ def _copy_and_reindex_videos(
src_dataset: LeRobotDataset,
dst_meta: LeRobotDatasetMetadata,
episode_mapping: dict[int, int],
- vcodec: str = "libsvtav1",
- pix_fmt: str = "yuv420p",
) -> dict[int, dict]:
"""Copy and filter video files, only re-encoding files with deleted episodes.
For video files that only contain kept episodes, we copy them directly.
For files with mixed kept/deleted episodes, we use PyAV filters to efficiently
- re-encode only the desired segments.
+ re-encode only the desired segments. The encoder used for re-encoding is
+ derived per video key from the source dataset's ``meta/info.json`` so the
+ destination metadata keeps describing the videos accurately.
Args:
src_dataset: Source dataset to copy from
@@ -711,6 +726,9 @@ def _copy_and_reindex_videos(
for video_key in src_dataset.meta.video_keys:
logging.info(f"Processing videos for {video_key}")
+ camera_encoder = VideoEncoderConfig.from_video_info(
+ src_dataset.meta.info.features.get(video_key, {}).get("info")
+ )
if dst_meta.video_path is None:
raise ValueError("Destination metadata has no video_path defined")
@@ -792,8 +810,7 @@ def _copy_and_reindex_videos(
dst_video_path,
episodes_to_keep_ranges,
src_dataset.meta.fps,
- vcodec,
- pix_fmt,
+ camera_encoder,
)
cumulative_ts = 0.0
@@ -1264,11 +1281,7 @@ def _estimate_frame_size_via_calibration(
episode_indices: list[int],
temp_dir: Path,
fps: int,
- vcodec: str,
- pix_fmt: str,
- g: int,
- crf: int,
- fast_decode: int,
+ camera_encoder: VideoEncoderConfig,
num_calibration_frames: int = 30,
) -> float:
"""Estimate MB per frame by encoding a small calibration sample.
@@ -1282,11 +1295,7 @@ def _estimate_frame_size_via_calibration(
episode_indices: List of episode indices being processed.
temp_dir: Temporary directory for calibration files.
fps: Frames per second for video encoding.
- vcodec: Video codec (libsvtav1, h264, hevc).
- pix_fmt: Pixel format (yuv420p, etc.).
- g: GOP size (group of pictures).
- crf: Constant Rate Factor (quality).
- fast_decode: Fast decode tuning parameter.
+ camera_encoder: Video encoder settings used for calibration encoding.
num_calibration_frames: Number of frames to use for calibration (default: 30).
Returns:
@@ -1322,11 +1331,7 @@ def _estimate_frame_size_via_calibration(
imgs_dir=calibration_dir,
video_path=calibration_video_path,
fps=fps,
- vcodec=vcodec,
- pix_fmt=pix_fmt,
- g=g,
- crf=crf,
- fast_decode=fast_decode,
+ camera_encoder=camera_encoder,
overwrite=True,
)
@@ -1644,11 +1649,7 @@ def convert_image_to_video_dataset(
dataset: LeRobotDataset,
output_dir: Path | None = None,
repo_id: str | None = None,
- vcodec: str = "libsvtav1",
- pix_fmt: str = "yuv420p",
- g: int = 2,
- crf: int = 30,
- fast_decode: int = 0,
+ camera_encoder: VideoEncoderConfig | None = None,
episode_indices: list[int] | None = None,
num_workers: int = 4,
max_episodes_per_batch: int | None = None,
@@ -1663,11 +1664,8 @@ def convert_image_to_video_dataset(
dataset: The source LeRobot dataset with images
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
- vcodec: Video codec (default: libsvtav1)
- pix_fmt: Pixel format (default: yuv420p)
- g: Group of pictures size (default: 2)
- crf: Constant rate factor (default: 30)
- fast_decode: Fast decode tuning (default: 0)
+ camera_encoder: Video encoder settings
+ (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
episode_indices: List of episode indices to convert (None = all episodes)
num_workers: Number of threads for parallel processing (default: 4)
max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
@@ -1676,6 +1674,9 @@ def convert_image_to_video_dataset(
Returns:
New LeRobotDataset with images encoded as videos
"""
+ if camera_encoder is None:
+ camera_encoder = camera_encoder_defaults()
+
# Check that it's an image dataset
if len(dataset.meta.video_keys) > 0:
raise ValueError(
@@ -1699,7 +1700,10 @@ def convert_image_to_video_dataset(
logging.info(
f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}"
)
- logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}")
+ logging.info(
+ f"Video codec: {camera_encoder.vcodec}, pixel format: {camera_encoder.pix_fmt}, "
+ f"GOP: {camera_encoder.g}, CRF: {camera_encoder.crf}"
+ )
# Create new features dict, converting image features to video features
new_features = {}
@@ -1769,11 +1773,7 @@ def convert_image_to_video_dataset(
episode_indices=episode_indices,
temp_dir=temp_dir,
fps=fps,
- vcodec=vcodec,
- pix_fmt=pix_fmt,
- g=g,
- crf=crf,
- fast_decode=fast_decode,
+ camera_encoder=camera_encoder,
)
logging.info(f"Processing camera: {img_key}")
@@ -1815,11 +1815,7 @@ def convert_image_to_video_dataset(
imgs_dir=imgs_dir,
video_path=video_path,
fps=fps,
- vcodec=vcodec,
- pix_fmt=pix_fmt,
- g=g,
- crf=crf,
- fast_decode=fast_decode,
+ camera_encoder=camera_encoder,
overwrite=True,
)
@@ -1865,7 +1861,9 @@ def convert_image_to_video_dataset(
video_path = new_meta.root / new_meta.video_path.format(
video_key=img_key, chunk_index=0, file_index=0
)
- new_meta.info.features[img_key]["info"] = get_video_info(video_path)
+ new_meta.info.features[img_key]["info"] = get_video_info(
+ video_path, camera_encoder=camera_encoder
+ )
write_info(new_meta.info, new_meta.root)
@@ -1888,3 +1886,83 @@ def convert_image_to_video_dataset(
# Return new dataset
return LeRobotDataset(repo_id=repo_id, root=output_dir)
+
+
+def _reencode_video_worker(args: tuple) -> Path:
+    """Process-pool entry point for :func:`reencode_dataset`: re-encode one file over itself."""
+    target_path, encoder_cfg, thread_count = args
+    reencode_video(
+        input_video_path=target_path,
+        output_video_path=target_path,
+        camera_encoder=encoder_cfg,
+        encoder_threads=thread_count,
+        overwrite=True,
+    )
+    return target_path
+
+
+def reencode_dataset(
+    dataset: LeRobotDataset,
+    camera_encoder: VideoEncoderConfig,
+    encoder_threads: int | None = None,
+    num_workers: int | None = None,
+) -> LeRobotDataset:
+    """Re-encode every video in a dataset with a new set of encoding parameters.
+
+    Videos are re-encoded in-place and the video information in ``info.json`` is refreshed.
+
+    Args:
+        dataset: An existing :class:`LeRobotDataset` whose videos will be
+            re-encoded.
+        camera_encoder: Target encoder configuration applied to every video
+            file.
+        encoder_threads: Per-encoder thread count forwarded to
+            :func:`reencode_video`. ``None`` lets the codec decide.
+        num_workers: Number of parallel processes. ``None``, ``0`` or ``1``
+            runs sequentially in the current process; ``2+`` spawns a
+            :class:`~concurrent.futures.ProcessPoolExecutor`.
+
+    Returns:
+        The same :class:`LeRobotDataset` instance with its metadata updated
+        on disk.
+    """
+    meta = dataset.meta
+    video_paths_list = []
+
+    # Only re-encode if the videos are not already encoded with the given video encoding parameters
+    for video_key in meta.video_keys:
+        current_info = meta.info.features[video_key].get("info", {})
+        current_encoder = VideoEncoderConfig.from_video_info(current_info)
+        if current_encoder != camera_encoder:
+            video_paths_list.extend((meta.root / VIDEO_DIR / video_key).rglob("*.mp4"))
+        else:
+            logging.info(f"{video_key} videos are already encoded with {camera_encoder}. Nothing to do.")
+
+    if len(video_paths_list) == 0:
+        logging.warning("Dataset has no videos to re-encode.")
+        return dataset
+    logging.info(f"Re-encoding {len(video_paths_list)} video file(s) with {camera_encoder}")
+
+    worker_args = [(vp, camera_encoder, encoder_threads) for vp in video_paths_list]
+    if num_workers and num_workers > 1:
+        with ProcessPoolExecutor(max_workers=num_workers) as pool:
+            futures = [pool.submit(_reencode_video_worker, args) for args in worker_args]
+            for future in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc="Re-encoding videos",
+            ):
+                future.result()  # re-raises any exception raised in the worker
+    else:
+        for args in tqdm(worker_args, desc="Re-encoding videos"):
+            _reencode_video_worker(args)
+
+    # Refresh video info in metadata for every video key.
+    for vid_key in meta.video_keys:
+        video_path = meta.root / meta.get_video_file_path(0, vid_key)
+        meta.info.features[vid_key]["info"] = get_video_info(video_path, camera_encoder=camera_encoder)
+
+    write_info(meta.info, meta.root)
+    logging.info("Dataset metadata updated.")
+
+    return dataset
diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py
index cf306a86a..633c00c1a 100644
--- a/src/lerobot/datasets/dataset_writer.py
+++ b/src/lerobot/datasets/dataset_writer.py
@@ -31,6 +31,8 @@ import PIL.Image
import pyarrow.parquet as pq
import torch
+from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults
+
from .compute_stats import compute_episode_stats
from .dataset_metadata import LeRobotDatasetMetadata
from .feature_utils import (
@@ -65,14 +67,19 @@ def _encode_video_worker(
episode_index: int,
root: Path,
fps: int,
- vcodec: str = "libsvtav1",
+ camera_encoder: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
img_dir = (root / fpath).parent
encode_video_frames(
- img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads
+ img_dir,
+ temp_path,
+ fps,
+ camera_encoder=camera_encoder,
+ encoder_threads=encoder_threads,
+ overwrite=True,
)
shutil.rmtree(img_dir)
return temp_path
@@ -89,20 +96,22 @@ class DatasetWriter:
self,
meta: LeRobotDatasetMetadata,
root: Path,
- vcodec: str,
+ camera_encoder: VideoEncoderConfig | None,
encoder_threads: int | None,
batch_encoding_size: int,
streaming_encoder: StreamingVideoEncoder | None = None,
initial_frames: int = 0,
):
- """Initialize the writer with metadata, codec, and encoding config.
+ """Initialize the writer with metadata, codec, and encoder config.
Args:
meta: Dataset metadata instance (used for feature schema, chunk
settings, and episode persistence).
root: Local dataset root directory.
- vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``).
- encoder_threads: Threads per encoder instance. ``None`` for auto.
+ camera_encoder: Video encoder settings applied to all cameras.
+ ``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`.
+ encoder_threads: Number of encoder threads (global). ``None``
+ lets the codec decide.
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos.
streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder`
@@ -111,7 +120,7 @@ class DatasetWriter:
"""
self._meta = meta
self._root = root
- self._vcodec = vcodec
+ self._camera_encoder = camera_encoder or camera_encoder_defaults()
self._encoder_threads = encoder_threads
self._batch_encoding_size = batch_encoding_size
self._streaming_encoder = streaming_encoder
@@ -241,7 +250,14 @@ class DatasetWriter:
for key, ft in self._meta.features.items():
if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]:
continue
- episode_buffer[key] = np.stack(episode_buffer[key])
+ stacked_values = np.stack(episode_buffer[key])
+
+ # `shape=(1,)` numeric features are serialized as `datasets.Value`, which expects scalars.
+ # Normalizing to `(N,)` keeps save semantics stable across dependency versions.
+ if tuple(ft["shape"]) == (1,) and ft["dtype"] != "string":
+ stacked_values = stacked_values.reshape(episode_length)
+
+ episode_buffer[key] = stacked_values
# Wait for image writer to end, so that episode stats over images can be computed
self._wait_image_writer()
@@ -284,7 +300,7 @@ class DatasetWriter:
episode_index,
self._root,
self._meta.fps,
- self._vcodec,
+ self._camera_encoder,
self._encoder_threads,
): video_key
for video_key in self._meta.video_keys
@@ -495,7 +511,7 @@ class DatasetWriter:
# Update video info (only needed when first episode is encoded)
if episode_index == 0:
- self._meta.update_video_info(video_key)
+ self._meta.update_video_info(video_key, camera_encoder=self._camera_encoder)
write_info(self._meta.info, self._meta.root)
metadata = {
@@ -564,7 +580,12 @@ class DatasetWriter:
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
"""Use ffmpeg to convert frames stored as png into mp4 videos."""
return _encode_video_worker(
- video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads
+ video_key,
+ episode_index,
+ self._root,
+ self._meta.fps,
+ self._camera_encoder,
+ self._encoder_threads,
)
def close_writer(self) -> None:
diff --git a/src/lerobot/datasets/feature_utils.py b/src/lerobot/datasets/feature_utils.py
index 2ab4b0ea6..56264408f 100644
--- a/src/lerobot/datasets/feature_utils.py
+++ b/src/lerobot/datasets/feature_utils.py
@@ -13,15 +13,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import logging
from pprint import pformat
import datasets
import numpy as np
from PIL import Image as PILImage
+from lerobot.configs import VIDEO_ENCODER_INFO_KEYS
from lerobot.utils.constants import DEFAULT_FEATURES
from lerobot.utils.utils import is_valid_numpy_dtype_string
+from .language import (
+ LANGUAGE_PERSISTENT,
+ is_language_column,
+ language_events_column_feature,
+ language_persistent_column_feature,
+)
from .utils import (
DEFAULT_CHUNK_SIZE,
DEFAULT_DATA_FILE_SIZE_IN_MB,
@@ -46,7 +54,13 @@ def get_hf_features_from_features(features: dict) -> datasets.Features:
"""
hf_features = {}
for key, ft in features.items():
- if ft["dtype"] == "video":
+ if is_language_column(key):
+ hf_features[key] = (
+ language_persistent_column_feature()
+ if key == LANGUAGE_PERSISTENT
+ else language_events_column_feature()
+ )
+ elif ft["dtype"] == "video":
continue
elif ft["dtype"] == "image":
hf_features[key] = datasets.Image()
@@ -108,6 +122,41 @@ def create_empty_dataset_info(
)
+def features_equal_for_merge(features_a: dict[str, dict], features_b: dict[str, dict]) -> bool:
+    """Tell whether two ``features`` dicts describe datasets that can be aggregated.
+
+    Both dicts must have the same keys and, per key, the same ``dtype``.
+    Non-video features must be fully equal. Video features are compared after
+    dropping the ``info`` entries named in ``VIDEO_ENCODER_INFO_KEYS``, so a
+    difference in those entries alone does not prevent aggregation.
+
+    Returns:
+        ``True`` if every feature passes the comparison above, else ``False``.
+    """
+
+    def _strip_encoder_info(spec: dict) -> dict:
+        # Shallow copy; ``info`` is only rebuilt when it is actually a dict.
+        info = spec.get("info")
+        if not isinstance(info, dict):
+            return dict(spec)
+        kept = {k: v for k, v in info.items() if k not in VIDEO_ENCODER_INFO_KEYS}
+        return {**spec, "info": kept}
+
+    def _compatible(spec_a: dict, spec_b: dict) -> bool:
+        dtype_a = spec_a.get("dtype")
+        if dtype_a != spec_b.get("dtype"):
+            return False
+        if dtype_a != "video":
+            return spec_a == spec_b
+        return _strip_encoder_info(spec_a) == _strip_encoder_info(spec_b)
+
+    # Different feature names can never be merged.
+    if set(features_a) != set(features_b):
+        return False
+    # Same names: every pair of specs has to be compatible.
+    return all(_compatible(features_a[name], features_b[name]) for name in features_a)
+
+
def check_delta_timestamps(
delta_timestamps: dict[str, list[float]], fps: int, tolerance_s: float, raise_value_error: bool = True
) -> bool:
@@ -242,6 +291,8 @@ def validate_feature_dtype_and_shape(
return validate_feature_image_or_video(name, expected_shape, value)
elif expected_dtype == "string":
return validate_feature_string(name, value)
+ elif expected_dtype == "language":
+ return validate_feature_language(name, value)
else:
raise NotImplementedError(f"The feature dtype '{expected_dtype}' is not implemented yet.")
@@ -321,6 +372,30 @@ def validate_feature_string(name: str, value: str) -> str:
return ""
+def validate_feature_language(name: str, value) -> str:
+    """Check a value supplied for a language-annotation feature.
+
+    Language columns (``language_persistent`` / ``language_events``) are
+    filled in by the annotation pipeline after recording, not at record
+    time. Any value other than ``None`` therefore only triggers a warning
+    that it will be dropped; it never produces a validation error.
+
+    Args:
+        name (str): The name of the feature.
+        value: The value to validate; ``None`` is the only silent input.
+
+    Returns:
+        str: Always an empty string.
+    """
+    message = (
+        f"The feature '{name}' is a 'language' column populated by the annotation pipeline, "
+        f"not at record time. The provided value will be dropped."
+    )
+    if value is not None:
+        logging.warning(message)
+    return ""
+
+
def validate_episode_buffer(episode_buffer: dict, total_episodes: int, features: dict) -> None:
"""Validate the episode buffer before it's written to disk.
diff --git a/src/lerobot/datasets/io_utils.py b/src/lerobot/datasets/io_utils.py
index f5681c7c0..a41f34704 100644
--- a/src/lerobot/datasets/io_utils.py
+++ b/src/lerobot/datasets/io_utils.py
@@ -31,10 +31,10 @@ from torchvision import transforms
from lerobot.utils.io_utils import load_json, write_json
from lerobot.utils.utils import SuppressProgressBars, flatten_dict, unflatten_dict
+from .language import LANGUAGE_COLUMNS
from .utils import (
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_EPISODES_PATH,
- DEFAULT_SUBTASKS_PATH,
DEFAULT_TASKS_PATH,
EPISODES_DIR,
INFO_PATH,
@@ -186,14 +186,6 @@ def load_tasks(local_dir: Path) -> pandas.DataFrame:
return tasks
-def load_subtasks(local_dir: Path) -> pandas.DataFrame | None:
- """Load subtasks from subtasks.parquet if it exists."""
- subtasks_path = local_dir / DEFAULT_SUBTASKS_PATH
- if subtasks_path.exists():
- return pd.read_parquet(subtasks_path)
- return None
-
-
def write_episodes(episodes: Dataset, local_dir: Path) -> None:
"""Write episode metadata to a parquet file in the LeRobot v3.0 format.
This function writes episode-level metadata to a single parquet file.
@@ -265,11 +257,13 @@ def hf_transform_to_torch(items_dict: dict[str, list[Any]]) -> dict[str, list[to
dict: The batch with items converted to torch tensors.
"""
for key in items_dict:
+ if key in LANGUAGE_COLUMNS:
+ continue
first_item = items_dict[key][0]
if isinstance(first_item, PILImage.Image):
to_tensor = transforms.ToTensor()
items_dict[key] = [to_tensor(img) for img in items_dict[key]]
- elif first_item is None:
+ elif first_item is None or isinstance(first_item, dict):
pass
else:
items_dict[key] = [x if isinstance(x, str) else torch.tensor(x) for x in items_dict[key]]
@@ -304,8 +298,9 @@ def item_to_torch(item: dict) -> dict:
Returns:
dict: Dictionary with all tensor-like items converted to torch.Tensor.
"""
+ skip_keys = {"task", *LANGUAGE_COLUMNS}
for key, val in item.items():
- if isinstance(val, (np.ndarray | list)) and key not in ["task"]:
+ if isinstance(val, (np.ndarray | list)) and key not in skip_keys:
# Convert numpy arrays and lists to torch tensors
item[key] = torch.tensor(val)
return item
diff --git a/src/lerobot/datasets/language.py b/src/lerobot/datasets/language.py
new file mode 100644
index 000000000..124c25221
--- /dev/null
+++ b/src/lerobot/datasets/language.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Literal
+
+import datasets
+import pyarrow as pa
+
+LANGUAGE_PERSISTENT = "language_persistent"
+LANGUAGE_EVENTS = "language_events"
+LANGUAGE_COLUMNS = (LANGUAGE_PERSISTENT, LANGUAGE_EVENTS)
+PERSISTENT_ROW_FIELDS = ("role", "content", "style", "timestamp", "camera", "tool_calls")
+EVENT_ROW_FIELDS = ("role", "content", "style", "camera", "tool_calls")
+
+CORE_STYLES = {
+ "subtask",
+ "plan",
+ "memory",
+ "motion",
+ "interjection",
+ "vqa",
+ "trace",
+ "task_aug",
+}
+# Project-local styles can be registered by a downstream module: add the
+# style to ``EXTENDED_STYLES`` and to ``PERSISTENT_STYLES`` or
+# ``EVENT_ONLY_STYLES`` before ``column_for_style`` is called, since that
+# function routes only on the latter two sets. Empty by default. Note that
+# ``STYLE_REGISTRY`` below is a snapshot built at import time, so styles
+# added to ``EXTENDED_STYLES`` afterwards must also be added to it.
+EXTENDED_STYLES: set[str] = set()
+STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES
+
+PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug"}
+EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"}
+
+# Styles whose ``content`` is grounded in a specific camera view. Rows of these
+# styles MUST carry a non-null ``camera`` referencing an ``observation.images.*``
+# feature key. Rows of every other style MUST have ``camera=None``. ``motion``
+# is intentionally NOT in this set: motion primitives are described in
+# robot-frame (joint / Cartesian) terms, not pixel space, so they are
+# camera-agnostic. ``trace`` is the pixel-trajectory event style and IS
+# view-dependent. The ``camera`` field nevertheless lives on
+# ``PERSISTENT_ROW_FIELDS`` too so the schema, validator, and resolver
+# behave symmetrically across the two columns; persistent rows simply
+# always have ``camera=None`` in practice today.
+VIEW_DEPENDENT_STYLES = {"vqa", "trace"}
+
+LanguageColumn = Literal["language_persistent", "language_events"]
+
+
+def _json_arrow_type() -> pa.DataType:
+ """Return the Arrow JSON type, falling back to ``string`` on older pyarrow."""
+ return pa.json_() if hasattr(pa, "json_") else pa.string()
+
+
+def _json_feature() -> object:
+ """Return the HF ``datasets`` JSON feature, falling back to a string value."""
+ return datasets.Json() if hasattr(datasets, "Json") else datasets.Value("string")
+
+
+def language_persistent_row_arrow_type() -> pa.StructType:
+ """Return the Arrow struct type for a single persistent language row.
+
+ Persistent rows carry their own ``timestamp`` because they represent a state
+ that became active at a specific moment and remains active until superseded.
+ ``timestamp`` is ``float32`` to match the timestamp dtype LeRobotDataset
+ uses for frame data.
+ """
+ return pa.struct(
+ [
+ pa.field("role", pa.string(), nullable=False),
+ pa.field("content", pa.string(), nullable=True),
+ pa.field("style", pa.string(), nullable=True),
+ pa.field("timestamp", pa.float32(), nullable=False),
+ pa.field("camera", pa.string(), nullable=True),
+ pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
+ ]
+ )
+
+
+def language_event_row_arrow_type() -> pa.StructType:
+ """Return the Arrow struct type for a single event language row.
+
+ Event rows have no ``timestamp`` field: each event is stored on the dataset
+ row whose frame timestamp is the event's firing time.
+ """
+ return pa.struct(
+ [
+ pa.field("role", pa.string(), nullable=False),
+ pa.field("content", pa.string(), nullable=True),
+ pa.field("style", pa.string(), nullable=True),
+ pa.field("camera", pa.string(), nullable=True),
+ pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
+ ]
+ )
+
+
+def language_persistent_arrow_type() -> pa.ListType:
+ """Return the Arrow list type for the ``language_persistent`` column."""
+ return pa.list_(language_persistent_row_arrow_type())
+
+
+def language_events_arrow_type() -> pa.ListType:
+ """Return the Arrow list type for the ``language_events`` column."""
+ return pa.list_(language_event_row_arrow_type())
+
+
+def language_persistent_row_feature() -> dict[str, object]:
+ """Return the HF ``datasets`` feature mapping for a persistent language row."""
+ return {
+ "role": datasets.Value("string"),
+ "content": datasets.Value("string"),
+ "style": datasets.Value("string"),
+ "timestamp": datasets.Value("float32"),
+ "camera": datasets.Value("string"),
+ "tool_calls": datasets.List(_json_feature()),
+ }
+
+
+def language_event_row_feature() -> dict[str, object]:
+ """Return the HF ``datasets`` feature mapping for an event language row."""
+ return {
+ "role": datasets.Value("string"),
+ "content": datasets.Value("string"),
+ "style": datasets.Value("string"),
+ "camera": datasets.Value("string"),
+ "tool_calls": datasets.List(_json_feature()),
+ }
+
+
+def language_persistent_column_feature() -> datasets.List:
+ """Return the HF ``datasets`` feature for the ``language_persistent`` column."""
+ return datasets.List(language_persistent_row_feature())
+
+
+def language_events_column_feature() -> datasets.List:
+ """Return the HF ``datasets`` feature for the ``language_events`` column."""
+ return datasets.List(language_event_row_feature())
+
+
+def language_feature_info() -> dict[str, dict]:
+ """Return the ``info["features"]`` entries for both language columns."""
+ return {
+ LANGUAGE_PERSISTENT: {"dtype": "language", "shape": (1,), "names": None},
+ LANGUAGE_EVENTS: {"dtype": "language", "shape": (1,), "names": None},
+ }
+
+
+def is_language_column(key: str) -> bool:
+ """Return ``True`` if ``key`` is one of the dataset's language column names."""
+ return key in LANGUAGE_COLUMNS
+
+
+def is_view_dependent_style(style: str | None) -> bool:
+ """Return ``True`` if rows of ``style`` must be tagged with a ``camera`` key."""
+ return style in VIEW_DEPENDENT_STYLES
+
+
+def validate_camera_field(style: str | None, camera: str | None) -> None:
+ """Enforce the ``camera`` invariant: required iff ``style`` is view-dependent.
+
+ Raises ``ValueError`` if a view-dependent style is missing ``camera`` or if
+ a non-view-dependent style carries one. Pipeline writers and the validator
+ should call this on every emitted row.
+ """
+ if is_view_dependent_style(style):
+ if not camera:
+ raise ValueError(
+ f"Rows of view-dependent style {style!r} require a non-empty 'camera' "
+ f"field referencing an 'observation.images.*' feature key."
+ )
+ elif camera is not None:
+ raise ValueError(f"Rows of style {style!r} must have camera=None; got camera={camera!r}.")
+
+
+# --- Tool registry --------------------------------------------------------
+# Tools declared on a dataset live in ``meta/info.json["tools"]`` as a list
+# of OpenAI-style function schemas. The runtime / training stack reads them
+# through :class:`LeRobotDatasetMetadata.tools` (with these constants as
+# fallback when the dataset doesn't declare any). Implementations live
+# under :mod:`lerobot.tools` (one file per tool); see
+# ``docs/source/tools.mdx`` for the authoring guide.
+
+SAY_TOOL_SCHEMA: dict = {
+ "type": "function",
+ "function": {
+ "name": "say",
+ "description": "Speak a short utterance to the user via the TTS executor.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "text": {
+ "type": "string",
+ "description": "The verbatim text to speak.",
+ }
+ },
+ "required": ["text"],
+ },
+ },
+}
+"""Canonical schema for the ``say`` tool emitted by the steerable
+annotation pipeline (PR 2 Module 2). Single source of truth — PR 2's
+writer, PR 3's runtime tool registry, and the dataset visualizer all
+import this constant rather than duplicating the dict."""
+
+DEFAULT_TOOLS: list[dict] = [SAY_TOOL_SCHEMA]
+"""Fallback tools list. Returned by ``LeRobotDatasetMetadata.tools``
+when ``meta/info.json["tools"]`` is unset, so unannotated datasets and
+chat-template consumers (``apply_chat_template(messages, tools=...)``)
+keep working out of the box."""
+
+
+def column_for_style(style: str | None) -> LanguageColumn:
+ """Map a language style to the column where rows of that style are stored.
+
+ Styles in :data:`PERSISTENT_STYLES` route to :data:`LANGUAGE_PERSISTENT`.
+ Styles in :data:`EVENT_ONLY_STYLES` and the implicit ``None`` style route
+ to :data:`LANGUAGE_EVENTS`.
+ """
+ if style is None:
+ return LANGUAGE_EVENTS
+ if style in PERSISTENT_STYLES:
+ return LANGUAGE_PERSISTENT
+ if style in EVENT_ONLY_STYLES:
+ return LANGUAGE_EVENTS
+ raise ValueError(f"Unknown language style: {style!r}")
diff --git a/src/lerobot/datasets/language_render.py b/src/lerobot/datasets/language_render.py
new file mode 100644
index 000000000..999fa19ad
--- /dev/null
+++ b/src/lerobot/datasets/language_render.py
@@ -0,0 +1,545 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import copy
+import hashlib
+import re
+from collections.abc import Sequence
+from typing import Any
+
+from lerobot.configs.recipe import DEFAULT_BINDINGS, PLACEHOLDER_RE, TrainingRecipe
+from lerobot.utils.utils import unwrap_scalar
+
+from .language import LANGUAGE_PERSISTENT, column_for_style
+
+LanguageRow = dict[str, Any]
+RenderedMessages = dict[str, list[Any]]
+
+_RESOLVER_RE = re.compile(r"^(?P<name>[A-Za-z_][A-Za-z0-9_]*)\((?P<args>.*)\)$")
+
+
+def active_at(
+ t: float,
+ *,
+ persistent: Sequence[LanguageRow],
+ style: str | None = None,
+ role: str | None = None,
+ tool_name: str | None = None,
+ camera: str | None = None,
+) -> LanguageRow | None:
+ """Return the persistent row of ``style`` that is active at time ``t``.
+
+ A persistent row is "active" at ``t`` when its own ``timestamp`` is the
+ most recent one ``<= t`` for the given ``style``/``role``/``tool_name``/
+ ``camera`` selector. Only valid for persistent styles.
+ """
+ _validate_persistent_resolver("active_at", style)
+ matches = [
+ row
+ for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera)
+ if _timestamp(row) <= t
+ ]
+ if not matches:
+ return None
+ latest_ts = max(_timestamp(row) for row in matches)
+ return _select_one(
+ [row for row in matches if _timestamp(row) == latest_ts],
+ style=style,
+ role=role,
+ tool_name=tool_name,
+ camera=camera,
+ )
+
+
+EMITTED_AT_TOLERANCE_S = 0.1
+"""Half-window for matching persistent rows to a frame timestamp in
+``emitted_at``. Persistent timestamps come from parquet (float32) and ``t``
+is also a float32 from parquet, so in the ideal hot path an exact match
+would suffice — but any caller that derives ``t`` arithmetically (e.g.
+``frame_idx / fps``) breaks bit-equality. A 0.1 s tolerance covers
+common arithmetic drift, at the cost of matching every frame within ±0.1 s
+of the row (±3 frames at 30 Hz, ±10 at 100 Hz). It also means two persistent
+rows of the same selector emitted within 0.1 s of each other cannot be
+told apart by ``emitted_at`` — acceptable because persistent annotations
+(subtask / plan / memory transitions) change on a human-action timescale,
+not at the camera frame rate."""
+
+
+def emitted_at(
+ t: float,
+ *,
+ persistent: Sequence[LanguageRow],
+ events: Sequence[LanguageRow],
+ style: str | None = None,
+ role: str | None = None,
+ tool_name: str | None = None,
+ camera: str | None = None,
+) -> LanguageRow | None:
+    """Return the row of ``style`` emitted at time ``t`` (persistent rows: within a tolerance).
+
+ For persistent styles, this matches persistent rows whose own ``timestamp``
+ is within ``EMITTED_AT_TOLERANCE_S`` of ``t`` (see that constant for why
+ we use a tolerance instead of bit-equality). For event styles, the
+ ``events`` list is assumed to come from the dataset row at frame ``t``
+ (event rows carry no timestamp of their own), so all matching event rows
+ are considered emitted at ``t``. ``camera`` filters by the row's
+ ``camera`` field — required to disambiguate when multiple view-dependent
+ rows share ``(t, role)`` across cameras.
+ """
+ if column_for_style(style) == LANGUAGE_PERSISTENT:
+ matches = [
+ row
+ for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera)
+ if abs(_timestamp(row) - t) <= EMITTED_AT_TOLERANCE_S
+ ]
+ else:
+ matches = _matching_rows(events, style=style, role=role, tool_name=tool_name, camera=camera)
+ return _select_one(matches, style=style, role=role, tool_name=tool_name, camera=camera)
+
+
+def nth_prev(
+ t: float,
+ *,
+ persistent: Sequence[LanguageRow],
+ style: str | None = None,
+ offset: int = 1,
+ role: str | None = None,
+ tool_name: str | None = None,
+ camera: str | None = None,
+) -> LanguageRow | None:
+ """Return the persistent row that was active ``offset`` steps before ``t``.
+
+ Walks back through chronologically sorted persistent rows of ``style``
+ (filtered by optional ``role``/``tool_name``/``camera``) and returns the
+ one ``offset`` positions before the row active at ``t``. Only valid for
+ persistent styles.
+ """
+ return _nth_relative("nth_prev", t, persistent, style, -offset, role, tool_name, camera)
+
+
+def nth_next(
+ t: float,
+ *,
+ persistent: Sequence[LanguageRow],
+ style: str | None = None,
+ offset: int = 1,
+ role: str | None = None,
+ tool_name: str | None = None,
+ camera: str | None = None,
+) -> LanguageRow | None:
+ """Return the persistent row that becomes active ``offset`` steps after ``t``.
+
+ Walks forward through chronologically sorted persistent rows of ``style``
+ (filtered by optional ``role``/``tool_name``/``camera``) and returns the
+ one ``offset`` positions after the row active at ``t``. Only valid for
+ persistent styles.
+ """
+ return _nth_relative("nth_next", t, persistent, style, offset, role, tool_name, camera)
+
+
+def render_sample(
+ *,
+ recipe: TrainingRecipe,
+ persistent: Sequence[LanguageRow] | None,
+ events: Sequence[LanguageRow] | None,
+ t: float,
+ sample_idx: int,
+ task: str | None = None,
+ dataset_ctx: Any | None = None,
+) -> RenderedMessages | None:
+ """Render the chat-style messages for a single dataset sample.
+
+ Resolves the recipe's bindings against ``persistent`` and ``events`` rows
+ at frame timestamp ``t``, then expands the recipe's message templates.
+ Returns ``None`` if the resolved sample contains no target message.
+ """
+ persistent_rows = _normalize_rows(persistent or [])
+ event_rows = _normalize_rows(events or [])
+ selected_recipe = _select_recipe(recipe, sample_idx)
+ bindings = _resolve_bindings(
+ selected_recipe,
+ persistent=persistent_rows,
+ events=event_rows,
+ t=t,
+ sample_idx=sample_idx,
+ task=task,
+ dataset_ctx=dataset_ctx,
+ )
+ return _render_message_recipe(selected_recipe, bindings)
+
+
+def _select_recipe(recipe: TrainingRecipe, sample_idx: int) -> TrainingRecipe:
+ """Pick a deterministic blend component for ``sample_idx`` (or return ``recipe``)."""
+ if recipe.blend is None:
+ return recipe
+
+ total_weight = sum(component.weight or 0.0 for component in recipe.blend.values())
+ if total_weight <= 0:
+ raise ValueError("Blend weights must sum to a positive value.")
+
+ digest = hashlib.blake2b(str(sample_idx).encode(), digest_size=8).digest()
+ draw = int.from_bytes(digest, "big") / 2**64 * total_weight
+ cumulative = 0.0
+ last_component: TrainingRecipe | None = None
+ for component in recipe.blend.values():
+ last_component = component
+ cumulative += component.weight or 0.0
+ if draw < cumulative:
+ return component
+ assert last_component is not None
+ return last_component
+
+
+def _resolve_bindings(
+ recipe: TrainingRecipe,
+ *,
+ persistent: Sequence[LanguageRow],
+ events: Sequence[LanguageRow],
+ t: float,
+ sample_idx: int,
+ task: str | None,
+ dataset_ctx: Any | None,
+) -> dict[str, LanguageRow | str | None]:
+ """Resolve every binding in ``recipe`` (plus ``task``) at time ``t``."""
+ bindings: dict[str, LanguageRow | str | None] = {
+ "task": _resolve_task(task, dataset_ctx, persistent=persistent, sample_idx=sample_idx),
+ }
+ specs = {**DEFAULT_BINDINGS, **(recipe.bindings or {})}
+ for name, spec in specs.items():
+ bindings[name] = _resolve_spec(spec, persistent=persistent, events=events, t=t)
+ return bindings
+
+
+def _resolve_task(
+ task: str | None,
+ dataset_ctx: Any | None,
+ *,
+ persistent: Sequence[LanguageRow] = (),
+ sample_idx: int = 0,
+) -> str | None:
+ """Return the task string for ``sample_idx``.
+
+ Resolution order:
+
+ 1. Explicit ``task`` override (caller-supplied) wins.
+ 2. If ``persistent`` contains rows of style ``task_aug`` (role=user),
+ deterministically pick one by ``sample_idx`` so each frame of an
+ episode rotates through the available rephrasings across an epoch.
+ This realizes Xiao 2022 / CAST-style task-prompt diversity without
+ changing ``meta/tasks.parquet`` and without forcing recipes to opt
+ in: ``${task}`` automatically picks a rephrasing when one exists,
+ and falls back to the canonical task otherwise. Recipes that want
+ the literal canonical task can override the binding.
+ 3. Otherwise read the canonical task from ``dataset_ctx`` (which is
+ backed by ``meta/tasks.parquet``).
+ """
+ if task is not None:
+ return task
+
+ aug_rows = [r for r in persistent if r.get("style") == "task_aug" and r.get("role") == "user"]
+ if aug_rows:
+ # Deterministic, blake2b-based pick keyed on sample_idx so the
+ # rotation is reproducible across runs (Python's built-in ``hash``
+ # is process-randomized).
+ digest = hashlib.blake2b(f"task_aug:{sample_idx}".encode(), digest_size=8).digest()
+ idx = int.from_bytes(digest, "big") % len(aug_rows)
+ chosen = aug_rows[idx].get("content")
+ if chosen:
+ return str(chosen)
+
+ if dataset_ctx is None:
+ return None
+ if isinstance(dataset_ctx, dict):
+ return dataset_ctx.get("task")
+ return getattr(dataset_ctx, "task", None)
+
+
+def _resolve_spec(
+ spec: str,
+ *,
+ persistent: Sequence[LanguageRow],
+ events: Sequence[LanguageRow],
+ t: float,
+) -> LanguageRow | None:
+ """Parse a single binding's resolver expression and dispatch to its function."""
+ match = _RESOLVER_RE.match(spec.strip())
+ if match is None:
+ raise ValueError(f"Invalid resolver expression: {spec!r}")
+ name = match.group("name")
+ kwargs = _parse_resolver_args(match.group("args"))
+ kwargs.pop("t_arg", None)
+
+ if name == "emitted_at":
+ return emitted_at(t, persistent=persistent, events=events, **kwargs)
+ if name == "active_at":
+ return active_at(t, persistent=persistent, **kwargs)
+ if name == "nth_prev":
+ return nth_prev(t, persistent=persistent, **kwargs)
+ if name == "nth_next":
+ return nth_next(t, persistent=persistent, **kwargs)
+ raise ValueError(f"Unknown language resolver: {name!r}")
+
+
+def _parse_resolver_args(args: str) -> dict[str, Any]:
+ """Parse a comma-separated resolver argument list into a kwargs dict."""
+ kwargs: dict[str, Any] = {}
+ if not args.strip():
+ return kwargs
+
+ parts = [part.strip() for part in args.split(",") if part.strip()]
+ for part in parts:
+ if part == "t":
+ kwargs["t_arg"] = True
+ continue
+ if "=" not in part:
+ raise ValueError(f"Invalid resolver argument: {part!r}")
+ key, value = (item.strip() for item in part.split("=", 1))
+ if key == "offset":
+ kwargs[key] = int(value)
+ else:
+ kwargs[key] = value.strip("\"'")
+ return kwargs
+
+
+def _render_message_recipe(
+ recipe: TrainingRecipe,
+ bindings: dict[str, LanguageRow | str | None],
+) -> RenderedMessages | None:
+ """Expand ``recipe.messages`` into rendered chat messages using ``bindings``."""
+ assert recipe.messages is not None
+ messages: list[dict[str, Any]] = []
+ streams: list[str | None] = []
+ target_indices: list[int] = []
+
+ for turn in recipe.messages:
+ if turn.if_present is not None and bindings.get(turn.if_present) is None:
+ continue
+
+ message = {"role": turn.role}
+ if turn.content is not None:
+ message["content"] = _render_content(turn.content, bindings)
+
+ if turn.tool_calls_from is not None:
+ row = bindings.get(turn.tool_calls_from)
+ tool_calls = row.get("tool_calls") if isinstance(row, dict) else None
+ if tool_calls:
+ message["tool_calls"] = copy.deepcopy(tool_calls)
+
+ message_idx = len(messages)
+ messages.append(message)
+ streams.append(turn.stream)
+ if turn.target:
+ target_indices.append(message_idx)
+
+ if not target_indices:
+ return None
+
+ rendered = {
+ "messages": messages,
+ "message_streams": streams,
+ "target_message_indices": target_indices,
+ }
+ _validate_rendered(rendered)
+ return rendered
+
+
+def _render_content(
+ content: str | list[dict[str, Any]],
+ bindings: dict[str, LanguageRow | str | None],
+) -> str | list[dict[str, Any]]:
+ """Substitute bindings into a string or each string field of multimodal blocks."""
+ if isinstance(content, str):
+ return _substitute(content, bindings)
+
+ rendered_blocks = []
+ for block in content:
+ rendered_block = copy.deepcopy(block)
+ for key, value in rendered_block.items():
+ if isinstance(value, str):
+ rendered_block[key] = _substitute(value, bindings)
+ rendered_blocks.append(rendered_block)
+ return rendered_blocks
+
+
+def _substitute(template: str, bindings: dict[str, LanguageRow | str | None]) -> str:
+ """Replace ``${name}`` placeholders in ``template`` with their bound values."""
+
+ def replace(match: re.Match[str]) -> str:
+ """Resolve a single ``${name}`` match to its bound string value."""
+ name = match.group(1)
+ if name not in bindings:
+ raise ValueError(f"Unknown template binding: {name!r}")
+ value = bindings[name]
+ if value is None:
+ return ""
+ if isinstance(value, dict):
+ content = value.get("content")
+ return "" if content is None else str(content)
+ return str(value)
+
+ return PLACEHOLDER_RE.sub(replace, template)
+
+
+def _validate_rendered(rendered: RenderedMessages) -> None:
+ """Sanity-check the rendered output for stream/target alignment."""
+ messages = rendered["messages"]
+ streams = rendered["message_streams"]
+ target_indices = rendered["target_message_indices"]
+
+ if len(streams) != len(messages):
+ raise ValueError("message_streams must be aligned with messages.")
+ if not target_indices:
+ raise ValueError("Rendered samples must contain at least one target message.")
+ for idx in target_indices:
+ if idx < 0 or idx >= len(messages):
+ raise ValueError(f"Target message index {idx} is out of bounds.")
+ # ``stream`` is enforced non-None at MessageTurn construction time
+ # (see ``MessageTurn.__post_init__``), so a missing stream here would
+ # mean the dataclass invariant was bypassed; no need to re-check.
+
+
+def _nth_relative(
+ name: str,
+ t: float,
+ persistent: Sequence[LanguageRow],
+ style: str | None,
+ offset: int,
+ role: str | None,
+ tool_name: str | None,
+ camera: str | None,
+) -> LanguageRow | None:
+ """Shared body for ``nth_prev`` / ``nth_next`` with signed ``offset``."""
+ _validate_persistent_resolver(name, style)
+ if abs(offset) < 1:
+ raise ValueError(f"{name} offset must be non-zero.")
+
+ rows = sorted(
+ _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera),
+ key=_row_sort_key,
+ )
+ if not rows:
+ return None
+
+ anchor_idx = None
+ for idx, row in enumerate(rows):
+ if _timestamp(row) <= t:
+ anchor_idx = idx
+ else:
+ break
+
+ target_idx = (offset - 1 if offset > 0 else None) if anchor_idx is None else anchor_idx + offset
+
+ if target_idx is None or target_idx < 0 or target_idx >= len(rows):
+ return None
+ return rows[target_idx]
+
+
+def _validate_persistent_resolver(name: str, style: str | None) -> None:
+ """Reject calls with missing or event-only ``style`` for persistent resolvers."""
+ if style is None:
+ raise ValueError(f"{name} requires a persistent style.")
+ if column_for_style(style) != LANGUAGE_PERSISTENT:
+ raise ValueError(f"{name} cannot be used with event-only style {style!r}.")
+
+
+def _matching_rows(
+ rows: Sequence[LanguageRow],
+ *,
+ style: str | None,
+ role: str | None,
+ tool_name: str | None,
+ camera: str | None,
+) -> list[LanguageRow]:
+ """Return ``rows`` filtered by optional ``style``/``role``/``tool_name``/``camera`` selectors."""
+ return [
+ row
+ for row in rows
+ if (style is None or row.get("style") == style)
+ and (role is None or row.get("role") == role)
+ and (tool_name is None or _row_has_tool_name(row, tool_name))
+ and (camera is None or row.get("camera") == camera)
+ ]
+
+
+def _select_one(
+ rows: Sequence[LanguageRow],
+ *,
+ style: str | None,
+ role: str | None,
+ tool_name: str | None,
+ camera: str | None,
+) -> LanguageRow | None:
+ """Return the single matching row, or raise if the resolver is ambiguous.
+
+ Multiple matches always raise — even when the caller already passed
+ some selectors — because remaining ambiguity means the data has
+ several rows that look identical to the resolver and the caller
+ needs to pin down a specific one (e.g. add ``camera=...`` for VQA
+ rows shared across cameras).
+ """
+ if not rows:
+ return None
+ if len(rows) > 1:
+ raise ValueError(
+ f"Ambiguous resolver for style={style!r} role={role!r} "
+ f"tool_name={tool_name!r} camera={camera!r}: {len(rows)} matching rows. "
+ f"Add a selector that distinguishes them."
+ )
+ return rows[0]
+
+
+def _row_sort_key(row: LanguageRow) -> tuple[float, str, str]:
+ """Stable sort key for both persistent and event rows.
+
+ Event rows lack ``timestamp`` (it is implicit in the frame), so default
+ to ``0.0`` — within a single frame all event rows share the same sort
+ bucket and are tiebroken by ``(style, role)``.
+ """
+ timestamp = row.get("timestamp")
+ ts = float(unwrap_scalar(timestamp)) if timestamp is not None else 0.0
+ return (ts, row.get("style") or "", row.get("role") or "")
+
+
+def _timestamp(row: LanguageRow) -> float:
+ """Extract a row's ``timestamp`` as a Python float (unwrapping numpy scalars)."""
+ return float(unwrap_scalar(row["timestamp"]))
+
+
+def _row_has_tool_name(row: LanguageRow, tool_name: str) -> bool:
+ """Return ``True`` if any of the row's tool calls invokes ``tool_name``."""
+ for tool_call in row.get("tool_calls") or []:
+ if isinstance(tool_call, str):
+ continue
+ function = tool_call.get("function") if isinstance(tool_call, dict) else None
+ if isinstance(function, dict) and function.get("name") == tool_name:
+ return True
+ return False
+
+
+def _normalize_rows(rows: Sequence[Any]) -> list[LanguageRow]:
+ """Convert pyarrow scalars / mappings into a fresh list of plain dict rows."""
+ normalized = []
+ for row in rows:
+ if row is None:
+ continue
+ if hasattr(row, "as_py"):
+ row = row.as_py()
+ if not isinstance(row, dict):
+ raise TypeError(f"Language rows must be dictionaries, got {type(row).__name__}.")
+ normalized.append(dict(row))
+ return normalized
diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index ab55aa9f8..9734bcc74 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -24,6 +24,7 @@ import torch.utils
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.errors import RevisionNotFoundError
+from lerobot.configs import VideoEncoderConfig
from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE
from .dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
@@ -36,8 +37,7 @@ from .utils import (
)
from .video_utils import (
StreamingVideoEncoder,
- get_safe_default_codec,
- resolve_vcodec,
+ get_safe_default_video_backend,
)
logger = logging.getLogger(__name__)
@@ -59,10 +59,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None,
return_uint8: bool = False,
batch_encoding_size: int = 1,
- vcodec: str = "libsvtav1",
+ camera_encoder: VideoEncoderConfig | None = None,
+ encoder_threads: int | None = None,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
- encoder_threads: int | None = None,
):
"""
2 modes are available for instantiating this class, depending on 2 different use cases:
@@ -183,16 +183,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
- vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
- 'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'.
- Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder.
+ camera_encoder (VideoEncoderConfig | None, optional): Video encoder settings for cameras
+ (codec, quality, etc.). When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults`
+ is used by the writer.
+ encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
+ codec decide.
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False.
encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using
streaming encoding. Defaults to 30 (~1s at 30fps).
- encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the
- codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for
- libsvtav1 and 'threads' for h264/hevc.
Note:
Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to
@@ -207,10 +206,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.delta_timestamps = delta_timestamps
self.tolerance_s = tolerance_s
self.revision = revision if revision else CODEBASE_VERSION
- self._video_backend = video_backend if video_backend else get_safe_default_codec()
+ self._video_backend = video_backend if video_backend else get_safe_default_video_backend()
self._return_uint8 = return_uint8
self._batch_encoding_size = batch_encoding_size
- self._vcodec = resolve_vcodec(vcodec)
self._encoder_threads = encoder_threads
if self._requested_root is not None:
@@ -273,12 +271,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = None
if streaming_encoding and len(self.meta.video_keys) > 0:
streaming_enc = self._build_streaming_encoder(
- self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads
+ self.meta.fps,
+ camera_encoder,
+ encoder_queue_maxsize,
+ encoder_threads,
)
self.writer = DatasetWriter(
meta=self.meta,
root=self.root,
- vcodec=self._vcodec,
+ camera_encoder=camera_encoder,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
@@ -320,17 +321,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
@staticmethod
def _build_streaming_encoder(
fps: int,
- vcodec: str,
+ camera_encoder: VideoEncoderConfig | None,
encoder_queue_maxsize: int,
encoder_threads: int | None,
) -> StreamingVideoEncoder:
return StreamingVideoEncoder(
fps=fps,
- vcodec=vcodec,
- pix_fmt="yuv420p",
- g=2,
- crf=30,
- preset=None,
+ camera_encoder=camera_encoder,
queue_maxsize=encoder_queue_maxsize,
encoder_threads=encoder_threads,
)
@@ -647,7 +644,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
image_writer_threads: int = 0,
video_backend: str | None = None,
batch_encoding_size: int = 1,
- vcodec: str = "libsvtav1",
+ camera_encoder: VideoEncoderConfig | None = None,
metadata_buffer_size: int = 10,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
@@ -678,20 +675,20 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend (used when reading back).
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos. ``1`` means encode immediately.
- vcodec: Video codec for encoding. Options include ``'libsvtav1'``,
- ``'h264'``, ``'hevc'``, ``'auto'``.
+ camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
+ When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
+ encoder_threads: Number of encoder threads (global). ``None``
+ lets the codec decide.
metadata_buffer_size: Number of episode metadata records to buffer
before flushing to parquet.
streaming_encoding: If ``True``, encode video frames in real-time
during capture instead of writing images first.
encoder_queue_maxsize: Max buffered frames per camera when using
streaming encoding.
- encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns:
A new :class:`LeRobotDataset` in write mode.
"""
- vcodec = resolve_vcodec(vcodec)
obj = cls.__new__(cls)
obj.meta = LeRobotDatasetMetadata.create(
repo_id=repo_id,
@@ -712,23 +709,23 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.image_transforms = None
obj.delta_timestamps = None
obj.episodes = None
- obj._video_backend = video_backend if video_backend is not None else get_safe_default_codec()
+ obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend()
obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size
- obj._vcodec = vcodec
obj._encoder_threads = encoder_threads
# Reader is lazily created on first access (write-only mode)
obj.reader = None
- # Create writer
streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0:
- streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads)
+ streaming_enc = cls._build_streaming_encoder(
+ fps, camera_encoder, encoder_queue_maxsize, encoder_threads
+ )
obj.writer = DatasetWriter(
meta=obj.meta,
root=obj.root,
- vcodec=vcodec,
+ camera_encoder=camera_encoder,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
@@ -751,12 +748,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
force_cache_sync: bool = False,
video_backend: str | None = None,
batch_encoding_size: int = 1,
- vcodec: str = "libsvtav1",
+ camera_encoder: VideoEncoderConfig | None = None,
+ encoder_threads: int | None = None,
image_writer_processes: int = 0,
image_writer_threads: int = 0,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
- encoder_threads: int | None = None,
) -> "LeRobotDataset":
"""Resume recording on an existing dataset.
@@ -779,13 +776,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend for reading back data.
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos.
- vcodec: Video codec for encoding.
+ camera_encoder: Video encoder settings for cameras (codec, quality, etc.).
+ When ``None``, :func:`~lerobot.configs.video.camera_encoder_defaults` is used.
+ encoder_threads: Number of encoder threads (global). ``None``
+ lets the codec decide.
image_writer_processes: Subprocesses for async image writing.
image_writer_threads: Threads for async image writing.
streaming_encoding: If ``True``, encode video in real-time during
capture.
encoder_queue_maxsize: Max buffered frames per camera for streaming.
- encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns:
A :class:`LeRobotDataset` in write mode, ready to append episodes.
@@ -796,7 +795,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
"Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt "
"the shared cache. Please provide a local directory path."
)
- vcodec = resolve_vcodec(vcodec)
obj = cls.__new__(cls)
obj.repo_id = repo_id
obj._requested_root = Path(root)
@@ -805,11 +803,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.image_transforms = None
obj.delta_timestamps = None
obj.episodes = None
- obj._video_backend = video_backend if video_backend else get_safe_default_codec()
+ obj._video_backend = video_backend if video_backend else get_safe_default_video_backend()
obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size
- obj._vcodec = vcodec
- obj._encoder_threads = encoder_threads
if obj._requested_root is not None:
obj._requested_root.mkdir(exist_ok=True, parents=True)
@@ -818,21 +814,22 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.meta = LeRobotDatasetMetadata(
obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync
)
+
+ obj._encoder_threads = encoder_threads
obj.root = obj.meta.root
# Reader is lazily created on first access (write-only mode)
obj.reader = None
- # Create writer for appending
streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder(
- obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads
+ obj.meta.fps, camera_encoder, encoder_queue_maxsize, encoder_threads
)
obj.writer = DatasetWriter(
meta=obj.meta,
root=obj.root,
- vcodec=vcodec,
+ camera_encoder=camera_encoder,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
diff --git a/src/lerobot/datasets/pyav_utils.py b/src/lerobot/datasets/pyav_utils.py
new file mode 100644
index 000000000..d291f8b40
--- /dev/null
+++ b/src/lerobot/datasets/pyav_utils.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyAV-based compatibility checks for :class:`VideoEncoderConfig`.
+
+Centralises all :mod:`av` introspection of the bundled FFmpeg build. The private helpers skip
+codec-specific checks for unavailable codecs; :func:`check_video_encoder_parameters_pyav` raises.
+"""
+
+import functools
+import logging
+from typing import Any
+
+import av
+
+logger = logging.getLogger(__name__)
+
+FFMPEG_NUMERIC_OPTION_TYPES = ("INT", "INT64", "UINT64", "FLOAT", "DOUBLE")
+FFMPEG_INTEGER_OPTION_TYPES = ("INT", "INT64", "UINT64")
+
+
+@functools.cache
+def get_codec(vcodec: str) -> av.codec.Codec | None:
+    """Return the PyAV write-mode ``Codec`` for *vcodec*, or ``None`` if constructing it raises (cached)."""
+    try:
+        return av.codec.Codec(vcodec, "w")
+    except Exception:
+        return None
+
+
+@functools.cache
+def _get_codec_options_by_name(vcodec: str) -> dict[str, av.option.Option]:
+    """Map private-option name to PyAV ``Option`` for *vcodec* (empty if unavailable or option-less)."""
+    codec = get_codec(vcodec)
+    # NOTE(review): PyAV appears to expose ``descriptor=None`` for codecs without a private class.
+    descriptor = codec.descriptor if codec is not None else None
+    return {opt.name: opt for opt in (descriptor.options if descriptor is not None else ())}
+
+
+@functools.cache
+def _get_codec_video_formats(vcodec: str) -> tuple[str, ...]:
+    """Names of the pixel formats *vcodec* reports, in the order PyAV lists them (``()`` if none)."""
+    encoder = get_codec(vcodec)
+    reported = (encoder.video_formats or []) if encoder is not None else []
+    names = [pix.name for pix in reported]
+    return tuple(names)
+
+
+def detect_available_encoders_pyav(encoders: list[str] | str) -> list[str]:
+    """Filter *encoders* down to the names PyAV can open as video encoders.
+
+    A bare string is treated as a single-element list; the input order is kept.
+    """
+    candidates = [encoders] if isinstance(encoders, str) else encoders
+
+    usable: list[str] = []
+    for encoder_name in candidates:
+        # Probe each name on its own; get_codec() caches the result per name.
+        codec = get_codec(encoder_name)
+        if codec is None or codec.type != "video":
+            logger.debug("encoder '%s' not available as video encoder", encoder_name)
+            continue
+        usable.append(encoder_name)
+    return usable
+
+
+def _check_option_value(vcodec: str, label: str, value: Any, opt: av.option.Option) -> None:
+    """Range-check numeric *value* and choice-check string *value* against *opt*.
+
+    Options of any other FFmpeg type are accepted unchecked; violations raise ``ValueError``.
+    """
+    type_name = opt.type.name
+    if type_name in FFMPEG_NUMERIC_OPTION_TYPES:
+        if isinstance(value, bool):
+            raise ValueError(
+                f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option."
+            )
+        elif isinstance(value, str):
+            try:
+                num_val = float(value)
+            except ValueError as e:
+                raise ValueError(
+                    f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option."
+                ) from e
+        elif isinstance(value, (float, int)):
+            num_val = value
+        else:
+            raise ValueError(
+                f"{label}={value!r} is not numeric; codec {vcodec!r} expects a number for this option."
+            )
+
+        # Check integer type compatibility (ints are never asked: int.is_integer() needs Python 3.12+)
+        is_fractional = isinstance(num_val, float) and not num_val.is_integer()
+        if type_name in FFMPEG_INTEGER_OPTION_TYPES and is_fractional:
+            raise ValueError(
+                f"{label}={num_val!r} must be an integer for codec {vcodec!r} "
+                f"(FFmpeg option {opt.name!r} is {type_name}); float values are not allowed."
+            )
+
+        # Check numeric range compatibility
+        lo, hi = float(opt.min), float(opt.max)
+        if lo < hi and not (lo <= num_val <= hi):
+            raise ValueError(
+                f"{label}={num_val} is out of range for codec {vcodec!r}; must be in [{lo}, {hi}]"
+            )
+    elif type_name == "STRING":
+        if isinstance(value, bool):
+            raise ValueError(f"{label}={value!r} is not a valid string value for codec {vcodec!r}.")
+        if isinstance(value, str):
+            str_val = value
+        elif isinstance(value, (int, float)):
+            str_val = str(value)
+        else:
+            raise ValueError(f"{label}={value!r} has unsupported type for STRING option on codec {vcodec!r}")
+        # Check string choice compatibility
+        choices = [c.name for c in (opt.choices or [])]
+        if choices and str_val not in choices:
+            raise ValueError(
+                f"{label}={str_val!r} is not a supported choice for codec "
+                f"{vcodec!r}; valid choices: {choices}"
+            )
+
+
+def _check_pixel_format(vcodec: str, pix_fmt: str) -> None:
+    """Raise ``ValueError`` when *vcodec* lists pixel formats and *pix_fmt* is not among them."""
+    supported = _get_codec_video_formats(vcodec)
+    if supported and pix_fmt not in supported:
+        message = f"pix_fmt={pix_fmt!r} is not supported by codec {vcodec!r}; "
+        message += f"supported pixel formats: {list(supported)}"
+        raise ValueError(message)
+
+
+def _check_codec_options(vcodec: str, codec_options: dict[str, Any]) -> None:
+    """Check each entry of *codec_options*: ``g`` generically, the rest against the codec's AVOptions."""
+    known = _get_codec_options_by_name(vcodec)
+    for name, setting in codec_options.items():
+        if name == "g":
+            # GOP size is not a codec-specific option, so it is validated here instead.
+            valid_gop = isinstance(setting, int) and not isinstance(setting, bool) and setting >= 1
+            if not valid_gop:
+                raise ValueError(f"g={setting!r} must be a positive integer for codec {vcodec!r}")
+        elif name in known:
+            # Options the codec does not publish are skipped rather than rejected.
+            _check_option_value(vcodec, name, setting, known[name])
+
+
+def check_video_encoder_parameters_pyav(vcodec: str, pix_fmt: str, codec_options: dict[str, Any]) -> None:
+    """Verify encoder parameters are compatible with the bundled FFmpeg build.
+
+    Checks *pix_fmt* against the formats *vcodec* accepts, then each entry of
+    *codec_options* (the merged encoder options from
+    :meth:`~lerobot.configs.video.VideoEncoderConfig.get_codec_options`) against PyAV.
+    Options the codec does not publish are skipped.
+
+    Raises:
+        ValueError: if *vcodec* is unavailable, or on the first incompatibility encountered.
+    """
+    # Probe the codec itself: an empty option table does not mean the encoder is missing.
+    if get_codec(vcodec) is None:
+        raise ValueError(f"Codec {vcodec!r} is not available in the bundled FFmpeg build")
+    _check_pixel_format(vcodec, pix_fmt)
+    _check_codec_options(vcodec, codec_options)
diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py
index 715bd2f9b..de91978ea 100644
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -88,7 +88,6 @@ VIDEO_DIR = "videos"
CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
DEFAULT_TASKS_PATH = "meta/tasks.parquet"
-DEFAULT_SUBTASKS_PATH = "meta/subtasks.parquet"
DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
@@ -130,6 +129,9 @@ class DatasetInfo:
# Optional metadata
robot_type: str | None = None
splits: dict[str, str] = field(default_factory=dict)
+ # OpenAI-style tool schemas declared by the dataset. ``None`` means the
+ # dataset doesn't declare any — readers fall back to ``DEFAULT_TOOLS``.
+ tools: list[dict] | None = None
def __post_init__(self) -> None:
# Coerce feature shapes from list to tuple — JSON deserialisation
@@ -151,11 +153,15 @@ class DatasetInfo:
"""Return a JSON-serialisable dict.
Converts tuple shapes back to lists so ``json.dump`` can handle them.
+ Drops ``tools`` when unset so existing datasets keep a clean
+ ``info.json``.
"""
d = dataclasses.asdict(self)
for ft in d["features"].values():
if isinstance(ft.get("shape"), tuple):
ft["shape"] = list(ft["shape"])
+ if d.get("tools") is None:
+ d.pop("tools", None)
return d
@classmethod
diff --git a/src/lerobot/datasets/video_utils.py b/src/lerobot/datasets/video_utils.py
index 00ff09ee7..84ab56e08 100644
--- a/src/lerobot/datasets/video_utils.py
+++ b/src/lerobot/datasets/video_utils.py
@@ -17,12 +17,14 @@ import contextlib
import glob
import importlib
import logging
+import os
import queue
import shutil
import tempfile
import threading
import warnings
-from dataclasses import dataclass, field
+from collections import OrderedDict
+from dataclasses import asdict, dataclass, field
from fractions import Fraction
from pathlib import Path
from threading import Lock
@@ -36,86 +38,14 @@ import torch
from datasets.features.features import register_feature
from PIL import Image
-from lerobot.utils.import_utils import get_safe_default_codec
+from lerobot.configs import (
+ VideoEncoderConfig,
+ camera_encoder_defaults,
+)
+from lerobot.utils.import_utils import get_safe_default_video_backend
logger = logging.getLogger(__name__)
-# List of hardware encoders to probe for auto-selection. Availability depends on the platform and FFmpeg build.
-# Determines the order of preference for auto-selection when vcodec="auto" is used.
-HW_ENCODERS = [
- "h264_videotoolbox", # macOS
- "hevc_videotoolbox", # macOS
- "h264_nvenc", # NVIDIA GPU
- "hevc_nvenc", # NVIDIA GPU
- "h264_vaapi", # Linux Intel/AMD
- "h264_qsv", # Intel Quick Sync
-]
-
-VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1", "auto"} | set(HW_ENCODERS)
-
-
-def _get_codec_options(
- vcodec: str,
- g: int | None = 2,
- crf: int | None = 30,
- preset: int | None = None,
-) -> dict:
- """Build codec-specific options dict for video encoding."""
- options = {}
-
- # GOP size (keyframe interval) - supported by VideoToolbox and software encoders
- if g is not None and (vcodec in ("h264_videotoolbox", "hevc_videotoolbox") or vcodec not in HW_ENCODERS):
- options["g"] = str(g)
-
- # Quality control (codec-specific parameter names)
- if crf is not None:
- if vcodec in ("h264", "hevc", "libsvtav1"):
- options["crf"] = str(crf)
- elif vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
- quality = max(1, min(100, int(100 - crf * 2)))
- options["q:v"] = str(quality)
- elif vcodec in ("h264_nvenc", "hevc_nvenc"):
- options["rc"] = "constqp"
- options["qp"] = str(crf)
- elif vcodec in ("h264_vaapi",):
- options["qp"] = str(crf)
- elif vcodec in ("h264_qsv",):
- options["global_quality"] = str(crf)
-
- # Preset (only for libsvtav1)
- if vcodec == "libsvtav1":
- options["preset"] = str(preset) if preset is not None else "12"
-
- return options
-
-
-def detect_available_hw_encoders() -> list[str]:
- """Probe PyAV/FFmpeg for available hardware video encoders."""
- available = []
- for codec_name in HW_ENCODERS:
- try:
- av.codec.Codec(codec_name, "w")
- available.append(codec_name)
- except Exception: # nosec B110
- logger.debug("HW encoder '%s' not available", codec_name) # nosec B110
- return available
-
-
-def resolve_vcodec(vcodec: str) -> str:
- """Validate vcodec and resolve 'auto' to best available HW encoder, fallback to libsvtav1."""
- if vcodec not in VALID_VIDEO_CODECS:
- raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
- if vcodec != "auto":
- logger.info(f"Using video codec: {vcodec}")
- return vcodec
- available = detect_available_hw_encoders()
- for encoder in HW_ENCODERS:
- if encoder in available:
- logger.info(f"Auto-selected video codec: {encoder}")
- return encoder
- logger.info("No hardware encoder available, falling back to software encoder 'libsvtav1'")
- return "libsvtav1"
-
def decode_video_frames(
video_path: Path | str,
@@ -143,7 +73,7 @@ def decode_video_frames(
Currently supports torchcodec on cpu and pyav.
"""
if backend is None:
- backend = get_safe_default_codec()
+ backend = get_safe_default_video_backend()
if backend == "torchcodec":
return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, return_uint8=return_uint8)
elif backend == "pyav":
@@ -263,15 +193,70 @@ def decode_video_frames_pyav(
return closest_frames
-class VideoDecoderCache:
- """Thread-safe cache for video decoders to avoid expensive re-initialization."""
+DEFAULT_DECODER_CACHE_SIZE = 100
+"""Default LRU capacity for :class:`VideoDecoderCache`.
- def __init__(self):
- self._cache: dict[str, tuple[Any, Any]] = {}
+Sized to comfortably hold a small rolling window of episodes worth of decoders
+(typical recipes: 2-4 cameras per episode × tens of episodes in flight) while
+bounding host RAM. Each cached entry retains a torchcodec ``VideoDecoder`` plus
+an open ``fsspec`` file handle — on the order of a few MB per entry. Override
+via the ``LEROBOT_VIDEO_DECODER_CACHE_SIZE`` env var or by passing ``max_size``
+to the constructor (``None`` restores the legacy unbounded behaviour).
+"""
+
+
+def _default_max_cache_size() -> int | None:
+    """Resolve the cache capacity from ``LEROBOT_VIDEO_DECODER_CACHE_SIZE`` (``None`` means unbounded)."""
+    setting = os.environ.get("LEROBOT_VIDEO_DECODER_CACHE_SIZE")
+    if setting is None:
+        return DEFAULT_DECODER_CACHE_SIZE
+    setting = setting.strip().lower()
+    if setting in {"", "none", "unbounded", "-1"}:
+        return None
+    try:
+        capacity = int(setting)
+    except ValueError as e:
+        message = f"LEROBOT_VIDEO_DECODER_CACHE_SIZE must be an integer, 'none', or '-1'; got {setting!r}"
+        raise ValueError(message) from e
+    if capacity <= 0:
+        raise ValueError(f"LEROBOT_VIDEO_DECODER_CACHE_SIZE must be positive; got {capacity}")
+    return capacity
+
+
+class VideoDecoderCache:
+ """Thread-safe LRU cache for torchcodec ``VideoDecoder`` instances.
+
+ Cached entries hold a ``VideoDecoder`` plus the open ``fsspec`` file handle
+ backing it. When the cache is full and a new path is requested, the
+ least-recently-used entry is evicted and its file handle is closed. This
+ bounds host-RAM growth when iterating over datasets with many distinct
+ video files (otherwise each ``DataLoader`` worker pins every decoder it has
+ ever opened until the process exits).
+
+ Args:
+ max_size: Maximum number of decoders to retain. ``None`` disables
+ eviction and restores legacy unbounded behaviour. Defaults to the
+ value of ``LEROBOT_VIDEO_DECODER_CACHE_SIZE`` if set, otherwise
+ :data:`DEFAULT_DECODER_CACHE_SIZE`.
+ """
+
+ _SENTINEL: ClassVar[object] = object()
+
+ def __init__(self, max_size: int | None | object = _SENTINEL):
+ if max_size is VideoDecoderCache._SENTINEL:
+ max_size = _default_max_cache_size()
+ if max_size is not None and max_size <= 0:
+ raise ValueError(f"max_size must be positive or None; got {max_size}")
+ self.max_size: int | None = max_size # type: ignore[assignment]
+ self._cache: OrderedDict[str, tuple[Any, Any]] = OrderedDict()
self._lock = Lock()
+ def __contains__(self, video_path: object) -> bool:
+ with self._lock:
+ return str(video_path) in self._cache
+
def get_decoder(self, video_path: str):
- """Get a cached decoder or create a new one."""
+ """Get a cached decoder or create a new one, evicting LRU if at capacity."""
if importlib.util.find_spec("torchcodec"):
from torchcodec.decoders import VideoDecoder
else:
@@ -283,22 +268,36 @@ class VideoDecoderCache:
video_path = str(video_path)
with self._lock:
- if video_path not in self._cache:
- file_handle = fsspec.open(video_path).__enter__()
- try:
- decoder = VideoDecoder(file_handle, seek_mode="approximate")
- except Exception:
- file_handle.close()
- raise
- self._cache[video_path] = (decoder, file_handle)
+ entry = self._cache.get(video_path)
+ if entry is not None:
+ self._cache.move_to_end(video_path)
+ return entry[0]
- return self._cache[video_path][0]
+ file_handle = fsspec.open(video_path).__enter__()
+ try:
+ decoder = VideoDecoder(file_handle, seek_mode="approximate")
+ except Exception:
+ file_handle.close()
+ raise
+ self._cache[video_path] = (decoder, file_handle)
+
+ # Evict LRU entries until we are back under the cap. We close
+ # evicted file handles immediately; the associated ``VideoDecoder``
+ # is released to the GC when its last reference goes away.
+ if self.max_size is not None:
+ while len(self._cache) > self.max_size:
+ _evicted_path, (_evicted_decoder, evicted_handle) = self._cache.popitem(last=False)
+ with contextlib.suppress(Exception):
+ evicted_handle.close()
+
+ return decoder
def clear(self):
- """Clear the cache and close file handles."""
+ """Clear the cache and close all file handles."""
with self._lock:
for _, file_handle in self._cache.values():
- file_handle.close()
+ with contextlib.suppress(Exception):
+ file_handle.close()
self._cache.clear()
def size(self) -> int:
@@ -407,18 +406,17 @@ def encode_video_frames(
imgs_dir: Path | str,
video_path: Path | str,
fps: int,
- vcodec: str = "libsvtav1",
- pix_fmt: str = "yuv420p",
- g: int | None = 2,
- crf: int | None = 30,
- fast_decode: int = 0,
+ camera_encoder: VideoEncoderConfig | None = None,
+ encoder_threads: int | None = None,
+ *,
log_level: int | None = av.logging.WARNING,
overwrite: bool = False,
- preset: int | None = None,
- encoder_threads: int | None = None,
) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
- vcodec = resolve_vcodec(vcodec)
+ if camera_encoder is None:
+ camera_encoder = camera_encoder_defaults()
+ vcodec = camera_encoder.vcodec
+ pix_fmt = camera_encoder.pix_fmt
video_path = Path(video_path)
imgs_dir = Path(imgs_dir)
@@ -429,42 +427,18 @@ def encode_video_frames(
video_path.parent.mkdir(parents=True, exist_ok=True)
- # Encoders/pixel formats incompatibility check
- if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
- logger.warning(
- f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
- )
- pix_fmt = "yuv420p"
-
# Get input frames
template = "frame-" + ("[0-9]" * 6) + ".png"
input_list = sorted(
glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("-")[-1].split(".")[0])
)
- # Define video output frame size (assuming all input frames are the same size)
if len(input_list) == 0:
raise FileNotFoundError(f"No images found in {imgs_dir}.")
with Image.open(input_list[0]) as dummy_image:
width, height = dummy_image.size
- # Define video codec options
- video_options = _get_codec_options(vcodec, g, crf, preset)
-
- if fast_decode:
- key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
- value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
- video_options[key] = value
-
- if encoder_threads is not None:
- if vcodec == "libsvtav1":
- lp_param = f"lp={encoder_threads}"
- if "svtav1-params" in video_options:
- video_options["svtav1-params"] += f":{lp_param}"
- else:
- video_options["svtav1-params"] = lp_param
- else:
- video_options["threads"] = str(encoder_threads)
+ video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True)
# Set logging level
if log_level is not None:
@@ -500,8 +474,97 @@ def encode_video_frames(
raise OSError(f"Video encoding did not work. File not found: {video_path}.")
+def reencode_video(
+    input_video_path: Path | str,
+    output_video_path: Path | str,
+    camera_encoder: VideoEncoderConfig | None = None,
+    encoder_threads: int | None = None,
+    log_level: int | None = av.logging.WARNING,
+    overwrite: bool = False,
+) -> None:
+    """Re-encode a video file using the given encoder configuration.
+
+    Args:
+        input_video_path: Existing video file to read.
+        output_video_path: Path for the re-encoded file.
+        camera_encoder: Encoder configuration. Defaults to :func:`camera_encoder_defaults`.
+        encoder_threads: Optional thread count forwarded to :meth:`VideoEncoderConfig.get_codec_options`.
+        log_level: libav log level while encoding, or ``None`` to leave logging unchanged. Defaults to WARNING.
+        overwrite: When ``False`` and ``output_video_path`` already exists, skip and log a warning.
+
+    Raises:
+        ValueError: If ``input_video_path`` contains no video stream.
+        OSError: If ``output_video_path`` does not exist once encoding has finished.
+    """
+    camera_encoder = camera_encoder or camera_encoder_defaults()
+    output_video_path = Path(output_video_path)
+    if output_video_path.exists() and not overwrite:
+        logger.warning(f"Video file already exists: {output_video_path}. Skipping re-encode.")
+        return
+
+    output_video_path.parent.mkdir(parents=True, exist_ok=True)
+    video_options = camera_encoder.get_codec_options(encoder_threads, as_strings=True)
+    vcodec = camera_encoder.vcodec
+    pix_fmt = camera_encoder.pix_fmt
+    # Reserve a temporary path; the result is moved to output_video_path only after encoding succeeds.
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_named_file:
+        tmp_output_video_path = tmp_named_file.name
+
+    if log_level is not None:
+        logging.getLogger("libav").setLevel(log_level)
+
+    try:
+        with av.open(input_video_path, mode="r") as src:
+            try:
+                in_stream = src.streams.video[0]
+            except IndexError as e:
+                raise ValueError(f"No video stream in {input_video_path}") from e
+
+            fps = (
+                in_stream.base_rate
+            )  # We allow fractional fps though LeRobotDataset only supports integer fps
+            width = int(in_stream.width)
+            height = int(in_stream.height)
+
+            with av.open(
+                tmp_output_video_path,
+                mode="w",
+                options={
+                    "movflags": "faststart"
+                },  # faststart is to move the metadata to the beginning of the file to speed up loading
+            ) as dst:
+                out_stream = dst.add_stream(vcodec, fps, options=video_options)
+                out_stream.pix_fmt = pix_fmt
+                out_stream.width = width
+                out_stream.height = height
+
+                for frame in src.decode(in_stream):
+                    frame = frame.reformat(width=width, height=height, format=pix_fmt)
+                    packet = out_stream.encode(frame)
+                    if packet:
+                        dst.mux(packet)
+                # encode() without a frame: presumably flushes packets still buffered by the encoder (confirm).
+                packet = out_stream.encode()
+                if packet:
+                    dst.mux(packet)
+
+        shutil.move(tmp_output_video_path, output_video_path)
+    except Exception:
+        Path(tmp_output_video_path).unlink(missing_ok=True)
+        raise
+    finally:
+        if log_level is not None:
+            av.logging.restore_default_callback()
+
+    if not output_video_path.exists():
+        raise OSError(f"Video re-encoding did not work. File not found: {output_video_path}.")
+
+
def concatenate_video_files(
- input_video_paths: list[Path | str], output_video_path: Path, overwrite: bool = True
+ input_video_paths: list[Path | str],
+ output_video_path: Path,
+ overwrite: bool = True,
+ compatibility_check: bool = False,
):
"""
Concatenate multiple video files into a single video file using pyav.
@@ -514,6 +577,7 @@ def concatenate_video_files(
input_video_paths: Ordered list of input video file paths to concatenate.
output_video_path: Path to the output video file.
overwrite: Whether to overwrite the output video file if it already exists. Default is True.
+        compatibility_check: If True, require every input to match the first one's height, width, fps, codec and pix_fmt. Default is False.
Note:
- Creates a temporary directory for intermediate files that is cleaned up after use.
@@ -532,6 +596,22 @@ def concatenate_video_files(
if len(input_video_paths) == 0:
raise FileNotFoundError("No input video paths provided.")
+ # This check may be skipped at recording time as videos are encoded with the same encoder config.
+ if compatibility_check:
+ reference_video_info = get_video_info(input_video_paths[0])
+ for input_path in input_video_paths[1:]:
+ video_info = get_video_info(input_path)
+ if (
+ video_info["video.height"] != reference_video_info["video.height"]
+ or video_info["video.width"] != reference_video_info["video.width"]
+ or video_info["video.fps"] != reference_video_info["video.fps"]
+ or video_info["video.codec"] != reference_video_info["video.codec"]
+ or video_info["video.pix_fmt"] != reference_video_info["video.pix_fmt"]
+ ):
+ raise ValueError(
+ f"Input video {input_path} is not compatible with the reference video {input_video_paths[0]}."
+ )
+
# Create a temporary .ffconcat file to list the input video paths
with tempfile.NamedTemporaryFile(mode="w", suffix=".ffconcat", delete=False) as tmp_concatenate_file:
tmp_concatenate_file.write("ffconcat version 1.0\n")
@@ -598,26 +678,20 @@ class _CameraEncoderThread(threading.Thread):
fps: int,
vcodec: str,
pix_fmt: str,
- g: int | None,
- crf: int | None,
- preset: int | None,
+ codec_options: dict[str, str],
frame_queue: queue.Queue,
result_queue: queue.Queue,
stop_event: threading.Event,
- encoder_threads: int | None = None,
):
super().__init__(daemon=True)
self.video_path = video_path
self.fps = fps
self.vcodec = vcodec
self.pix_fmt = pix_fmt
- self.g = g
- self.crf = crf
- self.preset = preset
+ self.codec_options = codec_options
self.frame_queue = frame_queue
self.result_queue = result_queue
self.stop_event = stop_event
- self.encoder_threads = encoder_threads
def run(self) -> None:
from .compute_stats import RunningQuantileStats, auto_downsample_height_width
@@ -653,19 +727,9 @@ class _CameraEncoderThread(threading.Thread):
# Open container on first frame (to get width/height)
if container is None:
height, width = frame_data.shape[:2]
- video_options = _get_codec_options(self.vcodec, self.g, self.crf, self.preset)
- if self.encoder_threads is not None:
- if self.vcodec == "libsvtav1":
- lp_param = f"lp={self.encoder_threads}"
- if "svtav1-params" in video_options:
- video_options["svtav1-params"] += f":{lp_param}"
- else:
- video_options["svtav1-params"] = lp_param
- else:
- video_options["threads"] = str(self.encoder_threads)
Path(self.video_path).parent.mkdir(parents=True, exist_ok=True)
container = av.open(str(self.video_path), "w")
- output_stream = container.add_stream(self.vcodec, self.fps, options=video_options)
+ output_stream = container.add_stream(self.vcodec, self.fps, options=self.codec_options)
output_stream.pix_fmt = self.pix_fmt
output_stream.width = width
output_stream.height = height
@@ -731,22 +795,24 @@ class StreamingVideoEncoder:
def __init__(
self,
fps: int,
- vcodec: str = "libsvtav1",
- pix_fmt: str = "yuv420p",
- g: int | None = 2,
- crf: int | None = 30,
- preset: int | None = None,
+ camera_encoder: VideoEncoderConfig | None = None,
queue_maxsize: int = 30,
encoder_threads: int | None = None,
):
+ """
+ Args:
+ fps: Frames per second for the output videos.
+ camera_encoder: Video encoder settings applied to all cameras.
+ When ``None``, :func:`camera_encoder_defaults` is used.
+ encoder_threads: Number of encoder threads (global setting).
+ ``None`` lets the codec decide.
+ queue_maxsize: Max frames to buffer per camera before
+ back-pressure drops frames.
+ """
self.fps = fps
- self.vcodec = resolve_vcodec(vcodec)
- self.pix_fmt = pix_fmt
- self.g = g
- self.crf = crf
- self.preset = preset
+ self._camera_encoder = camera_encoder or camera_encoder_defaults()
+ self._encoder_threads = encoder_threads
self.queue_maxsize = queue_maxsize
- self.encoder_threads = encoder_threads
self._frame_queues: dict[str, queue.Queue] = {}
self._result_queues: dict[str, queue.Queue] = {}
@@ -777,18 +843,17 @@ class StreamingVideoEncoder:
temp_video_dir = Path(tempfile.mkdtemp(dir=temp_dir))
video_path = temp_video_dir / f"{video_key.replace('/', '_')}_streaming.mp4"
+ vcodec = self._camera_encoder.vcodec
+ codec_options = self._camera_encoder.get_codec_options(self._encoder_threads, as_strings=True)
encoder_thread = _CameraEncoderThread(
video_path=video_path,
fps=self.fps,
- vcodec=self.vcodec,
- pix_fmt=self.pix_fmt,
- g=self.g,
- crf=self.crf,
- preset=self.preset,
+ vcodec=vcodec,
+ pix_fmt=self._camera_encoder.pix_fmt,
+ codec_options=codec_options,
frame_queue=frame_queue,
result_queue=result_queue,
stop_event=stop_event,
- encoder_threads=self.encoder_threads,
)
encoder_thread.start()
@@ -993,8 +1058,18 @@ def get_audio_info(video_path: Path | str) -> dict:
return audio_info
-def get_video_info(video_path: Path | str) -> dict:
- # Set logging level
+def get_video_info(
+ video_path: Path | str,
+ camera_encoder: VideoEncoderConfig | None = None,
+) -> dict:
+ """Build the ``video.*`` / ``audio.*`` info dict persisted in ``info.json``.
+
+ Args:
+ video_path: Path to the encoded video file to probe.
+ camera_encoder: If provided, record the exact encoder settings used to encode this
+ video. Stream-derived values take precedence — encoder fields are only written for keys
+ not already populated from the video file itself.
+ """
logging.getLogger("libav").setLevel(av.logging.WARNING)
# Getting video stream information
@@ -1025,6 +1100,14 @@ def get_video_info(video_path: Path | str) -> dict:
# Adding audio stream information
video_info.update(**get_audio_info(video_path))
+ # Add additional encoder configuration if provided
+ if camera_encoder is not None:
+ for field_name, field_value in asdict(camera_encoder).items():
+ # vcodec is already populated from the video stream
+ if field_name == "vcodec":
+ continue
+ video_info.setdefault(f"video.{field_name}", field_value)
+
return video_info
diff --git a/src/lerobot/motors/robstride/robstride.py b/src/lerobot/motors/robstride/robstride.py
index ecde01e9a..359fc9385 100644
--- a/src/lerobot/motors/robstride/robstride.py
+++ b/src/lerobot/motors/robstride/robstride.py
@@ -43,6 +43,7 @@ from .tables import (
CAN_CMD_SET_ZERO,
DEFAULT_BAUDRATE,
DEFAULT_TIMEOUT_MS,
+ HANDSHAKE_TIMEOUT_S,
MODEL_RESOLUTION,
MOTOR_LIMIT_PARAMS,
NORMALIZED_DATA,
@@ -215,14 +216,16 @@ class RobstrideMotorsBus(MotorsBusBase):
self._is_connected = False
raise ConnectionError(f"Failed to connect to CAN bus: {e}") from e
- def _query_status_via_clear_fault(self, motor: NameOrID) -> tuple[bool, can.Message | None]:
+ def _query_status_via_clear_fault(
+ self, motor: NameOrID, timeout: float = RUNNING_TIMEOUT
+ ) -> tuple[bool, can.Message | None]:
motor_name = self._get_motor_name(motor)
motor_id = self._get_motor_id(motor_name)
recv_id = self._get_motor_recv_id(motor_name)
data = [0xFF] * 7 + [CAN_CMD_CLEAR_FAULT]
msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
self._bus().send(msg)
- return self._recv_status_via_clear_fault(expected_recv_id=recv_id)
+ return self._recv_status_via_clear_fault(expected_recv_id=recv_id, timeout=timeout)
def _recv_status_via_clear_fault(
self, expected_recv_id: int | None = None, timeout: float = RUNNING_TIMEOUT
@@ -280,7 +283,7 @@ class RobstrideMotorsBus(MotorsBusBase):
faulted_motors = []
for motor_name in self.motors:
- has_fault, msg = self._query_status_via_clear_fault(motor_name)
+ has_fault, msg = self._query_status_via_clear_fault(motor_name, timeout=HANDSHAKE_TIMEOUT_S)
if msg is None:
missing_motors.append(motor_name)
elif has_fault:
@@ -505,6 +508,87 @@ class RobstrideMotorsBus(MotorsBusBase):
return responses
+    def _recv_all_messages_until_quiet(
+        self,
+        *,
+        timeout: float = RUNNING_TIMEOUT,
+        max_messages: int = 4096,
+    ) -> list[can.Message]:
+        """Collect CAN frames until one ``recv()`` poll returns nothing.
+
+        Args:
+            timeout: Timeout passed to every ``recv()`` call, clamped to ``>= 0``.
+            max_messages: Cap on the number of frames collected, clamped to ``>= 1``.
+
+        Returns:
+            Frames in arrival order. A ``can.CanError``/``OSError`` is logged at
+            debug level and ends collection with whatever was received so far.
+        """
+        limit = max(1, max_messages)
+        poll_timeout = max(0.0, timeout)
+        frames: list[can.Message] = []
+        try:
+            while len(frames) < limit:
+                frame = self._bus().recv(timeout=poll_timeout)
+                if frame is None:
+                    break
+                frames.append(frame)
+        except (can.CanError, OSError) as e:
+            logger.debug(f"Error draining CAN RX queue on {self.port}: {e}")
+        return frames
+
+    def _process_feedback_messages(self, messages: list[can.Message]) -> set[int]:
+        """Route received feedback frames to their motors via ``_process_response``.
+
+        The first payload byte is looked up in ``self._recv_id_to_motor``; empty
+        frames and frames with an unknown id are logged at debug level and skipped.
+
+        Returns:
+            The recv_ids (first payload byte) that were matched to a motor.
+        """
+        handled: set[int] = set()
+        for msg in messages:
+            if len(msg.data) < 1:
+                logger.debug(
+                    f"Dropping short CAN frame on {self.port} "
+                    f"(arb=0x{int(msg.arbitration_id):02X}, data={bytes(msg.data).hex()})"
+                )
+                continue
+
+            # Byte 0 of the payload is the key into the recv-id -> motor map.
+            recv_id = int(msg.data[0])
+            if (owner := self._recv_id_to_motor.get(recv_id)) is None:
+                logger.debug(
+                    f"Unmapped CAN frame on {self.port} "
+                    f"(arb=0x{int(msg.arbitration_id):02X}, recv_id=0x{recv_id:02X}, data={bytes(msg.data).hex()})"
+                )
+                continue
+            self._process_response(owner, msg)
+            handled.add(recv_id)
+        return handled
+
+    def flush_rx_queue(self, poll_timeout_s: float = 0.0005, max_messages: int = 4096) -> int:
+        """Drain and discard pending RX frames from the CAN interface.
+
+        Higher-level controllers use this to drop stale feedback before issuing a
+        fresh read cycle, so that subsequent state reads are based on the most
+        recent replies. It should also be called once when a controller instance
+        is created/connected, to clear residual frames left on the interface by
+        previous sessions.
+
+        Args:
+            poll_timeout_s: Timeout for each ``recv()`` poll; draining stops at the
+                first poll that returns nothing.
+            max_messages: Safety cap on the number of frames discarded.
+
+        Returns:
+            Number of frames that were drained.
+        """
+        # Same "read until quiet" loop as the feedback path; reuse it so clamping and
+        # CAN error handling live in one place. The frames themselves are discarded.
+        stale = self._recv_all_messages_until_quiet(timeout=poll_timeout_s, max_messages=max_messages)
+        return len(stale)
+
def _speed_control(
self,
motor: NameOrID,
@@ -644,11 +728,14 @@ class RobstrideMotorsBus(MotorsBusBase):
msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False)
self._bus().send(msg)
recv_id_to_motor[self._get_motor_recv_id(motor)] = motor_name
+ # Read every feedback frame until RX goes quiet, then decode all of them.
+ # This avoids dropping useful frames when responses from different motors interleave.
+ messages = self._recv_all_messages_until_quiet()
+ processed_recv_ids = self._process_feedback_messages(messages)
- responses = self._recv_all_responses(list(recv_id_to_motor.keys()), timeout=RUNNING_TIMEOUT)
for recv_id, motor_name in recv_id_to_motor.items():
- if msg := responses.get(recv_id):
- self._process_response(motor_name, msg)
+ if recv_id not in processed_recv_ids:
+ logger.warning(f"Packet drop: {motor_name} (ID: 0x{recv_id:02X}). Using last known state.")
def _float_to_uint(self, x: float, x_min: float, x_max: float, bits: int) -> int:
"""Convert float to unsigned integer for CAN transmission."""
@@ -711,7 +798,10 @@ class RobstrideMotorsBus(MotorsBusBase):
try:
self._decode_motor_state(msg.data)
except Exception as e:
- logger.warning(f"Failed to decode response from {motor}: {e}")
+ logger.warning(
+ f"Failed to decode response from {motor} "
+ f"(arb=0x{int(msg.arbitration_id):02X}, data={bytes(msg.data).hex()}): {e}"
+ )
def _get_cached_value(self, motor: str, data_name: str) -> Value:
"""Retrieve a specific value from the state cache."""
@@ -848,20 +938,12 @@ class RobstrideMotorsBus(MotorsBusBase):
self._bus().send(msg)
updated_motors.append(motor)
- expected_recv_ids = [self._get_motor_recv_id(motor) for motor in updated_motors]
- responses = self._recv_all_responses(expected_recv_ids, timeout=RUNNING_TIMEOUT)
-
- for response in responses.values():
- payload_motor_name = self._recv_id_to_motor.get(response.data[0])
- if payload_motor_name is not None:
- self._process_response(payload_motor_name, response)
- else:
- # Fallback: still attempt to decode based on payload byte0 mapping.
- self._decode_motor_state(response.data)
+ messages = self._recv_all_messages_until_quiet()
+ processed_recv_ids = self._process_feedback_messages(messages)
for motor in updated_motors:
recv_id = self._get_motor_recv_id(motor)
- if recv_id not in responses:
+ if recv_id not in processed_recv_ids:
logger.warning(f"Packet drop: {motor} (ID: 0x{recv_id:02X}). Using last known state.")
def read_calibration(self) -> dict[str, MotorCalibration]:
diff --git a/src/lerobot/motors/robstride/tables.py b/src/lerobot/motors/robstride/tables.py
index 2fc1a97b0..06b90df3a 100644
--- a/src/lerobot/motors/robstride/tables.py
+++ b/src/lerobot/motors/robstride/tables.py
@@ -114,7 +114,8 @@ CAN_CMD_SAVE_PARAM = 0xAA
CAN_PARAM_ID = 0x7FF
-RUNNING_TIMEOUT = 0.001
+RUNNING_TIMEOUT = 0.003
+HANDSHAKE_TIMEOUT_S = 0.05
PARAM_TIMEOUT = 0.01
STATE_CACHE_TTL_S = 0.02
diff --git a/src/lerobot/policies/eo1/modeling_eo1.py b/src/lerobot/policies/eo1/modeling_eo1.py
index 27d609ec1..1c5860de5 100644
--- a/src/lerobot/policies/eo1/modeling_eo1.py
+++ b/src/lerobot/policies/eo1/modeling_eo1.py
@@ -28,11 +28,12 @@ import torch.nn.functional as F # noqa: N812
import torch.utils.checkpoint
from torch import Tensor
-from lerobot.policies.eo1.configuration_eo1 import EO1Config
-from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.utils.constants import ACTION, OBS_STATE
from lerobot.utils.import_utils import _transformers_available, require_package
+from ..pretrained import PreTrainedPolicy
+from .configuration_eo1 import EO1Config
+
if TYPE_CHECKING or _transformers_available:
from transformers.activations import ACT2FN
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
diff --git a/src/lerobot/policies/eo1/processor_eo1.py b/src/lerobot/policies/eo1/processor_eo1.py
index 2d7bb48ae..b1f32756a 100644
--- a/src/lerobot/policies/eo1/processor_eo1.py
+++ b/src/lerobot/policies/eo1/processor_eo1.py
@@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Any
import torch
from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature
-from lerobot.policies.eo1.configuration_eo1 import EO1Config
from lerobot.processor import (
AddBatchDimensionProcessorStep,
ComplementaryDataProcessorStep,
@@ -44,6 +43,8 @@ from lerobot.utils.constants import (
)
from lerobot.utils.import_utils import _transformers_available, require_package
+from .configuration_eo1 import EO1Config
+
if TYPE_CHECKING or _transformers_available:
from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
else:
diff --git a/src/lerobot/policies/pi05/modeling_pi05.py b/src/lerobot/policies/pi05/modeling_pi05.py
index bb206d608..bdaf01f2c 100644
--- a/src/lerobot/policies/pi05/modeling_pi05.py
+++ b/src/lerobot/policies/pi05/modeling_pi05.py
@@ -441,13 +441,13 @@ class PaliGemmaWithExpertModel(
if image.dtype != torch.float32:
image = image.to(torch.float32)
image_outputs = self.paligemma.model.get_image_features(image)
- features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
+ features = image_outputs.pooler_output
if features.dtype != out_dtype:
features = features.to(out_dtype)
return features
def embed_language_tokens(self, tokens: torch.Tensor):
- return self.paligemma.model.language_model.embed_tokens(tokens)
+ return self.paligemma.model.language_model.get_input_embeddings()(tokens)
def forward(
self,
@@ -662,8 +662,7 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
# Process language tokens
def lang_embed_func(tokens):
lang_emb = self.paligemma_with_expert.embed_language_tokens(tokens)
- lang_emb_dim = lang_emb.shape[-1]
- return lang_emb * math.sqrt(lang_emb_dim)
+ return lang_emb
lang_emb = self._apply_checkpoint(lang_embed_func, tokens)
embs.append(lang_emb)
diff --git a/src/lerobot/processor/__init__.py b/src/lerobot/processor/__init__.py
index 3688a4b8c..fe35af4b4 100644
--- a/src/lerobot/processor/__init__.py
+++ b/src/lerobot/processor/__init__.py
@@ -95,6 +95,13 @@ from .relative_action_processor import (
from .rename_processor import RenameObservationsProcessorStep, rename_stats
from .tokenizer_processor import ActionTokenizerProcessorStep, TokenizerProcessorStep
+# RenderMessagesStep is intentionally NOT re-exported here: it pulls in
+# `lerobot.datasets.language`, which requires the `[dataset]` extra
+# (`datasets`, `pyarrow`). Importing it from the processor package would
+# break every base-install consumer of `lerobot.processor`. Users that
+# need it import directly:
+# from lerobot.processor.render_messages_processor import RenderMessagesStep
+
__all__ = [
"ActionProcessorStep",
"AddTeleopActionAsComplimentaryDataStep",
diff --git a/src/lerobot/processor/batch_processor.py b/src/lerobot/processor/batch_processor.py
index eb7db255a..669c68a0a 100644
--- a/src/lerobot/processor/batch_processor.py
+++ b/src/lerobot/processor/batch_processor.py
@@ -174,6 +174,24 @@ class AddBatchDimensionComplementaryDataStep(ComplementaryDataProcessorStep):
task_index_value = complementary_data["task_index"]
if isinstance(task_index_value, Tensor) and task_index_value.dim() == 0:
complementary_data["task_index"] = task_index_value.unsqueeze(0)
+
+ complementary_data.pop("language_persistent", None)
+ complementary_data.pop("language_events", None)
+
+ if "messages" in complementary_data:
+ messages = complementary_data["messages"]
+ if isinstance(messages, list) and (not messages or isinstance(messages[0], dict)):
+ complementary_data["messages"] = [messages]
+
+ if "message_streams" in complementary_data:
+ streams = complementary_data["message_streams"]
+ if isinstance(streams, list) and (not streams or isinstance(streams[0], str)):
+ complementary_data["message_streams"] = [streams]
+
+ if "target_message_indices" in complementary_data:
+ indices = complementary_data["target_message_indices"]
+ if isinstance(indices, list) and (not indices or isinstance(indices[0], int)):
+ complementary_data["target_message_indices"] = [indices]
return complementary_data
def transform_features(
diff --git a/src/lerobot/processor/converters.py b/src/lerobot/processor/converters.py
index ffdf0098c..faa4d5cd9 100644
--- a/src/lerobot/processor/converters.py
+++ b/src/lerobot/processor/converters.py
@@ -153,26 +153,30 @@ def from_tensor_to_numpy(x: torch.Tensor | Any) -> np.ndarray | float | int | An
return x
+_COMPLEMENTARY_KEYS = (
+ "task",
+ "index",
+ "task_index",
+ "episode_index",
+ "timestamp",
+ "language_persistent",
+ "language_events",
+ "messages",
+ "message_streams",
+ "target_message_indices",
+)
+
+
def _extract_complementary_data(batch: dict[str, Any]) -> dict[str, Any]:
- """
- Extract complementary data from a batch dictionary.
+ """Extract complementary data from a batch dictionary.
- This includes padding flags, task description, and indices.
-
- Args:
- batch: The batch dictionary.
-
- Returns:
- A dictionary with the extracted complementary data.
+ Includes padding flags (any key containing ``_is_pad``) plus the fixed
+ set of metadata / language keys defined in ``_COMPLEMENTARY_KEYS`` —
+ each only when present in ``batch``.
"""
pad_keys = {k: v for k, v in batch.items() if "_is_pad" in k}
- task_key = {"task": batch["task"]} if "task" in batch else {}
- subtask_key = {"subtask": batch["subtask"]} if "subtask" in batch else {}
- index_key = {"index": batch["index"]} if "index" in batch else {}
- task_index_key = {"task_index": batch["task_index"]} if "task_index" in batch else {}
- episode_index_key = {"episode_index": batch["episode_index"]} if "episode_index" in batch else {}
-
- return {**pad_keys, **task_key, **subtask_key, **index_key, **task_index_key, **episode_index_key}
+ extras = {k: batch[k] for k in _COMPLEMENTARY_KEYS if k in batch}
+ return {**pad_keys, **extras}
def create_transition(
diff --git a/src/lerobot/processor/render_messages_processor.py b/src/lerobot/processor/render_messages_processor.py
new file mode 100644
index 000000000..140592f0e
--- /dev/null
+++ b/src/lerobot/processor/render_messages_processor.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from lerobot.configs import PipelineFeatureType, PolicyFeature
+from lerobot.configs.recipe import TrainingRecipe
+from lerobot.datasets.language import LANGUAGE_EVENTS, LANGUAGE_PERSISTENT
+from lerobot.datasets.language_render import render_sample
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.utils import unwrap_scalar
+
+from .pipeline import ProcessorStep, ProcessorStepRegistry
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="render_messages_processor")
+class RenderMessagesStep(ProcessorStep):
+    """Render a sample's raw language columns through a training recipe.
+
+    The step looks up ``LANGUAGE_PERSISTENT`` and ``LANGUAGE_EVENTS`` in the
+    transition's complementary data, passes them to ``render_sample`` together
+    with the sample timestamp, and swaps the raw columns for whatever mapping
+    ``render_sample`` returns.
+    """
+
+    recipe: TrainingRecipe
+    dataset_ctx: Any | None = None
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition | None:
+        """Return the rendered transition, or ``None`` when rendering yields nothing."""
+        extras = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
+        persistent = extras.get(LANGUAGE_PERSISTENT) or []
+        events = extras.get(LANGUAGE_EVENTS) or []
+
+        if not (persistent or events):
+            # No language annotations on this sample: leave the transition untouched.
+            return transition
+
+        timestamp = extras.get("timestamp")
+        if timestamp is None:
+            raise KeyError("RenderMessagesStep requires sample timestamp in complementary data.")
+
+        rendered = render_sample(
+            recipe=self.recipe,
+            persistent=persistent,
+            events=events,
+            t=unwrap_scalar(timestamp),
+            sample_idx=int(unwrap_scalar(extras.get("index", 0))),
+            task=extras.get("task"),
+            dataset_ctx=self.dataset_ctx,
+        )
+        if rendered is None:
+            return None
+
+        # Work on copies so the incoming transition and its dict are not mutated.
+        raw_columns = (LANGUAGE_PERSISTENT, LANGUAGE_EVENTS)
+        merged = {key: value for key, value in extras.items() if key not in raw_columns}
+        merged.update(rendered)
+        result = transition.copy()
+        result[TransitionKey.COMPLEMENTARY_DATA] = merged
+        return result
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """Features are returned as-is: this step only edits complementary data."""
+        return features
diff --git a/src/lerobot/rewards/classifier/modeling_classifier.py b/src/lerobot/rewards/classifier/modeling_classifier.py
index 1d8057135..ca02b532f 100644
--- a/src/lerobot/rewards/classifier/modeling_classifier.py
+++ b/src/lerobot/rewards/classifier/modeling_classifier.py
@@ -17,10 +17,11 @@ import logging
import torch
from torch import Tensor, nn
-from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig
-from lerobot.rewards.pretrained import PreTrainedRewardModel
from lerobot.utils.constants import OBS_IMAGE, REWARD
+from ..pretrained import PreTrainedRewardModel
+from .configuration_classifier import RewardClassifierConfig
+
class ClassifierOutput:
"""Wrapper for classifier outputs with additional metadata."""
diff --git a/src/lerobot/rewards/classifier/processor_classifier.py b/src/lerobot/rewards/classifier/processor_classifier.py
index 056d7e91b..a5f609d0c 100644
--- a/src/lerobot/rewards/classifier/processor_classifier.py
+++ b/src/lerobot/rewards/classifier/processor_classifier.py
@@ -25,7 +25,8 @@ from lerobot.processor import (
policy_action_to_transition,
transition_to_policy_action,
)
-from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig
+
+from .configuration_classifier import RewardClassifierConfig
def make_classifier_processor(
diff --git a/src/lerobot/rewards/factory.py b/src/lerobot/rewards/factory.py
index f6716f3fb..c173f44a5 100644
--- a/src/lerobot/rewards/factory.py
+++ b/src/lerobot/rewards/factory.py
@@ -22,9 +22,10 @@ import torch
from lerobot.configs.rewards import RewardModelConfig
from lerobot.processor import PolicyAction, PolicyProcessorPipeline
-from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig
-from lerobot.rewards.pretrained import PreTrainedRewardModel
-from lerobot.rewards.sarm.configuration_sarm import SARMConfig
+
+from .classifier.configuration_classifier import RewardClassifierConfig
+from .pretrained import PreTrainedRewardModel
+from .sarm.configuration_sarm import SARMConfig
def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
diff --git a/src/lerobot/rewards/sarm/compute_rabc_weights.py b/src/lerobot/rewards/sarm/compute_rabc_weights.py
index b1bf2e1f5..bdbb0d297 100644
--- a/src/lerobot/rewards/sarm/compute_rabc_weights.py
+++ b/src/lerobot/rewards/sarm/compute_rabc_weights.py
@@ -58,9 +58,10 @@ import torch
from tqdm import tqdm
from lerobot.datasets import LeRobotDataset
-from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel
-from lerobot.rewards.sarm.processor_sarm import make_sarm_pre_post_processors
-from lerobot.rewards.sarm.sarm_utils import normalize_stage_tau
+
+from .modeling_sarm import SARMRewardModel
+from .processor_sarm import make_sarm_pre_post_processors
+from .sarm_utils import normalize_stage_tau
def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None:
diff --git a/src/lerobot/rewards/sarm/modeling_sarm.py b/src/lerobot/rewards/sarm/modeling_sarm.py
index 365f519b2..5ebd42d30 100644
--- a/src/lerobot/rewards/sarm/modeling_sarm.py
+++ b/src/lerobot/rewards/sarm/modeling_sarm.py
@@ -32,13 +32,14 @@ import torch.nn as nn
import torch.nn.functional as F # noqa: N812
from torch import Tensor
-from lerobot.rewards.pretrained import PreTrainedRewardModel
-from lerobot.rewards.sarm.configuration_sarm import SARMConfig
-from lerobot.rewards.sarm.sarm_utils import (
+from lerobot.utils.constants import OBS_STR
+
+from ..pretrained import PreTrainedRewardModel
+from .configuration_sarm import SARMConfig
+from .sarm_utils import (
normalize_stage_tau,
pad_state_to_max_dim,
)
-from lerobot.utils.constants import OBS_STR
class StageTransformer(nn.Module):
diff --git a/src/lerobot/rewards/sarm/processor_sarm.py b/src/lerobot/rewards/sarm/processor_sarm.py
index eaa5f66f5..37db374d4 100644
--- a/src/lerobot/rewards/sarm/processor_sarm.py
+++ b/src/lerobot/rewards/sarm/processor_sarm.py
@@ -58,15 +58,16 @@ from lerobot.processor import (
policy_action_to_transition,
transition_to_policy_action,
)
-from lerobot.rewards.sarm.configuration_sarm import SARMConfig
-from lerobot.rewards.sarm.sarm_utils import (
+from lerobot.types import EnvTransition, PolicyAction, TransitionKey
+from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
+
+from .configuration_sarm import SARMConfig
+from .sarm_utils import (
apply_rewind_augmentation,
compute_absolute_indices,
find_stage_and_tau,
pad_state_to_max_dim,
)
-from lerobot.types import EnvTransition, PolicyAction, TransitionKey
-from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
class SARMEncodingProcessorStep(ProcessorStep):
diff --git a/src/lerobot/robots/bi_rebot_b601_follower/__init__.py b/src/lerobot/robots/bi_rebot_b601_follower/__init__.py
new file mode 100644
index 000000000..8ef454f45
--- /dev/null
+++ b/src/lerobot/robots/bi_rebot_b601_follower/__init__.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .bi_rebot_b601_follower import BiRebotB601Follower
+from .config_bi_rebot_b601_follower import BiRebotB601FollowerConfig
+
+__all__ = ["BiRebotB601Follower", "BiRebotB601FollowerConfig"]
diff --git a/src/lerobot/robots/bi_rebot_b601_follower/bi_rebot_b601_follower.py b/src/lerobot/robots/bi_rebot_b601_follower/bi_rebot_b601_follower.py
new file mode 100644
index 000000000..bd19f1b62
--- /dev/null
+++ b/src/lerobot/robots/bi_rebot_b601_follower/bi_rebot_b601_follower.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from functools import cached_property
+
+from lerobot.types import RobotAction, RobotObservation
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
+
+from ..rebot_b601_follower import RebotB601Follower, RebotB601FollowerRobotConfig
+from ..robot import Robot
+from .config_bi_rebot_b601_follower import BiRebotB601FollowerConfig
+
+logger = logging.getLogger(__name__)
+
+
+class BiRebotB601Follower(Robot):
+    """Bimanual Seeed Studio reBot B601-DM follower.
+
+    Composes two single-arm :class:`RebotB601Follower` instances. Observation and
+    action keys of each arm are namespaced with a ``left_`` / ``right_`` prefix.
+    """
+
+    config_class = BiRebotB601FollowerConfig
+    name = "bi_rebot_b601_follower"
+
+    def __init__(self, config: BiRebotB601FollowerConfig):
+        super().__init__(config)
+        self.config = config
+
+        left_arm_config = self._make_arm_config(config, config.left_arm_config, "left")
+        right_arm_config = self._make_arm_config(config, config.right_arm_config, "right")
+        self.left_arm = RebotB601Follower(left_arm_config)
+        self.right_arm = RebotB601Follower(right_arm_config)
+
+        # Only for compatibility with parts of the codebase that expect `robot.cameras`.
+        self.cameras = {**self.left_arm.cameras, **self.right_arm.cameras}
+
+    @staticmethod
+    def _make_arm_config(
+        config: BiRebotB601FollowerConfig, arm_config, side: str
+    ) -> RebotB601FollowerRobotConfig:
+        """Build one arm's robot config; ``side`` ("left"/"right") suffixes the shared ``config.id``."""
+        return RebotB601FollowerRobotConfig(
+            id=f"{config.id}_{side}" if config.id else None,
+            calibration_dir=config.calibration_dir,
+            port=arm_config.port,
+            can_adapter=arm_config.can_adapter,
+            dm_serial_baud=arm_config.dm_serial_baud,
+            disable_torque_on_disconnect=arm_config.disable_torque_on_disconnect,
+            max_relative_target=arm_config.max_relative_target,
+            cameras=arm_config.cameras,
+            motor_can_ids=arm_config.motor_can_ids,
+            pos_vel_velocity=arm_config.pos_vel_velocity,
+            gripper_torque_ratio=arm_config.gripper_torque_ratio,
+            joint_limits=arm_config.joint_limits,
+        )
+
+    @property
+    def _motors_ft(self) -> dict[str, type]:
+        return {
+            **{f"left_{k}": v for k, v in self.left_arm._motors_ft.items()},
+            **{f"right_{k}": v for k, v in self.right_arm._motors_ft.items()},
+        }
+
+    @property
+    def _cameras_ft(self) -> dict[str, tuple]:
+        return {
+            **{f"left_{k}": v for k, v in self.left_arm._cameras_ft.items()},
+            **{f"right_{k}": v for k, v in self.right_arm._cameras_ft.items()},
+        }
+
+    @cached_property
+    def observation_features(self) -> dict[str, type | tuple]:
+        return {**self._motors_ft, **self._cameras_ft}
+
+    @cached_property
+    def action_features(self) -> dict[str, type]:
+        return self._motors_ft
+
+    @property
+    def is_connected(self) -> bool:
+        return self.left_arm.is_connected and self.right_arm.is_connected
+
+    @check_if_already_connected
+    def connect(self, calibrate: bool = True) -> None:
+        """Connect the left arm, then the right; roll the left arm back if the right one fails."""
+        self.left_arm.connect(calibrate)
+        try:
+            self.right_arm.connect(calibrate)
+        except Exception:
+            # A half-connected robot reports is_connected=False, so disconnect() could not release it.
+            try:
+                self.left_arm.disconnect()
+            except Exception as cleanup_error:
+                logger.warning(f"Failed to roll back left arm after right arm connect error: {cleanup_error}")
+            raise
+
+    @property
+    def is_calibrated(self) -> bool:
+        return self.left_arm.is_calibrated and self.right_arm.is_calibrated
+
+    def calibrate(self) -> None:
+        self.left_arm.calibrate()
+        self.right_arm.calibrate()
+
+    def configure(self) -> None:
+        self.left_arm.configure()
+        self.right_arm.configure()
+
+    @check_if_not_connected
+    def get_observation(self) -> RobotObservation:
+        obs_dict = {}
+        obs_dict.update({f"left_{k}": v for k, v in self.left_arm.get_observation().items()})
+        obs_dict.update({f"right_{k}": v for k, v in self.right_arm.get_observation().items()})
+        return obs_dict
+
+    @check_if_not_connected
+    def send_action(self, action: RobotAction) -> RobotAction:
+        left_action = {k.removeprefix("left_"): v for k, v in action.items() if k.startswith("left_")}
+        right_action = {k.removeprefix("right_"): v for k, v in action.items() if k.startswith("right_")}
+
+        sent_action_left = self.left_arm.send_action(left_action)
+        sent_action_right = self.right_arm.send_action(right_action)
+
+        return {
+            **{f"left_{k}": v for k, v in sent_action_left.items()},
+            **{f"right_{k}": v for k, v in sent_action_right.items()},
+        }
+
+    @check_if_not_connected
+    def disconnect(self) -> None:
+        """Disconnect both arms; the right arm is still released if the left one raises."""
+        try:
+            self.left_arm.disconnect()
+        finally:
+            self.right_arm.disconnect()
diff --git a/src/lerobot/robots/bi_rebot_b601_follower/config_bi_rebot_b601_follower.py b/src/lerobot/robots/bi_rebot_b601_follower/config_bi_rebot_b601_follower.py
new file mode 100644
index 000000000..079b7a355
--- /dev/null
+++ b/src/lerobot/robots/bi_rebot_b601_follower/config_bi_rebot_b601_follower.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from ..config import RobotConfig
+from ..rebot_b601_follower import RebotB601FollowerConfig
+
+
+@RobotConfig.register_subclass("bi_rebot_b601_follower")
+@dataclass
+class BiRebotB601FollowerConfig(RobotConfig):
+ """Configuration for the bimanual reBot B601-DM follower robot: one single-arm config per side."""
+
+ left_arm_config: RebotB601FollowerConfig
+ right_arm_config: RebotB601FollowerConfig
diff --git a/src/lerobot/robots/rebot_b601_follower/__init__.py b/src/lerobot/robots/rebot_b601_follower/__init__.py
new file mode 100644
index 000000000..43fcbb769
--- /dev/null
+++ b/src/lerobot/robots/rebot_b601_follower/__init__.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .config_rebot_b601_follower import RebotB601FollowerConfig, RebotB601FollowerRobotConfig
+from .rebot_b601_follower import RebotB601Follower
+
+__all__ = ["RebotB601Follower", "RebotB601FollowerConfig", "RebotB601FollowerRobotConfig"]
diff --git a/src/lerobot/robots/rebot_b601_follower/config_rebot_b601_follower.py b/src/lerobot/robots/rebot_b601_follower/config_rebot_b601_follower.py
new file mode 100644
index 000000000..096548afb
--- /dev/null
+++ b/src/lerobot/robots/rebot_b601_follower/config_rebot_b601_follower.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from lerobot.cameras import CameraConfig
+
+from ..config import RobotConfig
+
+
+@dataclass
+class RebotB601FollowerConfig:
+ """Base configuration class for the Seeed Studio reBot B601-DM follower arm.
+
+ The B601-DM is a 6-DOF arm plus gripper driven by Damiao CAN motors. Motor
+ communication goes through the ``motorbridge`` package.
+
+ This class is a plain dataclass (not registered with ``RobotConfig``); the
+ registered variant is ``RebotB601FollowerRobotConfig`` defined below in this module.
+ """
+
+ # Communication port. For ``can_adapter="damiao"`` this is the Damiao serial
+ # bridge device (e.g. "/dev/ttyACM0"); for ``can_adapter="socketcan"`` it is
+ # the CAN channel name (e.g. "can0").
+ port: str
+
+ # CAN adapter type:
+ # "damiao" - Damiao dedicated serial bridge (default)
+ # "socketcan" - SocketCAN based adapters (PCAN, slcan, embedded controllers, ...)
+ # Any other value makes RebotB601Follower.connect() raise ValueError.
+ can_adapter: str = "damiao"
+
+ # Baud rate for the Damiao serial bridge (only used when can_adapter="damiao").
+ dm_serial_baud: int = 921600
+
+ # When true, RebotB601Follower.disconnect() calls ``disable()`` on the motors
+ # before closing them.
+ disable_torque_on_disconnect: bool = True
+
+ # `max_relative_target` limits the magnitude of the relative positional target
+ # vector for safety purposes (in degrees). Set to a positive scalar to apply the
+ # same value to all motors, or to a dict mapping motor names to per-motor values.
+ # ``None`` (the default) skips the relative-target cap in send_action().
+ max_relative_target: float | dict[str, float] | None = None
+
+ # cameras
+ cameras: dict[str, CameraConfig] = field(default_factory=dict)
+
+ # Maps motor names to their (send_can_id, recv_can_id) pair.
+ # The key order of this dict is the "motor order": the follower builds its
+ # ``motor_names`` list from these keys and uses the position in that list to
+ # index ``pos_vel_velocity``. Every key must also exist in the follower's
+ # MOTOR_MODELS table, which is looked up by motor name in connect().
+ motor_can_ids: dict[str, tuple[int, int]] = field(
+ default_factory=lambda: {
+ "shoulder_pan": (0x01, 0x11),
+ "shoulder_lift": (0x02, 0x12),
+ "elbow_flex": (0x03, 0x13),
+ "wrist_flex": (0x04, 0x14),
+ "wrist_yaw": (0x05, 0x15),
+ "wrist_roll": (0x06, 0x16),
+ "gripper": (0x07, 0x17),
+ }
+ )
+
+ # Target velocity for joints running in POS_VEL mode, in degrees/s. A scalar is
+ # applied to every joint; a list provides one value per joint (in motor order).
+ # The gripper's entry is also passed as the velocity argument of its FORCE_POS
+ # command in send_action().
+ # NOTE(review): a list shorter than ``motor_can_ids`` would raise IndexError in
+ # send_action(); nothing visible here validates the length.
+ pos_vel_velocity: float | list[float] = field(default_factory=lambda: [150.0] * 7)
+
+ # Torque/current ratio for the gripper's FORCE_POS mode, in range [0, 1].
+ gripper_torque_ratio: float = 0.1
+
+ # Soft joint limits (degrees). These are clipped against on every action.
+ # calibrate() also stores them, truncated with int(), as range_min/range_max of
+ # each motor's MotorCalibration entry.
+ joint_limits: dict[str, tuple[float, float]] = field(
+ default_factory=lambda: {
+ "shoulder_pan": (-145.0, 145.0),
+ "shoulder_lift": (-170.0, 1.0),
+ "elbow_flex": (-200.0, 1.0),
+ "wrist_flex": (-80.0, 90.0),
+ "wrist_yaw": (-90.0, 90.0),
+ "wrist_roll": (-90.0, 90.0),
+ "gripper": (-270.0, 0.0),
+ }
+ )
+
+
+@RobotConfig.register_subclass("rebot_b601_follower")
+@dataclass
+class RebotB601FollowerRobotConfig(RobotConfig, RebotB601FollowerConfig):
+ """Registered configuration for the reBot B601-DM follower robot.
+
+ Combines ``RobotConfig`` with every field of ``RebotB601FollowerConfig`` and is
+ registered under the type name "rebot_b601_follower". It declares no fields of
+ its own.
+ """
+
+ pass
diff --git a/src/lerobot/robots/rebot_b601_follower/rebot_b601_follower.py b/src/lerobot/robots/rebot_b601_follower/rebot_b601_follower.py
new file mode 100644
index 000000000..ec00f4aa9
--- /dev/null
+++ b/src/lerobot/robots/rebot_b601_follower/rebot_b601_follower.py
@@ -0,0 +1,289 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import math
+import time
+from functools import cached_property
+from typing import TYPE_CHECKING
+
+from lerobot.cameras import make_cameras_from_configs
+from lerobot.motors import MotorCalibration
+from lerobot.types import RobotAction, RobotObservation
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
+from lerobot.utils.import_utils import _motorbridge_available, require_package
+
+from ..robot import Robot
+from ..utils import ensure_safe_goal_position
+from .config_rebot_b601_follower import RebotB601FollowerRobotConfig
+
+if TYPE_CHECKING or _motorbridge_available:
+ from motorbridge import Controller as MotorBridgeController, Mode as MotorBridgeMode
+else:
+ MotorBridgeController = None
+ MotorBridgeMode = None
+
+logger = logging.getLogger(__name__)
+
+# Joint controlled in FORCE_POS mode; every other joint runs in POS_VEL mode.
+GRIPPER_MOTOR = "gripper"
+# Per-joint Damiao motor models for the B601-DM (passed to motorbridge).
+# Looked up by motor name in connect(), so every key of ``config.motor_can_ids``
+# must appear here.
+MOTOR_MODELS = {
+ "shoulder_pan": "4340P",
+ "shoulder_lift": "4340P",
+ "elbow_flex": "4340P",
+ "wrist_flex": "4310",
+ "wrist_yaw": "4310",
+ "wrist_roll": "4310",
+ "gripper": "4310",
+}
+# configure() loops over range(_ENSURE_MODE_RETRIES + 1), i.e. up to 10 ensure_mode
+# attempts per motor before the last exception is re-raised.
+_ENSURE_MODE_RETRIES = 9
+# Sleep (seconds) used in configure() after a failed ensure_mode attempt.
+_SETTLE_SEC = 0.01
+# Sleep (seconds) used in calibrate() after set_zero_position().
+_ZERO_SETTLE_SEC = 0.1
+
+
+class RebotB601Follower(Robot):
+ """Seeed Studio reBot B601-DM follower arm (6-DOF + gripper, Damiao CAN motors).
+
+ Motor communication is handled by the ``motorbridge`` package over a CAN bus,
+ reached either through a Damiao serial bridge or a SocketCAN adapter.
+ """
+
+ config_class = RebotB601FollowerRobotConfig
+ name = "rebot_b601_follower"
+
+ def __init__(self, config: RebotB601FollowerRobotConfig):
+ """Store the config and build the cameras; no hardware is opened here.
+
+ ``require_package`` is called first for the ``motorbridge`` package (extra "rebot").
+ ``self.bus`` stays ``None`` and ``self.motors`` stays empty until ``connect()``
+ fills them in.
+ """
+ require_package("motorbridge", extra="rebot")
+ super().__init__(config)
+ self.config = config
+ self.bus: MotorBridgeController | None = None
+ self.motors: dict = {}
+ # Motor order follows the key order of config.motor_can_ids; send_action()
+ # uses positions in this list to index config.pos_vel_velocity.
+ self.motor_names = list(config.motor_can_ids.keys())
+ self.cameras = make_cameras_from_configs(config.cameras)
+
+ @property
+ def _motors_ft(self) -> dict[str, type]:
+ """Map "<motor>.pos" to ``float`` for every name in ``self.motor_names``."""
+ return {f"{motor}.pos": float for motor in self.motor_names}
+
+ @property
+ def _cameras_ft(self) -> dict[str, tuple]:
+ """Map each camera key to a (height, width, 3) tuple read from its camera config."""
+ return {
+ cam: (self.config.cameras[cam].height, self.config.cameras[cam].width, 3) for cam in self.cameras
+ }
+
+ @cached_property
+ def observation_features(self) -> dict[str, type | tuple]:
+ """Motor position features merged with camera features.
+
+ Computed on first access and then cached by ``cached_property``.
+ """
+ return {**self._motors_ft, **self._cameras_ft}
+
+ @cached_property
+ def action_features(self) -> dict[str, type]:
+ """Action features: one "<motor>.pos" float entry per motor (cached on first access)."""
+ return self._motors_ft
+
+ @property
+ def is_connected(self) -> bool:
+ """True when a bus object exists and every configured camera reports ``is_connected``.
+
+ With no cameras configured, ``all()`` over the empty collection is true, so the
+ result depends on ``self.bus`` alone.
+ """
+ return self.bus is not None and all(cam.is_connected for cam in self.cameras.values())
+
+ @check_if_already_connected
+ def connect(self, calibrate: bool = True) -> None:
+     """Open the CAN bus, register the motors, optionally calibrate, connect cameras, configure.
+
+     Args:
+         calibrate: When true and no calibration is loaded (``is_calibrated`` is
+             false), ``calibrate()`` runs before the cameras are connected.
+
+     Raises:
+         ValueError: If ``config.can_adapter`` is neither "damiao" nor "socketcan".
+
+     If any step after the bus has been opened fails (motor registration,
+     calibration, camera connection or ``configure()``), the partial connection is
+     rolled back before the original exception is re-raised: connected cameras are
+     disconnected, registered motors and the bus are closed, and ``self.bus`` /
+     ``self.motors`` are reset. Without this, the bus would stay open while
+     ``is_connected`` may report false, and a retry would open a second controller
+     on the same port.
+     """
+     logger.info(f"Connecting {self} on {self.config.port} (adapter={self.config.can_adapter})...")
+     if self.config.can_adapter == "damiao":
+         self.bus = MotorBridgeController.from_dm_serial(
+             serial_port=self.config.port,
+             baud=self.config.dm_serial_baud,
+         )
+     elif self.config.can_adapter == "socketcan":
+         self.bus = MotorBridgeController(channel=self.config.port)
+     else:
+         raise ValueError(
+             f"Unsupported can_adapter '{self.config.can_adapter}'. Use 'damiao' or 'socketcan'."
+         )
+
+     try:
+         for motor_name, (send_id, recv_id) in self.config.motor_can_ids.items():
+             self.motors[motor_name] = self.bus.add_damiao_motor(send_id, recv_id, MOTOR_MODELS[motor_name])
+
+         if not self.is_calibrated and calibrate:
+             logger.info(
+                 "Mismatch between calibration values in the motor and the calibration file or no calibration file found"
+             )
+             self.calibrate()
+
+         for cam in self.cameras.values():
+             cam.connect()
+
+         self.configure()
+     except BaseException:
+         # BaseException so that Ctrl-C during the interactive calibration prompt
+         # also releases the port. Each rollback step is best-effort so that the
+         # original exception is the one that propagates.
+         # Motor torque state is deliberately left untouched here.
+         for cam in self.cameras.values():
+             try:
+                 if cam.is_connected:
+                     cam.disconnect()
+             except Exception:
+                 logger.warning(f"Failed to disconnect a camera while rolling back {self}.", exc_info=True)
+         for motor in self.motors.values():
+             try:
+                 motor.close()
+             except Exception:
+                 logger.warning(f"Failed to close a motor while rolling back {self}.", exc_info=True)
+         try:
+             self.bus.close()
+         except Exception:
+             logger.warning(f"Failed to close the CAN bus while rolling back {self}.", exc_info=True)
+         self.bus = None
+         self.motors = {}
+         raise
+
+     logger.info(f"{self} connected.")
+
+ @property
+ def is_calibrated(self) -> bool:
+ """True when ``self.calibration`` is non-empty; the motors themselves are not queried."""
+ return bool(self.calibration)
+
+ def calibrate(self) -> None:
+ """Interactively set the arm's zero position and save a calibration file.
+
+ If a calibration is already loaded, the user is asked whether to keep it; any
+ answer other than 'c' returns immediately without touching the motors.
+ Otherwise torque is disabled on all motors, the user is asked to place the arm
+ in its zero pose, every motor's current position is set as zero, and one
+ ``MotorCalibration`` entry per motor is written and saved.
+
+ NOTE(review): this method uses ``self.bus`` without a connection guard, so it
+ must be called after the bus has been opened. It does not re-enable torque
+ itself; ``connect()`` calls ``configure()`` afterwards, which does.
+ """
+ if self.calibration:
+ user_input = input(
+ f"Press ENTER to use provided calibration file associated with the id {self.id}, "
+ "or type 'c' and press ENTER to run calibration: "
+ )
+ if user_input.strip().lower() != "c":
+ logger.info(f"Using calibration file associated with the id {self.id}")
+ return
+
+ logger.info(f"\nRunning calibration of {self}")
+ # Torque off so the arm can be positioned by hand.
+ self.bus.disable_all()
+ print(
+ "\nCalibration: set zero position.\n"
+ "Manually move the reBot B601 to its ZERO POSITION and close the gripper.\n"
+ "See the B601 manual for the zero pose (the default sit-down position).\n"
+ )
+ input("Press ENTER when ready...")
+
+ # NOTE(review): confirm whether the settle delay below is meant to run once per
+ # motor or once after all motors have been zeroed.
+ for motor in self.motors.values():
+ motor.set_zero_position()
+ time.sleep(_ZERO_SETTLE_SEC)
+ logger.info("Arm zero position set.")
+
+ # The stored ranges come from config.joint_limits, not from a measurement;
+ # int() truncates any fractional part of a limit.
+ self.calibration = {}
+ for motor_name, (send_id, _recv_id) in self.config.motor_can_ids.items():
+ range_min, range_max = self.config.joint_limits[motor_name]
+ self.calibration[motor_name] = MotorCalibration(
+ id=send_id,
+ drive_mode=0,
+ homing_offset=0,
+ range_min=int(range_min),
+ range_max=int(range_max),
+ )
+
+ self._save_calibration()
+ print(f"Calibration saved to {self.calibration_fpath}")
+
+ def configure(self) -> None:
+ """Enable all motors and put each one into its control mode.
+
+ The gripper (``GRIPPER_MOTOR``) is set to FORCE_POS; every other motor is set
+ to POS_VEL. ``ensure_mode`` is attempted up to ``_ENSURE_MODE_RETRIES + 1``
+ times per motor, sleeping ``_SETTLE_SEC`` after a failed attempt; the exception
+ from the final attempt is re-raised.
+ """
+ self.bus.enable_all()
+ for motor_name, motor in self.motors.items():
+ target_mode = (
+ MotorBridgeMode.FORCE_POS if motor_name == GRIPPER_MOTOR else MotorBridgeMode.POS_VEL
+ )
+ for attempt in range(_ENSURE_MODE_RETRIES + 1):
+ try:
+ motor.ensure_mode(target_mode)
+ break
+ except Exception:
+ # Only the last attempt propagates; earlier failures are retried.
+ if attempt == _ENSURE_MODE_RETRIES:
+ raise
+ time.sleep(_SETTLE_SEC)
+ logger.debug(f"{motor_name} mode set to {target_mode}")
+
+ @check_if_not_connected
+ def disable_torque(self) -> None:
+ """Disable motor torque so the arm can be moved by hand (read-only debugging).
+
+ Calls ``disable_all()`` on the bus and logs the change; nothing here re-enables
+ torque afterwards.
+ """
+ self.bus.disable_all()
+ logger.info(f"{self} torque disabled.")
+
+ def _present_pos(self) -> dict[str, float]:
+ """Read present joint positions in degrees.
+
+ Requests feedback from every motor, polls the bus once, then converts each
+ motor's ``state.pos`` with ``math.degrees`` (so ``state.pos`` is presumably in
+ radians). The result always has one entry per motor in ``self.motors``.
+ """
+ for motor in self.motors.values():
+ motor.request_feedback()
+ try:
+ self.bus.poll_feedback_once()
+ except Exception:
+ # Best-effort: a failed poll is logged (without exception details) and the
+ # states returned by get_state() below are used as they are.
+ logger.warning("CAN bus poll feedback failed.")
+
+ present_pos = {}
+ for motor_name, motor in self.motors.items():
+ state = motor.get_state()
+ # NOTE(review): a missing state is reported as 0.0 degrees, which cannot be
+ # told apart from a real zero reading. send_action() uses these values as the
+ # reference for the max_relative_target cap, so a fabricated 0.0 shifts that
+ # reference to the zero pose.
+ present_pos[motor_name] = math.degrees(state.pos) if state is not None else 0.0
+ return present_pos
+
+ @check_if_not_connected
+ def get_observation(self) -> RobotObservation:
+ """Return the current joint positions and the latest frame of every camera.
+
+ The result maps "<motor>.pos" to the position in degrees from ``_present_pos()``
+ and each camera key to ``cam.read_latest()``. The time taken by each read is
+ logged at debug level.
+ """
+ start = time.perf_counter()
+ obs_dict = {f"{motor}.pos": pos for motor, pos in self._present_pos().items()}
+ dt_ms = (time.perf_counter() - start) * 1e3
+ logger.debug(f"{self} read state: {dt_ms:.1f}ms")
+
+ for cam_key, cam in self.cameras.items():
+ start = time.perf_counter()
+ obs_dict[cam_key] = cam.read_latest()
+ dt_ms = (time.perf_counter() - start) * 1e3
+ logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms")
+
+ return obs_dict
+
+ @check_if_not_connected
+ def send_action(self, action: RobotAction) -> RobotAction:
+ """Command the arm to a target joint configuration.
+
+ Positions are expressed in degrees. The relative action magnitude may be
+ clipped depending on `max_relative_target`, so the action actually sent is
+ always returned.
+
+ Only keys ending in ".pos" are read from ``action``; other keys are ignored.
+ The returned dict has one "<motor>.pos" entry per goal, after joint-limit
+ clipping, the wrist_yaw default and the optional relative-target cap.
+ """
+ goal_pos = {key.removesuffix(".pos"): val for key, val in action.items() if key.endswith(".pos")}
+
+ # Clip against soft joint limits.
+ # Names without an entry in config.joint_limits pass through unclipped.
+ for motor_name in list(goal_pos):
+ if motor_name in self.config.joint_limits:
+ min_limit, max_limit = self.config.joint_limits[motor_name]
+ clipped = max(min_limit, min(max_limit, goal_pos[motor_name]))
+ if clipped != goal_pos[motor_name]:
+ logger.debug(f"Clipped {motor_name} from {goal_pos[motor_name]:.2f} to {clipped:.2f}")
+ goal_pos[motor_name] = clipped
+
+ # Tolerate 6-DOF leaders that have no wrist_yaw joint by holding it at zero.
+ # This is intentional: it lets a 6-DOF leader such as the SO-100 / SO-101
+ # (so100_leader / so101_leader) teleoperate this 7-DOF follower — the missing
+ # wrist_yaw command is simply treated as 0.0 instead of raising.
+ if "wrist_yaw" not in goal_pos:
+ goal_pos["wrist_yaw"] = 0.0
+
+ # Cap relative target when too far from the present position.
+ # A goal whose name is missing from the present-position reading uses the goal
+ # itself as the reference (``present_pos.get(key, g)``).
+ if self.config.max_relative_target is not None:
+ present_pos = self._present_pos()
+ goal_present_pos = {key: (g, present_pos.get(key, g)) for key, g in goal_pos.items()}
+ goal_pos = ensure_safe_goal_position(goal_present_pos, self.config.max_relative_target)
+
+ for motor_name, position_deg in goal_pos.items():
+ motor = self.motors.get(motor_name)
+ # Goals naming a motor that is not registered are skipped here.
+ # NOTE(review): they are still included in the returned action below.
+ if motor is None:
+ continue
+ idx = self.motor_names.index(motor_name)
+ # NOTE(review): a pos_vel_velocity list shorter than motor_names raises
+ # IndexError here; nothing visible validates its length.
+ vel_deg_s = (
+ self.config.pos_vel_velocity[idx]
+ if isinstance(self.config.pos_vel_velocity, list)
+ else self.config.pos_vel_velocity
+ )
+ # Goals and velocities are configured in degrees and converted with
+ # math.radians before being sent.
+ pos_rad = math.radians(position_deg)
+ vel_rad = math.radians(vel_deg_s)
+ if motor_name == GRIPPER_MOTOR:
+ motor.send_force_pos(pos_rad, vel_rad, self.config.gripper_torque_ratio)
+ else:
+ motor.send_pos_vel(pos_rad, vel_rad)
+
+ return {f"{motor}.pos": val for motor, val in goal_pos.items()}
+
+ @check_if_not_connected
+ def disconnect(self) -> None:
+ """Shut down the motors, close the bus and disconnect every camera.
+
+ ``motor.disable()`` is called only when ``config.disable_torque_on_disconnect``
+ is true. Afterwards ``self.bus`` is ``None`` and ``self.motors`` is empty.
+
+ NOTE(review): the steps run sequentially with no try/finally, so an exception
+ from a motor call leaves the bus open and the cameras connected.
+ NOTE(review): presumably the decorator rejects the call when ``is_connected``
+ is false, which includes the case of a camera that is not connected — confirm,
+ since the bus could then not be closed through this method.
+ """
+ for motor in self.motors.values():
+ if self.config.disable_torque_on_disconnect:
+ motor.disable()
+ motor.clear_error()
+ motor.close()
+
+ self.bus.close()
+ self.bus = None
+ self.motors = {}
+
+ for cam in self.cameras.values():
+ cam.disconnect()
+
+ logger.info(f"{self} disconnected.")
diff --git a/src/lerobot/robots/utils.py b/src/lerobot/robots/utils.py
index 92da597f1..f897a560e 100644
--- a/src/lerobot/robots/utils.py
+++ b/src/lerobot/robots/utils.py
@@ -68,6 +68,14 @@ def make_robot_from_config(config: RobotConfig) -> Robot:
from .bi_openarm_follower import BiOpenArmFollower
return BiOpenArmFollower(config)
+ elif config.type == "rebot_b601_follower":
+ from .rebot_b601_follower import RebotB601Follower
+
+ return RebotB601Follower(config)
+ elif config.type == "bi_rebot_b601_follower":
+ from .bi_rebot_b601_follower import BiRebotB601Follower
+
+ return BiRebotB601Follower(config)
elif config.type == "mock_robot":
from tests.mocks.mock_robot import MockRobot
diff --git a/src/lerobot/rollout/context.py b/src/lerobot/rollout/context.py
index 8804cd789..bf5fa0fd4 100644
--- a/src/lerobot/rollout/context.py
+++ b/src/lerobot/rollout/context.py
@@ -332,7 +332,7 @@ def build_rollout_context(
cfg.dataset.repo_id,
root=cfg.dataset.root,
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
- vcodec=cfg.dataset.vcodec,
+ camera_encoder=cfg.dataset.camera_encoder,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,
@@ -367,7 +367,7 @@ def build_rollout_context(
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera
* len(robot.cameras if hasattr(robot, "cameras") else []),
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
- vcodec=cfg.dataset.vcodec,
+ camera_encoder=cfg.dataset.camera_encoder,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,
diff --git a/src/lerobot/scripts/lerobot_calibrate.py b/src/lerobot/scripts/lerobot_calibrate.py
index e68d7438b..e43736954 100644
--- a/src/lerobot/scripts/lerobot_calibrate.py
+++ b/src/lerobot/scripts/lerobot_calibrate.py
@@ -39,6 +39,7 @@ from lerobot.robots import ( # noqa: F401
Robot,
RobotConfig,
bi_openarm_follower,
+ bi_rebot_b601_follower,
bi_so_follower,
hope_jr,
koch_follower,
@@ -46,12 +47,14 @@ from lerobot.robots import ( # noqa: F401
make_robot_from_config,
omx_follower,
openarm_follower,
+ rebot_b601_follower,
so_follower,
)
from lerobot.teleoperators import ( # noqa: F401
Teleoperator,
TeleoperatorConfig,
bi_openarm_leader,
+ bi_rebot_102_leader,
bi_so_leader,
homunculus,
koch_leader,
@@ -59,6 +62,7 @@ from lerobot.teleoperators import ( # noqa: F401
omx_leader,
openarm_leader,
openarm_mini,
+ rebot_102_leader,
so_leader,
unitree_g1,
)
diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py
index a708d37a3..3c1edbb31 100644
--- a/src/lerobot/scripts/lerobot_edit_dataset.py
+++ b/src/lerobot/scripts/lerobot_edit_dataset.py
@@ -178,6 +178,31 @@ Recompute stats for relative actions and push to hub:
--operation.num_workers 4 \
--push_to_hub true
+Re-encode all videos in a dataset (saves to lerobot/pusht_reencoded by default):
+ lerobot-edit-dataset \
+ --repo_id lerobot/pusht \
+ --operation.type reencode_videos \
+ --operation.camera_encoder.vcodec h264 \
+ --operation.camera_encoder.pix_fmt yuv420p \
+ --operation.camera_encoder.crf 23
+
+Re-encode videos into a new dataset using 4 parallel processes:
+ lerobot-edit-dataset \
+ --repo_id lerobot/pusht \
+ --new_repo_id lerobot/pusht_h264 \
+ --operation.type reencode_videos \
+ --operation.camera_encoder.vcodec h264 \
+ --operation.camera_encoder.crf 23 \
+ --operation.num_workers 4
+
+Re-encode videos in-place (overwrites original dataset):
+ lerobot-edit-dataset \
+ --repo_id lerobot/pusht \
+ --new_repo_id lerobot/pusht \
+ --operation.type reencode_videos \
+ --operation.camera_encoder.vcodec h264 \
+ --operation.overwrite true
+
Using JSON config file:
lerobot-edit-dataset \
--config_path path/to/edit_config.json
@@ -187,12 +212,12 @@ import abc
import logging
import shutil
import sys
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from pathlib import Path
import draccus
-from lerobot.configs import parser
+from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser
from lerobot.datasets import (
LeRobotDataset,
convert_image_to_video_dataset,
@@ -200,6 +225,7 @@ from lerobot.datasets import (
merge_datasets,
modify_tasks,
recompute_stats,
+ reencode_dataset,
remove_feature,
split_dataset,
)
@@ -250,11 +276,7 @@ class ModifyTasksConfig(OperationConfig):
@dataclass
class ConvertImageToVideoConfig(OperationConfig):
output_dir: str | None = None
- vcodec: str = "libsvtav1"
- pix_fmt: str = "yuv420p"
- g: int = 2
- crf: int = 30
- fast_decode: int = 0
+ camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
episode_indices: list[int] | None = None
num_workers: int = 4
max_episodes_per_batch: int | None = None
@@ -272,6 +294,15 @@ class RecomputeStatsConfig(OperationConfig):
overwrite: bool = False
+@OperationConfig.register_subclass("reencode_videos")
+@dataclass
+class ReencodeVideosConfig(OperationConfig):
+ # Options for the "reencode_videos" operation, consumed by handle_reencode_videos().
+ # Encoder settings passed to reencode_dataset(); defaults come from camera_encoder_defaults().
+ camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
+ # Passed to reencode_dataset() as num_workers.
+ num_workers: int = 0
+ # Passed to reencode_dataset() as encoder_threads.
+ encoder_threads: int | None = None
+ # Must be true to re-encode in place (output root equal to input root);
+ # otherwise handle_reencode_videos() raises ValueError.
+ overwrite: bool = False
+
+
@OperationConfig.register_subclass("info")
@dataclass
class InfoConfig(OperationConfig):
@@ -557,11 +588,7 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
dataset=dataset,
output_dir=output_dir,
repo_id=output_repo_id,
- vcodec=getattr(cfg.operation, "vcodec", "libsvtav1"),
- pix_fmt=getattr(cfg.operation, "pix_fmt", "yuv420p"),
- g=getattr(cfg.operation, "g", 2),
- crf=getattr(cfg.operation, "crf", 30),
- fast_decode=getattr(cfg.operation, "fast_decode", 0),
+ camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(),
episode_indices=getattr(cfg.operation, "episode_indices", None),
num_workers=getattr(cfg.operation, "num_workers", 4),
max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None),
@@ -642,6 +669,58 @@ def handle_recompute_stats(cfg: EditDatasetConfig) -> None:
dataset.push_to_hub()
+def handle_reencode_videos(cfg: EditDatasetConfig) -> None:
+ """Re-encode every video of a dataset, in place or in a copy.
+
+ When the resolved output root differs from the input root, the input directory
+ is copied to the output root and the copy is re-encoded. An existing output
+ directory is first moved to "<name>_old", replacing any earlier backup of that
+ name. When both roots are equal, the dataset is modified in place, which
+ requires ``cfg.operation.overwrite``. The dataset is pushed to the hub afterwards
+ when ``cfg.push_to_hub`` is set.
+
+ Raises:
+ ValueError: If ``cfg.operation`` is not a ``ReencodeVideosConfig``, or if the
+ operation would run in place while ``overwrite`` is false.
+ """
+ if not isinstance(cfg.operation, ReencodeVideosConfig):
+ raise ValueError("Operation config must be ReencodeVideosConfig")
+
+ output_repo_id, input_root, output_root = _resolve_io_paths(
+ cfg.repo_id,
+ cfg.new_repo_id,
+ cfg.root,
+ cfg.new_root,
+ default_new_repo_id=f"{cfg.repo_id}_reencoded",
+ )
+ in_place = output_root == input_root
+
+ if in_place and not cfg.operation.overwrite:
+ raise ValueError(
+ f"reencode_videos would overwrite the dataset in-place at {input_root}. "
+ "Pass --operation.overwrite true to allow in-place modification, "
+ "or use --new_repo_id / --new_root to write to a different location. "
+ f"Default output repo_id when neither is set: '{cfg.repo_id}_reencoded'."
+ )
+
+ if in_place:
+ logging.warning(
+ f"Overwriting dataset videos in-place at {input_root}. The original videos will be lost."
+ )
+ dataset = LeRobotDataset(cfg.repo_id, root=input_root)
+ else:
+ logging.info(f"Copying dataset from {input_root} to {output_root}")
+ if output_root.exists():
+ backup_path = output_root.with_name(output_root.name + "_old")
+ logging.warning(f"Output directory {output_root} already exists. Moving to {backup_path}")
+ # An earlier backup is deleted without confirmation.
+ if backup_path.exists():
+ shutil.rmtree(backup_path)
+ shutil.move(output_root, backup_path)
+ # NOTE(review): copytree needs input_root to exist on disk, and no dataset has
+ # been loaded from cfg.repo_id at this point. Confirm that _resolve_io_paths or
+ # the caller guarantees a local copy, e.g. for a dataset that only lives on the hub.
+ shutil.copytree(input_root, output_root)
+ dataset = LeRobotDataset(output_repo_id, root=output_root)
+
+ logging.info(f"Re-encoding videos in {output_repo_id} with {cfg.operation.camera_encoder}")
+ reencode_dataset(
+ dataset,
+ camera_encoder=cfg.operation.camera_encoder,
+ encoder_threads=cfg.operation.encoder_threads,
+ num_workers=cfg.operation.num_workers,
+ )
+
+ logging.info(f"All videos re-encoded at {dataset.root}")
+
+ if cfg.push_to_hub:
+ logging.info(f"Pushing to hub as {output_repo_id}...")
+ dataset.push_to_hub()
+
+
def _get_dataset_size(repo_path):
import os
@@ -715,6 +794,8 @@ def edit_dataset(cfg: EditDatasetConfig) -> None:
handle_convert_image_to_video(cfg)
elif operation_type == "recompute_stats":
handle_recompute_stats(cfg)
+ elif operation_type == "reencode_videos":
+ handle_reencode_videos(cfg)
elif operation_type == "info":
handle_info(cfg)
else:
diff --git a/src/lerobot/scripts/lerobot_find_joint_limits.py b/src/lerobot/scripts/lerobot_find_joint_limits.py
index c4f867631..5b9166a2e 100644
--- a/src/lerobot/scripts/lerobot_find_joint_limits.py
+++ b/src/lerobot/scripts/lerobot_find_joint_limits.py
@@ -45,16 +45,19 @@ from lerobot.model import RobotKinematics
from lerobot.robots import ( # noqa: F401
RobotConfig,
bi_openarm_follower,
+ bi_rebot_b601_follower,
bi_so_follower,
koch_follower,
make_robot_from_config,
omx_follower,
openarm_follower,
+ rebot_b601_follower,
so_follower,
)
from lerobot.teleoperators import ( # noqa: F401
TeleoperatorConfig,
bi_openarm_leader,
+ bi_rebot_102_leader,
bi_so_leader,
gamepad,
koch_leader,
@@ -62,6 +65,7 @@ from lerobot.teleoperators import ( # noqa: F401
omx_leader,
openarm_leader,
openarm_mini,
+ rebot_102_leader,
so_leader,
)
from lerobot.utils.robot_utils import precise_sleep
diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py
index 129696bd3..c411ebf9e 100644
--- a/src/lerobot/scripts/lerobot_record.py
+++ b/src/lerobot/scripts/lerobot_record.py
@@ -63,6 +63,27 @@ lerobot-record \\
--dataset.streaming_encoding=true \\
--dataset.encoder_threads=2
```
+
+Example recording with custom video encoding parameters:
+```shell
+lerobot-record \\
+ --robot.type=so100_follower \\
+ --robot.port=/dev/tty.usbmodem58760431541 \\
+ --robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \\
+ --robot.id=black \\
+ --teleop.type=so100_leader \\
+ --teleop.port=/dev/tty.usbmodem58760431551 \\
+ --teleop.id=blue \\
+ --dataset.repo_id=<hf_username>/<dataset_repo_id> \\
+ --dataset.num_episodes=2 \\
+ --dataset.single_task="Grab the cube" \\
+ --dataset.streaming_encoding=true \\
+ --dataset.encoder_threads=2 \\
+ --dataset.camera_encoder.vcodec=h264 \\
+ --dataset.camera_encoder.preset=fast \\
+ --dataset.camera_encoder.extra_options='{"tune": "film", "profile:v": "high", "bf": 2}' \\
+ --display_data=true
+```
"""
import logging
@@ -99,6 +120,7 @@ from lerobot.robots import ( # noqa: F401
Robot,
RobotConfig,
bi_openarm_follower,
+ bi_rebot_b601_follower,
bi_so_follower,
earthrover_mini_plus,
hope_jr,
@@ -107,6 +129,7 @@ from lerobot.robots import ( # noqa: F401
omx_follower,
openarm_follower,
reachy2,
+ rebot_b601_follower,
so_follower,
unitree_g1 as unitree_g1_robot,
)
@@ -114,6 +137,7 @@ from lerobot.teleoperators import ( # noqa: F401
Teleoperator,
TeleoperatorConfig,
bi_openarm_leader,
+ bi_rebot_102_leader,
bi_so_leader,
homunculus,
koch_leader,
@@ -122,6 +146,7 @@ from lerobot.teleoperators import ( # noqa: F401
openarm_leader,
openarm_mini,
reachy2_teleoperator,
+ rebot_102_leader,
so_leader,
unitree_g1,
)
@@ -377,10 +402,10 @@ def record(
cfg.dataset.repo_id,
root=cfg.dataset.root,
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
- vcodec=cfg.dataset.vcodec,
+ camera_encoder=cfg.dataset.camera_encoder,
+ encoder_threads=cfg.dataset.encoder_threads,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
- encoder_threads=cfg.dataset.encoder_threads,
image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras
if num_cameras > 0
@@ -406,10 +431,10 @@ def record(
image_writer_processes=cfg.dataset.num_image_writer_processes,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
- vcodec=cfg.dataset.vcodec,
+ camera_encoder=cfg.dataset.camera_encoder,
+ encoder_threads=cfg.dataset.encoder_threads,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
- encoder_threads=cfg.dataset.encoder_threads,
)
robot.connect()
@@ -420,7 +445,7 @@ def record(
if not cfg.dataset.streaming_encoding:
logging.info(
- "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
+ "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
)
with VideoEncodingManager(dataset):
diff --git a/src/lerobot/scripts/lerobot_replay.py b/src/lerobot/scripts/lerobot_replay.py
index 41d2926cc..1851f7c2b 100644
--- a/src/lerobot/scripts/lerobot_replay.py
+++ b/src/lerobot/scripts/lerobot_replay.py
@@ -56,6 +56,7 @@ from lerobot.robots import ( # noqa: F401
Robot,
RobotConfig,
bi_openarm_follower,
+ bi_rebot_b601_follower,
bi_so_follower,
earthrover_mini_plus,
hope_jr,
@@ -64,6 +65,7 @@ from lerobot.robots import ( # noqa: F401
omx_follower,
openarm_follower,
reachy2,
+ rebot_b601_follower,
so_follower,
unitree_g1,
)
diff --git a/src/lerobot/scripts/lerobot_rollout.py b/src/lerobot/scripts/lerobot_rollout.py
index 6a81563ee..3378b6de4 100644
--- a/src/lerobot/scripts/lerobot_rollout.py
+++ b/src/lerobot/scripts/lerobot_rollout.py
@@ -120,6 +120,18 @@ Usage examples
--dataset.repo_id=user/rollout_sentry_data \\
--dataset.single_task="patrol" \\
--resume=true
+
+ # Rollout with custom video encoding parameters
+ lerobot-rollout \\
+ --strategy.type=base \\
+ --policy.path=lerobot/act_koch_real \\
+ --robot.type=koch_follower \\
+ --robot.port=/dev/ttyACM0 \\
+ --task="pick up cube" --duration=60 \\
+ --display_data=true \\
+ --dataset.camera_encoder.vcodec=h264 \\
+ --dataset.camera_encoder.preset=fast \\
+ --dataset.camera_encoder.extra_options='{"tune": "film", "profile:v": "high", "bf": 2}'
"""
import logging
@@ -132,6 +144,7 @@ from lerobot.robots import ( # noqa: F401
Robot,
RobotConfig,
bi_openarm_follower,
+ bi_rebot_b601_follower,
bi_so_follower,
earthrover_mini_plus,
hope_jr,
@@ -139,6 +152,7 @@ from lerobot.robots import ( # noqa: F401
omx_follower,
openarm_follower,
reachy2,
+ rebot_b601_follower,
so_follower,
unitree_g1 as unitree_g1_robot,
)
@@ -147,6 +161,7 @@ from lerobot.teleoperators import ( # noqa: F401
Teleoperator,
TeleoperatorConfig,
bi_openarm_leader,
+ bi_rebot_102_leader,
bi_so_leader,
homunculus,
koch_leader,
@@ -154,6 +169,7 @@ from lerobot.teleoperators import ( # noqa: F401
openarm_leader,
openarm_mini,
reachy2_teleoperator,
+ rebot_102_leader,
so_leader,
unitree_g1,
)
diff --git a/src/lerobot/scripts/lerobot_setup_motors.py b/src/lerobot/scripts/lerobot_setup_motors.py
index 2c962a6e2..69ebcf5fa 100644
--- a/src/lerobot/scripts/lerobot_setup_motors.py
+++ b/src/lerobot/scripts/lerobot_setup_motors.py
@@ -30,20 +30,24 @@ import draccus
from lerobot.robots import ( # noqa: F401
RobotConfig,
+ bi_rebot_b601_follower,
bi_so_follower,
koch_follower,
lekiwi,
make_robot_from_config,
omx_follower,
+ rebot_b601_follower,
so_follower,
)
from lerobot.teleoperators import ( # noqa: F401
TeleoperatorConfig,
+ bi_rebot_102_leader,
bi_so_leader,
koch_leader,
make_teleoperator_from_config,
omx_leader,
openarm_mini,
+ rebot_102_leader,
so_leader,
)
diff --git a/src/lerobot/scripts/lerobot_teleoperate.py b/src/lerobot/scripts/lerobot_teleoperate.py
index 76157595e..2ff02bda0 100644
--- a/src/lerobot/scripts/lerobot_teleoperate.py
+++ b/src/lerobot/scripts/lerobot_teleoperate.py
@@ -72,6 +72,7 @@ from lerobot.robots import ( # noqa: F401
Robot,
RobotConfig,
bi_openarm_follower,
+ bi_rebot_b601_follower,
bi_so_follower,
earthrover_mini_plus,
hope_jr,
@@ -80,6 +81,7 @@ from lerobot.robots import ( # noqa: F401
omx_follower,
openarm_follower,
reachy2,
+ rebot_b601_follower,
so_follower,
unitree_g1 as unitree_g1_robot,
)
@@ -87,6 +89,7 @@ from lerobot.teleoperators import ( # noqa: F401
Teleoperator,
TeleoperatorConfig,
bi_openarm_leader,
+ bi_rebot_102_leader,
bi_so_leader,
gamepad,
homunculus,
@@ -97,6 +100,7 @@ from lerobot.teleoperators import ( # noqa: F401
openarm_leader,
openarm_mini,
reachy2_teleoperator,
+ rebot_102_leader,
so_leader,
unitree_g1,
)
diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index 55a8cc935..463668eb2 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -48,6 +48,7 @@ from lerobot.envs import close_envs, make_env, make_env_pre_post_processors
from lerobot.optim.factory import make_optimizer_and_scheduler
from lerobot.policies import PreTrainedPolicy, make_policy, make_pre_post_processors
from lerobot.rewards import make_reward_pre_post_processors
+from lerobot.utils.collate import lerobot_collate_fn
from lerobot.utils.import_utils import register_third_party_plugins
from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
from lerobot.utils.random_utils import set_seed
@@ -401,6 +402,10 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
shuffle = True
sampler = None
+ # Only swap in the language-aware collate when the dataset actually
+ # declares language columns; otherwise stay on PyTorch's default
+ # collate so non-language training runs are unaffected.
+ collate_fn = lerobot_collate_fn if dataset.meta.has_language_columns else None
dataloader = torch.utils.data.DataLoader(
dataset,
num_workers=cfg.num_workers,
@@ -409,6 +414,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
sampler=sampler,
pin_memory=device.type == "cuda",
drop_last=False,
+ collate_fn=collate_fn,
prefetch_factor=cfg.prefetch_factor if cfg.num_workers > 0 else None,
persistent_workers=cfg.persistent_workers and cfg.num_workers > 0,
)
diff --git a/src/lerobot/teleoperators/bi_rebot_102_leader/__init__.py b/src/lerobot/teleoperators/bi_rebot_102_leader/__init__.py
new file mode 100644
index 000000000..c15cf76d8
--- /dev/null
+++ b/src/lerobot/teleoperators/bi_rebot_102_leader/__init__.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .bi_rebot_102_leader import BiRebotArm102Leader
+from .config_bi_rebot_102_leader import BiRebotArm102LeaderConfig
+
+__all__ = ["BiRebotArm102Leader", "BiRebotArm102LeaderConfig"]
diff --git a/src/lerobot/teleoperators/bi_rebot_102_leader/bi_rebot_102_leader.py b/src/lerobot/teleoperators/bi_rebot_102_leader/bi_rebot_102_leader.py
new file mode 100644
index 000000000..a4e5fd8c6
--- /dev/null
+++ b/src/lerobot/teleoperators/bi_rebot_102_leader/bi_rebot_102_leader.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from functools import cached_property
+
+from lerobot.types import RobotAction
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
+
+from ..rebot_102_leader import RebotArm102Leader, RebotArm102LeaderTeleopConfig
+from ..teleoperator import Teleoperator
+from .config_bi_rebot_102_leader import BiRebotArm102LeaderConfig
+
+logger = logging.getLogger(__name__)
+
+
+class BiRebotArm102Leader(Teleoperator):
+    """Bimanual Seeed Studio StarArm102 / reBot Arm 102 leader.
+
+    Composes two single-arm :class:`RebotArm102Leader` instances. Action keys of
+    each arm are namespaced with a ``left_`` / ``right_`` prefix, so a bimanual
+    leader can teleoperate a bimanual reBot B601 follower.
+    """
+
+    config_class = BiRebotArm102LeaderConfig
+    name = "bi_rebot_102_leader"
+
+    def __init__(self, config: BiRebotArm102LeaderConfig):
+        super().__init__(config)
+        self.config = config
+
+        left_arm_config = RebotArm102LeaderTeleopConfig(
+            id=f"{config.id}_left" if config.id else None,
+            calibration_dir=config.calibration_dir,
+            port=config.left_arm_config.port,
+            baudrate=config.left_arm_config.baudrate,
+            joint_ids=config.left_arm_config.joint_ids,
+            joint_directions=config.left_arm_config.joint_directions,
+            joint_ranges=config.left_arm_config.joint_ranges,
+        )
+
+        right_arm_config = RebotArm102LeaderTeleopConfig(
+            id=f"{config.id}_right" if config.id else None,
+            calibration_dir=config.calibration_dir,
+            port=config.right_arm_config.port,
+            baudrate=config.right_arm_config.baudrate,
+            joint_ids=config.right_arm_config.joint_ids,
+            joint_directions=config.right_arm_config.joint_directions,
+            joint_ranges=config.right_arm_config.joint_ranges,
+        )
+
+        self.left_arm = RebotArm102Leader(left_arm_config)
+        self.right_arm = RebotArm102Leader(right_arm_config)
+
+    @cached_property
+    def action_features(self) -> dict[str, type]:
+        return {
+            **{f"left_{k}": v for k, v in self.left_arm.action_features.items()},
+            **{f"right_{k}": v for k, v in self.right_arm.action_features.items()},
+        }
+
+    @cached_property
+    def feedback_features(self) -> dict[str, type]:
+        return {}
+
+    @property
+    def is_connected(self) -> bool:
+        return self.left_arm.is_connected and self.right_arm.is_connected
+
+    @check_if_already_connected
+    def connect(self, calibrate: bool = True) -> None:
+        """Connect the left arm, then the right arm.
+
+        If the right arm fails to connect, the already-connected left arm is
+        disconnected again before the error propagates, so its bus is not left
+        open behind an object that reports ``is_connected == False``.
+        """
+        self.left_arm.connect(calibrate)
+        try:
+            self.right_arm.connect(calibrate)
+        except Exception:
+            # Best-effort rollback; never let a cleanup failure mask the original error.
+            try:
+                self.left_arm.disconnect()
+            except Exception as cleanup_error:
+                logger.warning(f"Failed to roll back left arm connection: {cleanup_error}")
+            raise
+
+    @property
+    def is_calibrated(self) -> bool:
+        return self.left_arm.is_calibrated and self.right_arm.is_calibrated
+
+    def calibrate(self) -> None:
+        self.left_arm.calibrate()
+        self.right_arm.calibrate()
+
+    def configure(self) -> None:
+        self.left_arm.configure()
+        self.right_arm.configure()
+
+    @check_if_not_connected
+    def get_action(self) -> RobotAction:
+        action_dict = {}
+        action_dict.update({f"left_{k}": v for k, v in self.left_arm.get_action().items()})
+        action_dict.update({f"right_{k}": v for k, v in self.right_arm.get_action().items()})
+        return action_dict
+
+    def send_feedback(self, feedback: dict[str, float]) -> None:
+        raise NotImplementedError("Feedback is not implemented for the reBot Arm 102 leader.")
+
+    @check_if_not_connected
+    def disconnect(self) -> None:
+        """Disconnect both arms; the right arm is released even if the left arm raises."""
+        try:
+            self.left_arm.disconnect()
+        finally:
+            self.right_arm.disconnect()
diff --git a/src/lerobot/teleoperators/bi_rebot_102_leader/config_bi_rebot_102_leader.py b/src/lerobot/teleoperators/bi_rebot_102_leader/config_bi_rebot_102_leader.py
new file mode 100644
index 000000000..265ae26c1
--- /dev/null
+++ b/src/lerobot/teleoperators/bi_rebot_102_leader/config_bi_rebot_102_leader.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from ..config import TeleoperatorConfig
+from ..rebot_102_leader import RebotArm102LeaderConfig
+
+
+@TeleoperatorConfig.register_subclass("bi_rebot_102_leader")
+@dataclass
+class BiRebotArm102LeaderConfig(TeleoperatorConfig):
+    """Bimanual reBot Arm 102 leader configuration: one single-arm config per side."""
+
+    left_arm_config: RebotArm102LeaderConfig
+    right_arm_config: RebotArm102LeaderConfig
diff --git a/src/lerobot/teleoperators/rebot_102_leader/__init__.py b/src/lerobot/teleoperators/rebot_102_leader/__init__.py
new file mode 100644
index 000000000..a13524707
--- /dev/null
+++ b/src/lerobot/teleoperators/rebot_102_leader/__init__.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .config_rebot_102_leader import RebotArm102LeaderConfig, RebotArm102LeaderTeleopConfig
+from .rebot_102_leader import RebotArm102Leader
+
+__all__ = ["RebotArm102Leader", "RebotArm102LeaderConfig", "RebotArm102LeaderTeleopConfig"]
diff --git a/src/lerobot/teleoperators/rebot_102_leader/config_rebot_102_leader.py b/src/lerobot/teleoperators/rebot_102_leader/config_rebot_102_leader.py
new file mode 100644
index 000000000..d1beea2ed
--- /dev/null
+++ b/src/lerobot/teleoperators/rebot_102_leader/config_rebot_102_leader.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from ..config import TeleoperatorConfig
+
+
+@dataclass
+class RebotArm102LeaderConfig:
+    """Base configuration class for the Seeed Studio StarArm102 / reBot Arm 102 leader.
+
+    The reBot Arm 102 is a 7-joint (incl. gripper) leader arm driven by FashionStar
+    UART smart servos. Servo communication goes through ``motorbridge-smart-servo``.
+    """
+
+    # USB-to-UART device the leader arm is connected to (e.g. "/dev/ttyUSB0").
+    port: str
+
+    baudrate: int = 1_000_000
+
+    # Servo id of each joint on the UART bus.
+    joint_ids: dict[str, int] = field(
+        default_factory=lambda: {
+            "shoulder_pan": 0,
+            "shoulder_lift": 1,
+            "elbow_flex": 2,
+            "wrist_flex": 3,
+            "wrist_yaw": 4,
+            "wrist_roll": 5,
+            "gripper": 6,
+        }
+    )
+
+    # Per-joint multiplier applied to the unwrapped servo angle: the sign maps the leader
+    # onto the follower's direction convention and a magnitude other than 1 rescales the
+    # joint (the gripper's -6 stretches its travel to the reBot B601 follower's gripper).
+    joint_directions: dict[str, int] = field(
+        default_factory=lambda: {
+            "shoulder_pan": -1,
+            "shoulder_lift": -1,
+            "elbow_flex": 1,
+            "wrist_flex": 1,
+            "wrist_yaw": 1,
+            "wrist_roll": -1,
+            "gripper": -6,
+        }
+    )
+
+    # Per-joint [min, max] output range in degrees; emitted actions are clamped to it. Matches
+    # the reBot B601 follower joint limits so leader actions can drive the follower key-for-key.
+    joint_ranges: dict[str, list[int]] = field(
+        default_factory=lambda: {
+            "shoulder_pan": [-150, 150],
+            "shoulder_lift": [-170, 1],
+            "elbow_flex": [-200, 1],
+            "wrist_flex": [-80, 90],
+            "wrist_yaw": [-90, 90],
+            "wrist_roll": [-90, 90],
+            "gripper": [-270, 0],
+        }
+    )
+
+
+@TeleoperatorConfig.register_subclass("rebot_102_leader")
+@dataclass
+class RebotArm102LeaderTeleopConfig(TeleoperatorConfig, RebotArm102LeaderConfig):
+    """Teleoperator config registered as ``rebot_102_leader``; adds no fields of its own."""
+
+    pass
diff --git a/src/lerobot/teleoperators/rebot_102_leader/rebot_102_leader.py b/src/lerobot/teleoperators/rebot_102_leader/rebot_102_leader.py
new file mode 100644
index 000000000..f9f10ed69
--- /dev/null
+++ b/src/lerobot/teleoperators/rebot_102_leader/rebot_102_leader.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import time
+from typing import TYPE_CHECKING
+
+from lerobot.motors import MotorCalibration
+from lerobot.types import RobotAction
+from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
+from lerobot.utils.import_utils import _motorbridge_smart_servo_available, require_package
+
+from ..teleoperator import Teleoperator
+from .config_rebot_102_leader import RebotArm102LeaderTeleopConfig
+
+if TYPE_CHECKING or _motorbridge_smart_servo_available:
+ from motorbridge_smart_servo import FashionStarServo, ServoMonitor
+else:
+ FashionStarServo = None
+ ServoMonitor = None
+
+logger = logging.getLogger(__name__)
+
+_SETTLE_SEC = 0.01
+
+
+class RebotArm102Leader(Teleoperator):
+    """Seeed Studio StarArm102 / reBot Arm 102 leader arm.
+
+    A 7-joint (incl. gripper) leader built on FashionStar UART smart servos. Servo
+    communication is handled by the ``motorbridge-smart-servo`` package; this class
+    only reads joint angles, so it produces actions but accepts no feedback.
+    """
+
+    config_class = RebotArm102LeaderTeleopConfig
+    name = "rebot_102_leader"
+
+    def __init__(self, config: RebotArm102LeaderTeleopConfig):
+        require_package("motorbridge-smart-servo", extra="rebot", import_name="motorbridge_smart_servo")
+        super().__init__(config)
+        self.config = config
+        self.bus: FashionStarServo | None = None
+        self.motor_names = list(config.joint_ids.keys())
+        self._last_raw_positions: dict[str, float] = {}
+
+    @property
+    def action_features(self) -> dict[str, type]:
+        return {f"{motor}.pos": float for motor in self.motor_names}
+
+    @property
+    def feedback_features(self) -> dict[str, type]:
+        return {}
+
+    @property
+    def is_connected(self) -> bool:
+        return self.bus is not None
+
+    @check_if_already_connected
+    def connect(self, calibrate: bool = True) -> None:
+        logger.info(f"Connecting {self} on {self.config.port}...")
+        bus = FashionStarServo(self.config.port, baudrate=self.config.baudrate)
+        try:
+            for motor_name, motor_id in self.config.joint_ids.items():
+                if not bus.ping(motor_id):
+                    raise RuntimeError(f"Servo not found for {motor_name} (id={motor_id}).")
+                self._last_raw_positions[motor_name] = 0.0
+            self.bus = bus
+
+            if not self.is_calibrated and calibrate:
+                logger.info(
+                    "Mismatch between calibration values in the motor and the calibration file or no calibration file found"
+                )
+                self.calibrate()
+
+            self.configure()
+        except Exception:
+            bus.close()
+            self.bus = None
+            raise
+
+        logger.info(f"{self} connected.")
+
+    @property
+    def is_calibrated(self) -> bool:
+        return bool(self.calibration) and set(self.calibration) == set(self.motor_names)
+
+    def calibrate(self) -> None:
+        if self.calibration:
+            user_input = input(
+                f"Press ENTER to use provided calibration file associated with the id {self.id}, "
+                "or type 'c' and press ENTER to run calibration: "
+            )
+            if user_input.strip().lower() != "c":
+                logger.info(f"Using calibration file associated with the id {self.id}")
+                return
+
+        logger.info(f"\nRunning calibration of {self}")
+        input(
+            "\nCalibration: set zero position.\n"
+            "Manually move the reBot Arm 102 to its zero pose and close the gripper.\n"
+            "Press ENTER when ready..."
+        )
+
+        self.calibration = {}
+        for motor_name, motor_id in self.config.joint_ids.items():
+            self.bus.unlock(motor_id)
+            time.sleep(_SETTLE_SEC)
+            self.bus.set_origin_point(motor_id)
+            range_min, range_max = self.config.joint_ranges[motor_name]
+            self.calibration[motor_name] = MotorCalibration(
+                id=motor_id,
+                drive_mode=0,
+                homing_offset=0,
+                range_min=int(range_min),
+                range_max=int(range_max),
+            )
+
+        self._save_calibration()
+        logger.info(f"Calibration saved to {self.calibration_fpath}")
+
+    def configure(self) -> None:
+        for motor_id in self.config.joint_ids.values():
+            self.bus.unlock(motor_id)
+            time.sleep(_SETTLE_SEC)
+        # Reset the multi-turn counter of each servo individually.
+        for motor_id in self.config.joint_ids.values():
+            self.bus.reset_multi_turn(motor_id)
+
+    def _read_raw_positions(self) -> dict[str, float]:
+        result: dict[int, ServoMonitor | None] = self.bus.sync_monitor(list(self.config.joint_ids.values()))
+        id_to_name = {v: k for k, v in self.config.joint_ids.items()}
+        raw_positions: dict[str, float] = {}
+        for motor_id, monitor in result.items():
+            motor_name = id_to_name[motor_id]
+            if monitor is None:
+                raise RuntimeError(f"Servo {motor_name} (id={motor_id}) has never responded.")
+            raw_positions[motor_name] = monitor.angle_deg
+        return raw_positions
+
+    @staticmethod
+    def _round_to_valid_range(value: float, min_value: float, max_value: float) -> tuple[float, int]:
+        """Unwrap a multi-turn angle into the ±180° window centred on (min+max)/2.
+
+        The servo may report an angle that has accumulated extra full rotations
+        (value = true_angle + N*360). Subtract the nearest whole number of turns
+        to bring it back into [center-180, center+180]. Returns the unwrapped
+        angle and the number of turns removed.
+        """
+        center = (min_value + max_value) / 2.0
+        turns = round((value - center) / 360.0)
+        return value - turns * 360.0, abs(turns)
+
+    @check_if_not_connected
+    def get_action(self) -> RobotAction:
+        start = time.perf_counter()
+        try:
+            raw_positions = self._read_raw_positions()
+            self._last_raw_positions = raw_positions
+        except Exception as e:
+            logger.error(f"Failed to read raw positions: {e}")
+            logger.warning("[EMERGENCY STOP] Hold the follower arm and cut off the main power to the arms.")
+            logger.warning(
+                "[EMERGENCY STOP] Break the teleoperation session and check the leader USB connection or power."
+            )
+            raw_positions = self._last_raw_positions
+
+        action_dict: dict[str, float] = {}
+        for motor_name in self.motor_names:
+            range_min, range_max = self.config.joint_ranges[motor_name]
+            direction = self.config.joint_directions[motor_name]
+            # Unwrap in raw servo space: divide by the full multiplier so scaled joints stay centred.
+            scale = float(direction) if direction else 1.0
+            raw = raw_positions[motor_name]
+            unwrapped, k = self._round_to_valid_range(raw, range_min / scale, range_max / scale)
+            position = unwrapped * direction
+            if k > 0:
+                logger.debug(
+                    f"Servo {motor_name} (id={self.config.joint_ids[motor_name]}) wrapped {k} * 360°. "
+                    f"Unwrapped pos: {unwrapped:.1f}° (raw: {raw_positions[motor_name]:.1f}°)"
+                )
+            action_dict[f"{motor_name}.pos"] = max(float(range_min), min(float(range_max), position))
+
+        dt_ms = (time.perf_counter() - start) * 1e3
+        logger.debug(f"{self} read action: {dt_ms:.1f}ms")
+        return action_dict
+
+    def send_feedback(self, feedback: dict[str, float]) -> None:
+        raise NotImplementedError("Feedback is not implemented for the reBot Arm 102 leader.")
+
+    @check_if_not_connected
+    def disconnect(self) -> None:
+        self.bus.close()
+        self.bus = None
+        logger.info(f"{self} disconnected.")
diff --git a/src/lerobot/teleoperators/utils.py b/src/lerobot/teleoperators/utils.py
index db685f396..5a6d4ecde 100644
--- a/src/lerobot/teleoperators/utils.py
+++ b/src/lerobot/teleoperators/utils.py
@@ -99,6 +99,14 @@ def make_teleoperator_from_config(config: TeleoperatorConfig) -> "Teleoperator":
from .openarm_mini import OpenArmMini
return OpenArmMini(config)
+ elif config.type == "rebot_102_leader":
+ from .rebot_102_leader import RebotArm102Leader
+
+ return RebotArm102Leader(config)
+ elif config.type == "bi_rebot_102_leader":
+ from .bi_rebot_102_leader import BiRebotArm102Leader
+
+ return BiRebotArm102Leader(config)
else:
try:
return cast("Teleoperator", make_device_from_device_class(config))
diff --git a/src/lerobot/templates/lerobot_modelcard_template.md b/src/lerobot/templates/lerobot_modelcard_template.md
index f0dd0da07..b93e83b6e 100644
--- a/src/lerobot/templates/lerobot_modelcard_template.md
+++ b/src/lerobot/templates/lerobot_modelcard_template.md
@@ -41,8 +41,6 @@ For more details, see the [Physical Intelligence π₀ blog post](https://www.ph
For more details, see the [Physical Intelligence π₀.₅ blog post](https://www.physicalintelligence.company/blog/pi05).
{% elif model_name == "gaussian_actor" %}
This is a Gaussian Actor policy (Gaussian policy with a tanh squash) — the policy-side component used by [Soft Actor-Critic (SAC)](https://huggingface.co/papers/1801.01290) and related maximum-entropy continuous-control algorithms.
-{% elif model_name == "reward_classifier" %}
-A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
{% else %}
_Model type not recognized — please update this template._
{% endif %}
diff --git a/src/lerobot/transport/utils.py b/src/lerobot/transport/utils.py
index 8da338044..2ef63c2cc 100644
--- a/src/lerobot/transport/utils.py
+++ b/src/lerobot/transport/utils.py
@@ -25,9 +25,10 @@ from typing import Any
import torch
-from lerobot.transport import services_pb2
from lerobot.utils.transition import Transition
+from . import services_pb2
+
# FIX for protobuf: Assign the enum to a variable and ignore the type error once
TransferState = services_pb2.TransferState # type: ignore[attr-defined]
diff --git a/src/lerobot/utils/collate.py b/src/lerobot/utils/collate.py
new file mode 100644
index 000000000..fce7e6b42
--- /dev/null
+++ b/src/lerobot/utils/collate.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Any
+
+from torch.utils.data._utils.collate import default_collate
+
+from lerobot.datasets.language import LANGUAGE_COLUMNS
+
+_PYTHON_LIST_KEYS = {"messages", "message_streams", "target_message_indices"}
+
+
+def lerobot_collate_fn(batch: list[dict[str, Any] | None]) -> dict[str, Any] | None:
+    """Collate samples while keeping rendered-message fields as plain Python lists.
+
+    Drops ``None`` samples (e.g. recipes that yielded no target message; ``None`` is
+    returned if none remain), keeps ``_PYTHON_LIST_KEYS`` as per-sample lists, leaves raw
+    ``LANGUAGE_COLUMNS`` out of the result, and delegates the rest to ``default_collate``.
+    """
+    batch = [sample for sample in batch if sample is not None]
+    if not batch:
+        return None
+
+    # All-or-nothing per key: a partial-presence batch (e.g. half the samples
+    # carry `messages` and half don't) is a real bug in the upstream
+    # rendering step — silently filtering would hand downstream consumers a
+    # preserved list shorter than the tensor batch. Raise instead so the
+    # mismatch surfaces at the boundary.
+    preserved: dict[str, list[Any]] = {}
+    for key in sorted(_PYTHON_LIST_KEYS):
+        presence = [key in sample for sample in batch]
+        if not any(presence):
+            continue
+        if not all(presence):
+            raise ValueError(
+                f"Inconsistent batch: {sum(presence)}/{len(batch)} samples carry {key!r}; "
+                f"every sample in a batch must agree."
+            )
+        preserved[key] = [sample[key] for sample in batch]
+    tensorizable = [
+        {
+            key: value
+            for key, value in sample.items()
+            if key not in _PYTHON_LIST_KEYS and key not in LANGUAGE_COLUMNS
+        }
+        for sample in batch
+    ]
+    collated = default_collate(tensorizable)
+    collated.update(preserved)
+    return collated
diff --git a/src/lerobot/utils/import_utils.py b/src/lerobot/utils/import_utils.py
index 6ba912bf5..5dbce2c5b 100644
--- a/src/lerobot/utils/import_utils.py
+++ b/src/lerobot/utils/import_utils.py
@@ -69,7 +69,7 @@ def is_package_available(
return package_exists
-def get_safe_default_codec():
+def get_safe_default_video_backend():
logger = logging.getLogger(__name__)
if importlib.util.find_spec("torchcodec"):
return "torchcodec"
@@ -114,6 +114,10 @@ _dynamixel_sdk_available = is_package_available("dynamixel-sdk", import_name="dy
_feetech_sdk_available = is_package_available("feetech-servo-sdk", import_name="scservo_sdk")
_reachy2_sdk_available = is_package_available("reachy2_sdk")
_can_available = is_package_available("python-can", "can")
+_motorbridge_available = is_package_available("motorbridge")
+_motorbridge_smart_servo_available = is_package_available(
+ "motorbridge-smart-servo", import_name="motorbridge_smart_servo"
+)
_unitree_sdk_available = is_package_available("unitree-sdk2py", "unitree_sdk2py")
_pyrealsense2_available = is_package_available("pyrealsense2") or is_package_available(
"pyrealsense2-macosx", import_name="pyrealsense2"
@@ -128,6 +132,9 @@ _hidapi_available = is_package_available("hidapi", import_name="hid")
_pandas_available = is_package_available("pandas")
_faker_available = is_package_available("faker")
+# Video encoding / decoding
+_av_available = is_package_available("av")
+
# Misc
_pynput_available = is_package_available("pynput")
_pygame_available = is_package_available("pygame")
diff --git a/src/lerobot/utils/utils.py b/src/lerobot/utils/utils.py
index 2574f1fa3..6aad0c503 100644
--- a/src/lerobot/utils/utils.py
+++ b/src/lerobot/utils/utils.py
@@ -160,6 +160,25 @@ def has_method(cls: object, method_name: str) -> bool:
return hasattr(cls, method_name) and callable(getattr(cls, method_name))
+def unwrap_scalar(value: Any) -> Any:
+    """Reduce a tensor, numpy scalar, or nested one-element list to a Python scalar.
+
+    Anything exposing ``.item()`` is converted through it, one-element lists are
+    peeled layer by layer, and every other value is handed back untouched. Kept
+    here so the language renderer and processor steps share one definition.
+
+    Raises:
+        ValueError: If ``value`` is a list with zero or multiple elements.
+    """
+    current = value
+    # Peel list layers iteratively; an object with ``.item()`` always takes priority.
+    while isinstance(current, list) and not hasattr(current, "item"):
+        if len(current) != 1:
+            raise ValueError(f"Expected a scalar, got list of length {len(current)}: {current!r}")
+        current = current[0]
+    return current.item() if hasattr(current, "item") else current
+
+
def is_valid_numpy_dtype_string(dtype_str: str) -> bool:
"""
Return True if a given string can be converted to a numpy dtype.
diff --git a/tests/artifacts/encoded_videos/clip_32x48.mp4 b/tests/artifacts/encoded_videos/clip_32x48.mp4
new file mode 100644
index 000000000..086c399d3
--- /dev/null
+++ b/tests/artifacts/encoded_videos/clip_32x48.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2191cd86e9e32ecbe18e33ad68d49060e479723ab5a3212bbb26df3025ccb568
+size 5815
diff --git a/tests/artifacts/encoded_videos/clip_4frames.mp4 b/tests/artifacts/encoded_videos/clip_4frames.mp4
new file mode 100644
index 000000000..487c3c8ad
--- /dev/null
+++ b/tests/artifacts/encoded_videos/clip_4frames.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0ebf563ba3ed9c24b691a0f0b29e0294a1fa9b51422e1ece296155f1465768
+size 16236
diff --git a/tests/artifacts/encoded_videos/clip_5frames.mp4 b/tests/artifacts/encoded_videos/clip_5frames.mp4
new file mode 100644
index 000000000..cbbe81c39
--- /dev/null
+++ b/tests/artifacts/encoded_videos/clip_5frames.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8475bfd5e6c4c780df46200e2b027e262b38436c57d01078bd943a5b87c65b8f
+size 20726
diff --git a/tests/artifacts/encoded_videos/clip_6frames.mp4 b/tests/artifacts/encoded_videos/clip_6frames.mp4
new file mode 100644
index 000000000..50d9badca
--- /dev/null
+++ b/tests/artifacts/encoded_videos/clip_6frames.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6434322d1c671a7d132367619f841a775317cb9ff973f3f4505831e3ed74076d
+size 23808
diff --git a/tests/artifacts/encoded_videos/clip_h264.mp4 b/tests/artifacts/encoded_videos/clip_h264.mp4
new file mode 100644
index 000000000..90698dcf5
--- /dev/null
+++ b/tests/artifacts/encoded_videos/clip_h264.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8efc84375e92a3499cef93100e04d8fb354670f3d9e0db2097b52575927284fc
+size 12237
diff --git a/tests/configs/test_recipe.py b/tests/configs/test_recipe.py
new file mode 100644
index 000000000..b4954efbf
--- /dev/null
+++ b/tests/configs/test_recipe.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+
+from pathlib import Path
+from textwrap import dedent
+
+import pytest
+
+from lerobot.configs.recipe import MessageTurn, TrainingRecipe, load_recipe
+
+
+def _minimal_message_turn(content: str = "${task}") -> MessageTurn:
+ return MessageTurn(role="user", content=content, stream="high_level")
+
+
+def _minimal_target_turn() -> MessageTurn:
+ return MessageTurn(role="assistant", content="ok", stream="high_level", target=True)
+
+
+# ── Message-recipe validation ────────────────────────────────────────
+
+
+def test_message_recipe_validates_unknown_binding():
+ with pytest.raises(ValueError, match="unknown binding"):
+ TrainingRecipe(
+ messages=[
+ MessageTurn(role="user", content="${missing}", stream="high_level"),
+ _minimal_target_turn(),
+ ]
+ )
+
+
+def test_message_turn_requires_a_stream():
+ """Every turn must declare a stream — None is rejected at construction.
+
+ Previously this only failed at render time (``_validate_rendered``);
+ catching it here means a malformed recipe YAML errors at load instead
+ of at the first training sample.
+ """
+ with pytest.raises(ValueError, match="missing a stream"):
+ MessageTurn(role="user", content="${task}")
+
+
+def test_message_recipe_requires_at_least_one_target():
+ with pytest.raises(ValueError, match="target"):
+ TrainingRecipe(
+ messages=[
+ _minimal_message_turn(),
+ MessageTurn(role="assistant", content="no target", stream="high_level"),
+ ]
+ )
+
+
+def test_recipe_rejects_both_messages_and_blend():
+ with pytest.raises(ValueError, match="only one"):
+ TrainingRecipe(
+ messages=[_minimal_message_turn(), _minimal_target_turn()],
+ blend={"a": TrainingRecipe(weight=1.0, messages=[_minimal_target_turn()])},
+ )
+
+
+def test_recipe_rejects_neither_messages_nor_blend():
+ with pytest.raises(ValueError, match="must set one"):
+ TrainingRecipe()
+
+
+# ── Blend validation ─────────────────────────────────────────────────
+
+
+def test_blend_must_be_non_empty():
+ with pytest.raises(ValueError, match="at least one component"):
+ TrainingRecipe(blend={})
+
+
+def test_blend_component_must_define_weight():
+ with pytest.raises(ValueError, match="weight"):
+ TrainingRecipe(blend={"a": TrainingRecipe(messages=[_minimal_target_turn()])})
+
+
+def test_blend_component_weight_must_be_positive():
+ with pytest.raises(ValueError, match="positive weight"):
+ TrainingRecipe(blend={"a": TrainingRecipe(weight=0.0, messages=[_minimal_target_turn()])})
+
+
+def test_blend_component_must_define_messages():
+ # A bare TrainingRecipe(weight=1.0) would itself raise; build it without
+ # going through __post_init__ to exercise the blend-level validator.
+ bad = TrainingRecipe.__new__(TrainingRecipe)
+ bad.messages = None
+ bad.bindings = None
+ bad.blend = None
+ bad.weight = 1.0
+ with pytest.raises(ValueError, match="must define messages"):
+ TrainingRecipe(blend={"a": bad})
+
+
+def test_blend_components_cannot_themselves_define_a_blend():
+ inner = TrainingRecipe(blend={"x": TrainingRecipe(weight=1.0, messages=[_minimal_target_turn()])})
+ # Force-bypass the inner component's normal validation so the test
+ # exercises the outer blend's "no nested blends" rule directly.
+ nested = TrainingRecipe.__new__(TrainingRecipe)
+ nested.messages = None
+ nested.bindings = None
+ nested.blend = inner.blend
+ nested.weight = 1.0
+ with pytest.raises(ValueError, match="cannot itself define a blend"):
+ TrainingRecipe(blend={"outer": nested})
+
+
+# ── from_dict / from_yaml round-trips ────────────────────────────────
+
+
+def test_from_dict_with_nested_blend():
+ recipe = TrainingRecipe.from_dict(
+ {
+ "blend": {
+ "a": {
+ "weight": 1.0,
+ "messages": [
+ {"role": "user", "content": "${task}", "stream": "high_level"},
+ {"role": "assistant", "content": "a", "stream": "high_level", "target": True},
+ ],
+ },
+ "b": {
+ "weight": 2.0,
+ "messages": [
+ {"role": "user", "content": "${task}", "stream": "high_level"},
+ {"role": "assistant", "content": "b", "stream": "high_level", "target": True},
+ ],
+ },
+ }
+ }
+ )
+ assert recipe.blend is not None
+ assert set(recipe.blend) == {"a", "b"}
+ assert recipe.blend["b"].weight == 2.0
+ # Inner messages were promoted to MessageTurn instances.
+ assert isinstance(recipe.blend["a"].messages[0], MessageTurn)
+
+
+def test_from_yaml_round_trips_through_load_recipe(tmp_path: Path):
+ yaml_text = dedent(
+ """
+ bindings:
+ custom: "active_at(t, style=subtask)"
+ messages:
+ - {role: user, content: "${task}: ${custom}", stream: high_level}
+ - {role: assistant, content: "ok", stream: high_level, target: true}
+ """
+ ).strip()
+ path = tmp_path / "recipe.yaml"
+ path.write_text(yaml_text)
+
+ via_classmethod = TrainingRecipe.from_yaml(path)
+ via_helper = load_recipe(path)
+
+ assert via_classmethod.bindings == {"custom": "active_at(t, style=subtask)"}
+ assert via_classmethod.messages[1].target is True
+ # ``load_recipe`` is just a wrapper, but assert the two paths agree
+ # on the structural result so a future divergence is caught here.
+ assert via_helper.bindings == via_classmethod.bindings
+ assert len(via_helper.messages) == len(via_classmethod.messages)
+
+
+def test_from_yaml_rejects_non_mapping(tmp_path: Path):
+ path = tmp_path / "bad.yaml"
+ path.write_text("- just\n- a\n- list\n")
+ with pytest.raises(ValueError, match="mapping at the top level"):
+ TrainingRecipe.from_yaml(path)
diff --git a/tests/datasets/test_aggregate.py b/tests/datasets/test_aggregate.py
index 6d646d4f7..80a95aa1f 100644
--- a/tests/datasets/test_aggregate.py
+++ b/tests/datasets/test_aggregate.py
@@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import json
+import logging
from unittest.mock import patch
import pytest
@@ -23,7 +25,9 @@ pytest.importorskip("datasets", reason="datasets is required (install lerobot[da
import datasets # noqa: E402
import torch
+from lerobot.configs import VIDEO_ENCODER_INFO_KEYS
from lerobot.datasets.aggregate import aggregate_datasets
+from lerobot.datasets.feature_utils import features_equal_for_merge
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from tests.fixtures.constants import DUMMY_REPO_ID
@@ -117,8 +121,9 @@ def assert_metadata_consistency(aggr_ds, ds_0, ds_1):
"Robot type should be the same"
)
- # Test features are the same
- assert aggr_ds.features == ds_0.features == ds_1.features, "Features should be the same"
+ # Schema matches; merged video ``info`` is reconciled separately from per-source ``info``.
+ assert features_equal_for_merge(aggr_ds.features, ds_0.features)
+ assert features_equal_for_merge(aggr_ds.features, ds_1.features)
# Test tasks aggregation
expected_tasks = set(ds_0.meta.tasks.index) | set(ds_1.meta.tasks.index)
@@ -284,6 +289,73 @@ def test_aggregate_datasets(tmp_path, lerobot_dataset_factory):
assert_dataset_iteration_works(aggr_ds)
+@pytest.mark.parametrize("mutation", ["mismatched_value", "missing_key"])
+def test_aggregate_incomplete_video_encoder_info_warns_and_nuls_encoders(
+    tmp_path, lerobot_dataset_factory, caplog, mutation
+):
+    """Conflicting or absent encoder ``info`` keys fall back to neutral values and log a warning."""
+    tag = "enc_mismatch" if mutation == "mismatched_value" else "enc_missing"
+    ds_0 = lerobot_dataset_factory(
+        root=tmp_path / f"{tag}_a",
+        repo_id=f"{DUMMY_REPO_ID}_{tag}_a",
+        total_episodes=2,
+        total_frames=20,
+    )
+    ds_1 = lerobot_dataset_factory(
+        root=tmp_path / f"{tag}_b",
+        repo_id=f"{DUMMY_REPO_ID}_{tag}_b",
+        total_episodes=2,
+        total_frames=20,
+    )
+
+    info_file = ds_1.root / "meta" / "info.json"
+    raw_info = json.loads(info_file.read_text())
+    for feature in raw_info["features"].values():
+        if feature.get("dtype") != "video":
+            continue
+        encoder_info = feature.setdefault("info", {})
+        if mutation != "mismatched_value":
+            encoder_info.pop("video.crf", None)
+            encoder_info.pop("video.extra_options", None)
+        else:
+            encoder_info["video.crf"] = 99
+            encoder_info["video.extra_options"] = {"tune": "film"}
+    info_file.write_text(json.dumps(raw_info))
+
+    merged_id = f"{DUMMY_REPO_ID}_{tag}_aggr"
+    merged_root = tmp_path / f"{tag}_aggr"
+    with caplog.at_level(logging.WARNING):
+        aggregate_datasets(
+            repo_ids=[ds_0.repo_id, ds_1.repo_id],
+            roots=[ds_0.root, ds_1.root],
+            aggr_repo_id=merged_id,
+            aggr_root=merged_root,
+        )
+
+    assert any(word in caplog.text.lower() for word in ("heterogeneous", "incomplete"))
+
+    with (
+        patch("lerobot.datasets.dataset_metadata.get_safe_version") as mock_get_safe_version,
+        patch("lerobot.datasets.dataset_metadata.snapshot_download") as mock_snapshot_download,
+    ):
+        mock_get_safe_version.return_value = "v3.0"
+        mock_snapshot_download.return_value = str(merged_root)
+        aggr_ds = LeRobotDataset(merged_id, root=merged_root)
+
+    for key, feature in aggr_ds.meta.info.features.items():
+        if feature.get("dtype") != "video":
+            continue
+        merged = feature["info"]
+        reference = ds_0.meta.info.features[key]["info"]
+        for info_key in VIDEO_ENCODER_INFO_KEYS:
+            if info_key == "video.crf":
+                assert merged[info_key] is None
+            elif info_key == "video.extra_options":
+                assert merged[info_key] == {}
+            else:
+                assert merged[info_key] == reference[info_key]
+
+
def test_aggregate_with_low_threshold(tmp_path, lerobot_dataset_factory):
"""Test aggregation with small file size limits to force file rotation/sharding."""
ds_0_num_episodes = ds_1_num_episodes = 10
diff --git a/tests/datasets/test_dataset_metadata.py b/tests/datasets/test_dataset_metadata.py
index 6c784c90b..171d8af8b 100644
--- a/tests/datasets/test_dataset_metadata.py
+++ b/tests/datasets/test_dataset_metadata.py
@@ -385,3 +385,140 @@ def test_finalize_flushes_buffered_metadata(tmp_path):
assert episodes_dir.exists()
parquet_files = list(episodes_dir.rglob("*.parquet"))
assert len(parquet_files) > 0
+
+
+# ── Tools accessor ───────────────────────────────────────────────────
+
+
+def test_tools_falls_back_to_default_when_info_has_no_tools_field(tmp_path):
+    """Without a ``tools`` entry in info.json, ``meta.tools`` yields ``DEFAULT_TOOLS``."""
+    from lerobot.datasets.language import DEFAULT_TOOLS
+
+    dataset_root = tmp_path / "no_tools"
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/no_tools",
+        fps=DEFAULT_FPS,
+        features=SIMPLE_FEATURES,
+        root=dataset_root,
+        use_videos=False,
+    )
+
+    assert meta.tools == DEFAULT_TOOLS
+    # A freshly created dataset must not have a `tools` key written to info.json.
+    with open(dataset_root / INFO_PATH) as info_file:
+        persisted = json.load(info_file)
+    assert "tools" not in persisted
+
+
+def test_tools_reads_declared_tools_from_info_json(tmp_path):
+    """Tools declared in info.json are returned by ``meta.tools`` after a reload.
+
+    Guards against a regression in which ``DatasetInfo.from_dict`` had no
+    dataclass field for ``tools`` and silently discarded the key, so
+    ``meta.tools`` reported ``DEFAULT_TOOLS`` no matter what was
+    stored on disk.
+    """
+    from lerobot.datasets.io_utils import load_info
+
+    dataset_root = tmp_path / "with_tools"
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/with_tools",
+        fps=DEFAULT_FPS,
+        features=SIMPLE_FEATURES,
+        root=dataset_root,
+        use_videos=False,
+    )
+
+    record_tool = {
+        "type": "function",
+        "function": {
+            "name": "record_observation",
+            "description": "Capture a still image.",
+            "parameters": {
+                "type": "object",
+                "properties": {"label": {"type": "string"}},
+                "required": ["label"],
+            },
+        },
+    }
+    info_file_path = dataset_root / INFO_PATH
+    with open(info_file_path) as reader:
+        info_dict = json.load(reader)
+    info_dict["tools"] = [record_tool]
+    with open(info_file_path, "w") as writer:
+        json.dump(info_dict, writer)
+
+    # Re-read info.json so the metadata object sees the edited file.
+    meta.info = load_info(dataset_root)
+    assert meta.tools == [record_tool]
+
+
+def test_tools_round_trip_through_dataset_info():
+    """A `tools` list survives ``DatasetInfo.from_dict`` / ``to_dict``; no filesystem fixture is needed."""
+    from lerobot.datasets.utils import DatasetInfo
+
+    raw = {
+        "codebase_version": "v3.1",
+        "fps": 30,
+        "features": SIMPLE_FEATURES,
+        "tools": [{"type": "function", "function": {"name": "say"}}],
+    }
+    info = DatasetInfo.from_dict(raw)
+    assert info.tools == raw["tools"]
+    assert info.to_dict()["tools"] == raw["tools"]
+
+
+def test_tools_setter_persists_to_info_json_and_reloads(tmp_path):
+    """Setting ``meta.tools`` updates both the in-memory info and info.json on disk."""
+    from lerobot.datasets.io_utils import load_info
+
+    dataset_root = tmp_path / "set_tools"
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/set_tools",
+        fps=DEFAULT_FPS,
+        features=SIMPLE_FEATURES,
+        root=dataset_root,
+        use_videos=False,
+    )
+
+    record_tool = {
+        "type": "function",
+        "function": {
+            "name": "record_observation",
+            "description": "Capture a still image.",
+            "parameters": {
+                "type": "object",
+                "properties": {"label": {"type": "string"}},
+                "required": ["label"],
+            },
+        },
+    }
+    meta.tools = [record_tool]
+
+    # The accessor and the underlying info object both expose the assigned list ...
+    assert meta.tools == [record_tool]
+    assert meta.info.tools == [record_tool]
+    # ... and so does the info loaded again from disk.
+    assert load_info(dataset_root).tools == [record_tool]
+
+
+def test_tools_setter_clears_key_when_set_to_none(tmp_path):
+    """Assigning ``None`` to ``meta.tools`` removes the key from info.json and brings back the default."""
+    from lerobot.datasets.language import DEFAULT_TOOLS
+
+    dataset_root = tmp_path / "clear_tools"
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/clear_tools",
+        fps=DEFAULT_FPS,
+        features=SIMPLE_FEATURES,
+        root=dataset_root,
+        use_videos=False,
+    )
+
+    meta.tools = [{"type": "function", "function": {"name": "say"}}]
+    meta.tools = None
+
+    assert meta.tools == DEFAULT_TOOLS
+    with open(dataset_root / INFO_PATH) as info_file:
+        persisted = json.load(info_file)
+    assert "tools" not in persisted
diff --git a/tests/datasets/test_dataset_reader.py b/tests/datasets/test_dataset_reader.py
index bbe858b5d..085563bb8 100644
--- a/tests/datasets/test_dataset_reader.py
+++ b/tests/datasets/test_dataset_reader.py
@@ -20,7 +20,7 @@ import pytest
pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
from lerobot.datasets.dataset_reader import DatasetReader
-from lerobot.utils.import_utils import get_safe_default_codec
+from lerobot.utils.import_utils import get_safe_default_video_backend
# ── Loading ──────────────────────────────────────────────────────────
@@ -35,7 +35,7 @@ def test_try_load_returns_true_when_data_exists(tmp_path, lerobot_dataset_factor
root=dataset.root,
episodes=None,
tolerance_s=1e-4,
- video_backend=get_safe_default_codec(),
+ video_backend=get_safe_default_video_backend(),
delta_timestamps=None,
image_transforms=None,
)
@@ -58,7 +58,7 @@ def test_try_load_returns_false_when_no_data(tmp_path):
root=meta.root,
episodes=None,
tolerance_s=1e-4,
- video_backend=get_safe_default_codec(),
+ video_backend=get_safe_default_video_backend(),
delta_timestamps=None,
image_transforms=None,
)
diff --git a/tests/datasets/test_dataset_tools.py b/tests/datasets/test_dataset_tools.py
index 0b0862f00..d36312920 100644
--- a/tests/datasets/test_dataset_tools.py
+++ b/tests/datasets/test_dataset_tools.py
@@ -23,16 +23,21 @@ import torch
pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+
+from lerobot.configs import VideoEncoderConfig
from lerobot.datasets.dataset_tools import (
add_features,
+ convert_image_to_video_dataset,
delete_episodes,
merge_datasets,
modify_features,
modify_tasks,
+ reencode_dataset,
remove_feature,
split_dataset,
)
-from lerobot.scripts.lerobot_edit_dataset import convert_image_to_video_dataset
+from lerobot.datasets.io_utils import load_info
+from tests.datasets.test_video_encoding import _add_frames, require_h264, require_libsvtav1
@pytest.fixture
@@ -1246,10 +1251,12 @@ def test_convert_image_to_video_dataset(tmp_path):
dataset=source_dataset,
output_dir=output_dir,
repo_id="lerobot/pusht_video",
- vcodec="libsvtav1",
- pix_fmt="yuv420p",
- g=2,
- crf=30,
+ camera_encoder=VideoEncoderConfig(
+ vcodec="libsvtav1",
+ pix_fmt="yuv420p",
+ g=2,
+ crf=30,
+ ),
episode_indices=[0, 1],
num_workers=2,
)
@@ -1323,3 +1330,41 @@ def test_convert_image_to_video_dataset_subset_episodes(tmp_path):
if output_dir.exists():
shutil.rmtree(output_dir)
+
+
+# ─── reencode_dataset ─────────────────────────────────────────────────
+
+
+@require_libsvtav1
+@require_h264
+def test_reencode_dataset_multi_key_multiprocessing(
+    tmp_path, empty_lerobot_dataset_factory, features_factory
+):
+    """Re-encoding a two-camera dataset with two workers must update the persisted encoder info."""
+    source_cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
+    dataset = empty_lerobot_dataset_factory(
+        root=tmp_path / "ds",
+        features=features_factory(use_videos=True),
+        use_videos=True,
+        camera_encoder=source_cfg,
+    )
+
+    # Record two 4-frame episodes, then close the dataset.
+    for _ in range(2):
+        _add_frames(dataset, num_frames=4)
+        dataset.save_episode()
+    dataset.finalize()
+
+    assert len(dataset.meta.video_keys) == 2
+
+    target_cfg = VideoEncoderConfig(vcodec="h264", g=6, crf=23, pix_fmt="yuv420p")
+
+    reencoded = reencode_dataset(dataset, camera_encoder=target_cfg, num_workers=2)
+
+    # The call is expected to hand back the very same dataset object.
+    assert reencoded is dataset
+
+    info_on_disk = load_info(dataset.root)
+    for video_key in dataset.meta.video_keys:
+        video_info = info_on_disk.features[video_key].get("info", {})
+        assert VideoEncoderConfig.from_video_info(video_info) == target_cfg
diff --git a/tests/datasets/test_dataset_writer.py b/tests/datasets/test_dataset_writer.py
index 8d2bc0373..8670aeebc 100644
--- a/tests/datasets/test_dataset_writer.py
+++ b/tests/datasets/test_dataset_writer.py
@@ -25,6 +25,7 @@ from PIL import Image
pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+from lerobot.configs import VideoEncoderConfig
from lerobot.datasets.dataset_writer import _encode_video_worker
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.utils import DEFAULT_IMAGE_PATH
@@ -52,8 +53,8 @@ def _make_frame(features: dict, task: str = "Dummy task") -> dict:
# ── Existing encode_video_worker tests ───────────────────────────────
-def test_encode_video_worker_forwards_vcodec(tmp_path):
- """_encode_video_worker correctly forwards the vcodec parameter."""
+def test_encode_video_worker_forwards_camera_encoder(tmp_path):
+ """_encode_video_worker forwards camera_encoder to encode_video_frames."""
video_key = "observation.images.laptop"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
img_dir = tmp_path / Path(fpath).parent
@@ -68,13 +69,21 @@ def test_encode_video_worker_forwards_vcodec(tmp_path):
Path(video_path).touch()
with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode):
- _encode_video_worker(video_key, 0, tmp_path, fps=30, vcodec="h264")
+ _encode_video_worker(
+ video_key,
+ 0,
+ tmp_path,
+ fps=30,
+ camera_encoder=VideoEncoderConfig(vcodec="h264", preset=None),
+ encoder_threads=4,
+ )
- assert captured_kwargs["vcodec"] == "h264"
+ assert captured_kwargs["camera_encoder"].vcodec == "h264"
+ assert captured_kwargs["encoder_threads"] == 4
-def test_encode_video_worker_default_vcodec(tmp_path):
- """_encode_video_worker uses libsvtav1 as the default codec."""
+def test_encode_video_worker_default_camera_encoder(tmp_path):
+ """_encode_video_worker passes None camera_encoder which encode_video_frames defaults."""
video_key = "observation.images.laptop"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=0, frame_index=0)
img_dir = tmp_path / Path(fpath).parent
@@ -91,7 +100,8 @@ def test_encode_video_worker_default_vcodec(tmp_path):
with patch("lerobot.datasets.dataset_writer.encode_video_frames", side_effect=mock_encode):
_encode_video_worker(video_key, 0, tmp_path, fps=30)
- assert captured_kwargs["vcodec"] == "libsvtav1"
+ assert captured_kwargs["camera_encoder"] is None
+ assert captured_kwargs["encoder_threads"] is None
# ── add_frame contracts ──────────────────────────────────────────────
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
index 654f8cdf1..19c314fd6 100644
--- a/tests/datasets/test_datasets.py
+++ b/tests/datasets/test_datasets.py
@@ -24,11 +24,13 @@ import torch
pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+import datasets
from huggingface_hub import HfApi
from PIL import Image
from safetensors.torch import load_file
from torchvision.transforms import v2
+from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig
from lerobot.configs.default import DatasetConfig
from lerobot.configs.train import TrainPipelineConfig
from lerobot.datasets import make_dataset
@@ -43,7 +45,6 @@ from lerobot.datasets.utils import (
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
create_branch,
)
-from lerobot.datasets.video_utils import VALID_VIDEO_CODECS
from lerobot.envs.factory import make_env_config
from lerobot.policies.factory import make_policy_config
from lerobot.robots import make_robot_from_config
@@ -360,6 +361,41 @@ def test_add_frame_image_pil(image_dataset):
assert dataset[0]["image"].shape == torch.Size(DUMMY_CHW)
+@pytest.mark.parametrize(
+    "dtype,np_dtype,values,assert_fn",
+    [
+        ("float32", np.float32, [1.0, 2.0], np.testing.assert_allclose),
+        ("int64", np.int64, [1, 2], np.testing.assert_array_equal),
+        ("bool", np.bool_, [True, False], np.testing.assert_array_equal),
+    ],
+    ids=["float32", "int64", "bool"],
+)
+def test_save_episode_shape_1_scalar_is_scalarized_before_hf_encoding(
+    tmp_path, empty_lerobot_dataset_factory, monkeypatch, dtype, np_dtype, values, assert_fn
+):
+    """Shape-(1,) features must reach ``datasets.Dataset.from_dict`` as a flat 1-D array, one scalar per frame."""
+    features = {"state": {"dtype": dtype, "shape": (1,), "names": None}}
+    dataset = empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
+    dataset.add_frame({"state": np.array([values[0]], dtype=np_dtype), "task": "Dummy task"})
+    dataset.add_frame({"state": np.array([values[1]], dtype=np_dtype), "task": "Dummy task"})
+    captured = {}
+    original_from_dict = datasets.Dataset.from_dict
+
+    def _from_dict_spy(cls, mapping, *args, **kwargs):
+        # Ignore tables without a "state" column so unrelated from_dict calls cannot raise KeyError here.
+        if "state" in mapping:
+            captured["state"] = mapping["state"]
+        return original_from_dict(mapping, *args, **kwargs)
+
+    monkeypatch.setattr(datasets.Dataset, "from_dict", classmethod(_from_dict_spy))
+    dataset.save_episode()
+    dataset.finalize()
+    assert "state" in captured
+    assert isinstance(captured["state"], np.ndarray)
+    assert captured["state"].shape == (2,)
+    assert_fn(captured["state"], np.array(values, dtype=np_dtype))
+
+
def test_set_image_transforms_applies_transparently(image_dataset):
dataset = image_dataset
dataset.add_frame({"image": np.random.rand(*DUMMY_CHW), "task": "Dummy task"})
@@ -1470,17 +1506,9 @@ def test_frames_in_current_file_calculation(tmp_path, empty_lerobot_dataset_fact
def test_lerobot_dataset_vcodec_validation():
- """Test that LeRobotDataset validates the vcodec parameter."""
- # Test that invalid vcodec raises ValueError
+ """Invalid vcodec in encoder config is rejected at construction time."""
with pytest.raises(ValueError, match="Invalid vcodec"):
- LeRobotDataset.__new__(LeRobotDataset) # bypass __init__ to test validation directly
- # Actually test via create since it's easier
- LeRobotDataset.create(
- repo_id="test/invalid_codec",
- fps=30,
- features={"observation.state": {"dtype": "float32", "shape": (2,), "names": ["x", "y"]}},
- vcodec="invalid_codec",
- )
+ VideoEncoderConfig(vcodec="invalid_codec")
def test_valid_video_codecs_constant():
diff --git a/tests/datasets/test_language.py b/tests/datasets/test_language.py
new file mode 100644
index 000000000..52c7b3708
--- /dev/null
+++ b/tests/datasets/test_language.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+
+import pytest
+
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+pytest.importorskip("pandas", reason="pandas is required (install lerobot[dataset])")
+
+import numpy as np # noqa: E402
+import pandas as pd # noqa: E402
+import pyarrow as pa # noqa: E402
+
+from lerobot.datasets import LeRobotDataset # noqa: E402
+from lerobot.datasets.io_utils import write_info # noqa: E402
+from lerobot.datasets.language import ( # noqa: E402
+ EVENT_ONLY_STYLES,
+ LANGUAGE_EVENTS,
+ LANGUAGE_PERSISTENT,
+ PERSISTENT_STYLES,
+ STYLE_REGISTRY,
+ VIEW_DEPENDENT_STYLES,
+ column_for_style,
+ is_view_dependent_style,
+ language_events_arrow_type,
+ language_feature_info,
+ language_persistent_arrow_type,
+ validate_camera_field,
+)
+from lerobot.datasets.utils import DEFAULT_DATA_PATH # noqa: E402
+
+
+def test_language_arrow_schema_has_expected_fields():
+    event_struct = language_events_arrow_type().value_type
+    persistent_struct = language_persistent_arrow_type().value_type
+
+    assert isinstance(persistent_struct, pa.StructType)
+    assert persistent_struct.names == [
+        "role",
+        "content",
+        "style",
+        "timestamp",
+        "camera",
+        "tool_calls",
+    ]
+
+    assert isinstance(event_struct, pa.StructType)
+    assert event_struct.names == ["role", "content", "style", "camera", "tool_calls"]
+
+    # Only persistent rows carry a timestamp; it is float32 like LeRobotDataset frame timestamps.
+    assert persistent_struct.field("timestamp").type == pa.float32()
+
+
+def test_validate_feature_language_warns_only_on_non_empty_value(caplog):
+    from lerobot.datasets.feature_utils import validate_feature_language
+
+    # The record-time value is None: validation returns "" and logs nothing.
+    with caplog.at_level("WARNING"):
+        assert validate_feature_language("language_persistent", None) == ""
+    assert not caplog.records
+
+    # A non-empty value is discarded later on, so it only warns instead of failing.
+    with caplog.at_level("WARNING"):
+        assert validate_feature_language("language_persistent", [{"role": "user"}]) == ""
+    assert any("language_persistent" in record.message for record in caplog.records)
+
+
+def test_style_registry_routes_columns():
+    assert PERSISTENT_STYLES == {"subtask", "plan", "memory", "motion", "task_aug"}
+    assert EVENT_ONLY_STYLES == {"interjection", "vqa", "trace"}
+    assert PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY
+
+    # Persistent styles are routed to the persistent column.
+    for style in ("subtask", "plan", "memory", "motion", "task_aug"):
+        assert column_for_style(style) == LANGUAGE_PERSISTENT
+
+    # Event-only styles are routed to the events column.
+    for style in ("interjection", "vqa", "trace"):
+        assert column_for_style(style) == LANGUAGE_EVENTS
+    # A row without any style is routed to the events column as well.
+    assert column_for_style(None) == LANGUAGE_EVENTS
+
+
+def test_view_dependent_styles():
+    # Only the two event styles tied to a camera image, vqa and trace (a pixel
+    # trajectory), are view-dependent. motion is a persistent style expressed
+    # in robot-frame (joint / Cartesian) terms, so it carries no camera tag.
+    assert VIEW_DEPENDENT_STYLES == {"vqa", "trace"}
+    view_dependent = ("vqa", "trace")
+    other_styles = ("motion", "subtask", "plan", "interjection", None)
+
+    for style in view_dependent:
+        assert is_view_dependent_style(style)
+    for style in other_styles:
+        assert not is_view_dependent_style(style)
+
+
+def test_validate_camera_field_requires_camera_for_view_dependent_styles():
+    validate_camera_field("vqa", "observation.images.top")
+    validate_camera_field("trace", "observation.images.front")
+    # A missing camera, whether None or an empty string, is rejected for these styles.
+    for style, missing_camera in (("vqa", None), ("trace", "")):
+        with pytest.raises(ValueError, match="view-dependent"):
+            validate_camera_field(style, missing_camera)
+
+
+def test_validate_camera_field_rejects_camera_on_non_view_dependent_styles():
+    camera_free_styles = (
+        "subtask",
+        "plan",
+        "memory",
+        "motion",
+        "interjection",
+        None,
+    )
+    for style in camera_free_styles:
+        validate_camera_field(style, None)
+    # Attaching a camera to a style that is not view-dependent must be rejected.
+    for style in ("subtask", "motion", "interjection", None):
+        with pytest.raises(ValueError, match="must have camera=None"):
+            validate_camera_field(style, "observation.images.top")
+
+
+def test_unknown_style_rejected():
+    with pytest.raises(ValueError, match="Unknown language style"):
+        column_for_style("surprise")  # a style name the test expects to be unregistered
+
+
+def test_lerobot_dataset_passes_language_columns_through(tmp_path, empty_lerobot_dataset_factory):
+    dataset_root = tmp_path / "language_dataset"
+    dataset = empty_lerobot_dataset_factory(
+        root=dataset_root,
+        features={"state": {"dtype": "float32", "shape": (2,), "names": None}},
+        use_videos=False,
+    )
+    for state in ([0.0, 1.0], [1.0, 2.0]):
+        dataset.add_frame({"state": np.array(state, dtype=np.float32), "task": "tidy"})
+    dataset.save_episode()
+    dataset.finalize()
+
+    subtask_rows = [
+        {
+            "role": "assistant",
+            "content": "reach for the cup",
+            "style": "subtask",
+            "timestamp": 0.0,
+            "camera": None,
+            "tool_calls": None,
+        }
+    ]
+    vqa_event = {
+        "role": "user",
+        "content": "what is visible?",
+        "style": "vqa",
+        "camera": "observation.images.top",
+        "tool_calls": None,
+    }
+    # Inject both language columns directly into the data parquet file written above.
+    parquet_path = dataset_root / DEFAULT_DATA_PATH.format(chunk_index=0, file_index=0)
+    frames = pd.read_parquet(parquet_path)
+    frames[LANGUAGE_PERSISTENT] = [subtask_rows, subtask_rows]
+    frames[LANGUAGE_EVENTS] = [[vqa_event], []]
+    frames.to_parquet(parquet_path)
+
+    info = dataset.meta.info
+    info["features"].update(language_feature_info())
+    write_info(info, dataset_root)
+
+    reloaded = LeRobotDataset(repo_id=dataset.repo_id, root=dataset_root)
+
+    frame_0, frame_1 = reloaded[0], reloaded[1]
+    assert frame_0[LANGUAGE_PERSISTENT] == subtask_rows
+    assert frame_0[LANGUAGE_EVENTS] == [vqa_event]
+    assert frame_1[LANGUAGE_PERSISTENT] == subtask_rows
+    assert frame_1[LANGUAGE_EVENTS] == []
diff --git a/tests/datasets/test_language_render.py b/tests/datasets/test_language_render.py
new file mode 100644
index 000000000..fcef41fd8
--- /dev/null
+++ b/tests/datasets/test_language_render.py
@@ -0,0 +1,417 @@
+#!/usr/bin/env python
+
+import pytest
+
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+
+from lerobot.configs.recipe import MessageTurn, TrainingRecipe # noqa: E402
+from lerobot.datasets.language_render import ( # noqa: E402
+ EMITTED_AT_TOLERANCE_S,
+ active_at,
+ emitted_at,
+ nth_next,
+ nth_prev,
+ render_sample,
+)
+
+
+def persistent_row(role, content, style, timestamp, tool_calls=None, camera=None):
+    return dict(  # test helper: a persistent language row, which includes a timestamp
+        role=role,
+        content=content,
+        style=style,
+        timestamp=timestamp,
+        camera=camera,
+        tool_calls=tool_calls,
+    )
+
+
+def event_row(role, content, style, tool_calls=None, camera=None):
+    return dict(  # test helper: an event language row, which has no timestamp field
+        role=role,
+        content=content,
+        style=style,
+        camera=camera,
+        tool_calls=tool_calls,
+    )
+
+
+PERSISTENT = [  # plan, memory and subtask rows at t=0.0, then newer memory and subtask rows at t=1.0
+    persistent_row("assistant", "plan 0", "plan", 0.0),
+    persistent_row("assistant", "memory 0", "memory", 0.0),
+    persistent_row("assistant", "subtask 0", "subtask", 0.0),
+    persistent_row("assistant", "memory 1", "memory", 1.0),
+    persistent_row("assistant", "subtask 1", "subtask", 1.0),
+]
+EVENTS_AT_1 = [  # one vqa question/answer pair tagged with the top camera
+    event_row("user", "what is visible?", "vqa", camera="observation.images.top"),
+    event_row("assistant", '{"count": 2}', "vqa", camera="observation.images.top"),
+]
+EVENTS_AT_2 = [  # a user interjection plus a style-less assistant row carrying a "say" tool call
+    event_row("user", "skip wiping", "interjection"),
+    event_row(
+        "assistant",
+        None,
+        None,
+        [{"type": "function", "function": {"name": "say", "arguments": {"text": "Skipping wiping."}}}],
+    ),
+]
+# Two cameras emit vqa rows on the same tick, so resolvers must disambiguate
+# by camera. This mirrors Module 3 of the annotation pipeline, which writes
+# one (vqa, user) + (vqa, assistant) pair per camera.
+EVENTS_AT_3_TWO_CAMERAS = [
+    event_row("user", "how many cups (top)?", "vqa", camera="observation.images.top"),
+    event_row("assistant", '{"count": 3}', "vqa", camera="observation.images.top"),
+    event_row("user", "how many cups (wrist)?", "vqa", camera="observation.images.wrist"),
+    event_row("assistant", '{"count": 1}', "vqa", camera="observation.images.wrist"),
+]
+
+
+def test_resolver_temporal_semantics():
+    # Expected: the active row is the latest one stamped at or before ``t``.
+    assert active_at(0.5, persistent=PERSISTENT, style="subtask")["content"] == "subtask 0"
+    assert active_at(1.0, persistent=PERSISTENT, style="subtask")["content"] == "subtask 1"
+    # Expected: with no events supplied for the frame, nothing is emitted.
+    assert emitted_at(0.5, persistent=PERSISTENT, events=[], style="vqa", role="assistant") is None
+    answer = emitted_at(1.0, persistent=PERSISTENT, events=EVENTS_AT_1, style="vqa", role="assistant")
+    assert answer["content"] == '{"count": 2}'
+
+
+def test_persistent_relative_resolvers_reject_event_styles():
+    rejected_calls = ((active_at, "vqa"), (nth_prev, "interjection"))
+    for resolver, event_style in rejected_calls:
+        with pytest.raises(ValueError, match="event-only"):
+            resolver(1.0, persistent=PERSISTENT, style=event_style)
+
+
+def test_nth_prev_and_next():  # offset=1 is expected to step one subtask row back / forward from t
+    assert nth_prev(1.0, persistent=PERSISTENT, style="subtask", offset=1)["content"] == "subtask 0"
+    assert nth_next(0.0, persistent=PERSISTENT, style="subtask", offset=1)["content"] == "subtask 1"
+
+
+def test_substitution_if_present_multimodal_and_tool_calls():
+    user_turn = MessageTurn(
+        role="user",
+        content=[
+            {"type": "image", "feature": "observation.images.top"},
+            {"type": "text", "text": "${task}: ${interjection}"},
+        ],
+        stream="high_level",
+        if_present="interjection",
+    )
+    assistant_turn = MessageTurn(
+        role="assistant",
+        content="${plan}",
+        stream="high_level",
+        target=True,
+        tool_calls_from="speech",
+    )
+    recipe = TrainingRecipe(
+        messages=[user_turn, assistant_turn],
+        bindings={"plan": "active_at(t, style=plan)"},
+    )
+
+    rendered = render_sample(
+        recipe=recipe,
+        persistent=PERSISTENT,
+        events=EVENTS_AT_2,
+        t=2.0,
+        sample_idx=0,
+        task="clean kitchen",
+    )
+
+    messages = rendered["messages"]
+    assert messages[0]["content"][1]["text"] == "clean kitchen: skip wiping"
+    assert messages[1]["content"] == "plan 0"
+    assert messages[1]["tool_calls"][0]["function"]["name"] == "say"
+    assert rendered["message_streams"] == ["high_level", "high_level"]
+    assert rendered["target_message_indices"] == [1]
+
+
+def test_exact_event_miss_returns_none_when_target_skips():
+    recipe = TrainingRecipe(
+        messages=[
+            MessageTurn(role="user", content="${vqa_query}", stream="high_level", if_present="vqa_query"),
+            MessageTurn(
+                role="assistant",
+                content="${vqa}",
+                stream="high_level",
+                target=True,
+                if_present="vqa",
+            ),
+        ]
+    )
+
+    # EVENTS_AT_2 contains no vqa rows; the test expects no sample to be rendered.
+    rendered = render_sample(recipe=recipe, persistent=PERSISTENT, events=EVENTS_AT_2, t=0.0, sample_idx=0)
+    assert rendered is None
+
+
+def test_deterministic_blend_sampling():
+    recipe = TrainingRecipe(
+        blend={
+            "a": TrainingRecipe(
+                weight=1.0,
+                messages=[
+                    MessageTurn(role="user", content="${task}", stream="high_level"),
+                    MessageTurn(role="assistant", content="a", stream="high_level", target=True),
+                ],
+            ),
+            "b": TrainingRecipe(
+                weight=1.0,
+                messages=[
+                    MessageTurn(role="user", content="${task}", stream="high_level"),
+                    MessageTurn(role="assistant", content="b", stream="high_level", target=True),
+                ],
+            ),
+        }
+    )
+
+    render_kwargs = dict(persistent=PERSISTENT, events=EVENTS_AT_2, t=0.0, sample_idx=123, task="x")
+
+    # Same recipe and same sample_idx on both calls: the rendered output,
+    # and therefore the blend branch behind it, must not change.
+    first = render_sample(recipe=recipe, **render_kwargs)
+    second = render_sample(recipe=recipe, **render_kwargs)
+    assert first == second
+
+
+def test_emitted_at_filters_vqa_by_camera():
+    def answer_for(camera):
+        return emitted_at(
+            3.0,
+            persistent=PERSISTENT,
+            events=EVENTS_AT_3_TWO_CAMERAS,
+            style="vqa",
+            role="assistant",
+            camera=camera,
+        )
+
+    expected_answers = {
+        "observation.images.top": '{"count": 3}',
+        "observation.images.wrist": '{"count": 1}',
+    }
+
+    # Each camera filter must select that camera's own assistant answer.
+    for camera, expected in expected_answers.items():
+        assert answer_for(camera)["content"] == expected
+
+
+def test_emitted_at_raises_on_ambiguous_per_camera_vqa():
+    ambiguous_query = dict(  # no ``camera`` filter although two cameras hold an assistant vqa row
+        persistent=PERSISTENT,
+        events=EVENTS_AT_3_TWO_CAMERAS,
+        style="vqa",
+        role="assistant",
+    )
+    with pytest.raises(ValueError, match="Ambiguous resolver"):
+        emitted_at(3.0, **ambiguous_query)
+
+
+def _vqa_subrecipe(camera: str) -> TrainingRecipe:
+    """Build a weight-1.0 recipe whose vqa question/answer bindings are filtered to ``camera``."""
+    question_turn = MessageTurn(
+        role="user",
+        content=[{"type": "image", "feature": camera}, {"type": "text", "text": "${vqa_query}"}],
+        stream="high_level",
+        if_present="vqa_query",
+    )
+    answer_turn = MessageTurn(
+        role="assistant",
+        content="${vqa}",
+        stream="high_level",
+        target=True,
+        if_present="vqa",
+    )
+    return TrainingRecipe(
+        weight=1.0,
+        bindings={
+            "vqa_query": f"emitted_at(t, style=vqa, role=user, camera={camera})",
+            "vqa": f"emitted_at(t, style=vqa, role=assistant, camera={camera})",
+        },
+        messages=[question_turn, answer_turn],
+    )
+
+
+@pytest.mark.parametrize(
+    ("camera", "expected_query", "expected_answer"),
+    [
+        ("observation.images.top", "how many cups (top)?", '{"count": 3}'),
+        ("observation.images.wrist", "how many cups (wrist)?", '{"count": 1}'),
+    ],
+)
+def test_per_camera_blend_renders_both_views(camera, expected_query, expected_answer):
+    sample = render_sample(
+        recipe=_vqa_subrecipe(camera),
+        persistent=PERSISTENT,
+        events=EVENTS_AT_3_TWO_CAMERAS,
+        t=3.0,
+        sample_idx=0,
+    )
+    user_content = sample["messages"][0]["content"]
+    assert user_content[0]["feature"] == camera
+    assert user_content[1]["text"] == expected_query
+    assert sample["messages"][1]["content"] == expected_answer
+
+
+def test_resolve_task_picks_rephrasing_deterministically_per_sample():
+    rephrasings = [
+        persistent_row("user", "tidy the kitchen", "task_aug", 0.0),
+        persistent_row("user", "please clean up the kitchen", "task_aug", 0.0),
+        persistent_row("user", "kitchen needs tidying", "task_aug", 0.0),
+        persistent_row("user", "make the kitchen clean", "task_aug", 0.0),
+    ]
+    recipe = TrainingRecipe(
+        messages=[
+            MessageTurn(role="user", content="${task}", stream="high_level"),
+            MessageTurn(role="assistant", content="ok", stream="high_level", target=True),
+        ]
+    )
+
+    # No explicit ``task`` argument is passed, so ${task} is expected to come from the task_aug rows.
+    picked: set[str] = set()
+    for idx in range(64):
+        sample = render_sample(
+            recipe=recipe,
+            persistent=rephrasings,
+            events=[],
+            t=0.0,
+            sample_idx=idx,
+            dataset_ctx={"task": "canonical kitchen task"},
+        )
+        picked.add(sample["messages"][0]["content"])
+    # Across 64 sample indices every rephrasing must have been chosen at least once.
+    assert picked == {row["content"] for row in rephrasings}
+    # Rendering the same sample_idx twice must give the same rephrasing.
+    first = render_sample(
+        recipe=recipe,
+        persistent=rephrasings,
+        events=[],
+        t=0.0,
+        sample_idx=42,
+        dataset_ctx={"task": "canonical"},
+    )
+    second = render_sample(
+        recipe=recipe,
+        persistent=rephrasings,
+        events=[],
+        t=0.0,
+        sample_idx=42,
+        dataset_ctx={"task": "canonical"},
+    )
+    assert first["messages"][0]["content"] == second["messages"][0]["content"]
+
+
+def test_resolve_task_falls_back_to_canonical_without_rephrasings():
+ recipe = TrainingRecipe(
+ messages=[
+ MessageTurn(role="user", content="${task}", stream="high_level"),
+ MessageTurn(role="assistant", content="ok", stream="high_level", target=True),
+ ]
+ )
+ rendered = render_sample(
+ recipe=recipe,
+ persistent=PERSISTENT, # no task_aug rows
+ events=[],
+ t=0.0,
+ sample_idx=0,
+ dataset_ctx={"task": "clean the kitchen"},
+ )
+ assert rendered["messages"][0]["content"] == "clean the kitchen"
+
+
+def test_resolve_task_explicit_override_beats_rephrasings():
+ rephrasings = [
+ persistent_row("user", "rephrased one", "task_aug", 0.0),
+ persistent_row("user", "rephrased two", "task_aug", 0.0),
+ ]
+ recipe = TrainingRecipe(
+ messages=[
+ MessageTurn(role="user", content="${task}", stream="high_level"),
+ MessageTurn(role="assistant", content="ok", stream="high_level", target=True),
+ ]
+ )
+ rendered = render_sample(
+ recipe=recipe,
+ persistent=rephrasings,
+ events=[],
+ t=0.0,
+ sample_idx=0,
+ task="explicit override wins",
+ dataset_ctx={"task": "canonical"},
+ )
+ assert rendered["messages"][0]["content"] == "explicit override wins"
+
+
+def test_emitted_at_persistent_tolerates_small_timestamp_drift():
+ """Persistent ``emitted_at`` should match within EMITTED_AT_TOLERANCE_S
+ so callers that derive ``t`` arithmetically (``frame_idx / fps``) still
+ line up with the parquet-stored timestamp.
+ """
+ rows = [persistent_row("assistant", "memo", "memory", 1.0)]
+ # Half a tolerance window — bit-different float, comfortably inside
+ inside = emitted_at(1.0 + EMITTED_AT_TOLERANCE_S / 2, persistent=rows, events=[], style="memory")
+ assert inside is not None and inside["content"] == "memo"
+
+    # Twice the tolerance — outside the window, so no match
+ outside = emitted_at(1.0 + EMITTED_AT_TOLERANCE_S * 2, persistent=rows, events=[], style="memory")
+ assert outside is None
+
+
+def test_render_sample_rejects_non_dict_language_rows():
+ """``_normalize_rows`` must surface malformed inputs as TypeError.
+
+ A pipeline that hands the renderer a non-dict (e.g. a stray string)
+ is a real upstream bug — silent skipping would let it propagate.
+ """
+ recipe = TrainingRecipe(
+ messages=[
+ MessageTurn(role="user", content="${task}", stream="high_level"),
+ MessageTurn(role="assistant", content="ok", stream="high_level", target=True),
+ ]
+ )
+ with pytest.raises(TypeError, match="must be dictionaries"):
+ render_sample(
+ recipe=recipe,
+ persistent=["not a dict"],
+ events=[],
+ t=0.0,
+ sample_idx=0,
+ task="x",
+ )
+
+
+def test_low_level_branch_renders_active_subtask():
+ low_level = TrainingRecipe(
+ blend={
+ "low": TrainingRecipe(
+ weight=1.0,
+ messages=[
+ MessageTurn(
+ role="user",
+ content="${task}\nPlan: ${plan}\nMemory: ${memory}",
+ stream="high_level",
+ ),
+ MessageTurn(
+ role="assistant",
+ content="${subtask}",
+ stream="low_level",
+ target=True,
+ ),
+ ],
+ )
+ }
+ )
+
+ rendered = render_sample(
+ recipe=low_level,
+ persistent=PERSISTENT,
+ events=[],
+ t=0.5,
+ sample_idx=0,
+ task="clean kitchen",
+ )
+
+ assert rendered["messages"][-1] == {"role": "assistant", "content": "subtask 0"}
+ assert rendered["message_streams"][-1] == "low_level"
+ assert rendered["target_message_indices"] == [1]
diff --git a/tests/datasets/test_streaming_video_encoder.py b/tests/datasets/test_streaming_video_encoder.py
index 8b7a1540f..b69f24254 100644
--- a/tests/datasets/test_streaming_video_encoder.py
+++ b/tests/datasets/test_streaming_video_encoder.py
@@ -14,11 +14,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Tests for streaming video encoding and hardware-accelerated encoding."""
+"""Tests for streaming video encoding."""
import queue
import threading
-from unittest.mock import patch
import numpy as np
import pytest
@@ -27,112 +26,20 @@ pytest.importorskip("av", reason="av is required (install lerobot[dataset])")
import av # noqa: E402
+from lerobot.configs import VideoEncoderConfig
+from lerobot.datasets.pyav_utils import get_codec
from lerobot.datasets.video_utils import (
- VALID_VIDEO_CODECS,
StreamingVideoEncoder,
_CameraEncoderThread,
- _get_codec_options,
- detect_available_hw_encoders,
- resolve_vcodec,
)
from lerobot.utils.constants import OBS_IMAGES
-# ─── _get_codec_options tests ───
-
-
-class TestGetCodecOptions:
- def test_libsvtav1_defaults(self):
- opts = _get_codec_options("libsvtav1")
- assert opts["g"] == "2"
- assert opts["crf"] == "30"
- assert opts["preset"] == "12"
-
- def test_libsvtav1_custom_preset(self):
- opts = _get_codec_options("libsvtav1", preset=8)
- assert opts["preset"] == "8"
-
- def test_h264_options(self):
- opts = _get_codec_options("h264", g=10, crf=23)
- assert opts["g"] == "10"
- assert opts["crf"] == "23"
- assert "preset" not in opts
-
- def test_videotoolbox_options(self):
- opts = _get_codec_options("h264_videotoolbox", g=2, crf=30)
- assert opts["g"] == "2"
- # CRF 30 maps to quality = max(1, min(100, 100 - 30*2)) = 40
- assert opts["q:v"] == "40"
- assert "crf" not in opts
-
- def test_nvenc_options(self):
- opts = _get_codec_options("h264_nvenc", g=2, crf=25)
- assert opts["rc"] == "constqp"
- assert opts["qp"] == "25"
- assert "crf" not in opts
- # NVENC doesn't support g
- assert "g" not in opts
-
- def test_vaapi_options(self):
- opts = _get_codec_options("h264_vaapi", crf=28)
- assert opts["qp"] == "28"
-
- def test_qsv_options(self):
- opts = _get_codec_options("h264_qsv", crf=25)
- assert opts["global_quality"] == "25"
-
- def test_no_g_no_crf(self):
- opts = _get_codec_options("h264", g=None, crf=None)
- assert "g" not in opts
- assert "crf" not in opts
-
-
-# ─── HW encoder detection tests ───
-
-
-class TestHWEncoderDetection:
- def test_detect_available_hw_encoders_returns_list(self):
- result = detect_available_hw_encoders()
- assert isinstance(result, list)
-
- def test_detect_available_hw_encoders_only_valid(self):
- from lerobot.datasets.video_utils import HW_ENCODERS
-
- result = detect_available_hw_encoders()
- for encoder in result:
- assert encoder in HW_ENCODERS
-
- def test_resolve_vcodec_passthrough(self):
- assert resolve_vcodec("libsvtav1") == "libsvtav1"
- assert resolve_vcodec("h264") == "h264"
-
- def test_resolve_vcodec_auto_fallback(self):
- """When no HW encoders are available, auto should fall back to libsvtav1."""
- with patch("lerobot.datasets.video_utils.detect_available_hw_encoders", return_value=[]):
- assert resolve_vcodec("auto") == "libsvtav1"
-
- def test_resolve_vcodec_auto_picks_hw(self):
- """When a HW encoder is available, auto should pick it."""
- with patch(
- "lerobot.datasets.video_utils.detect_available_hw_encoders",
- return_value=["h264_videotoolbox"],
- ):
- assert resolve_vcodec("auto") == "h264_videotoolbox"
-
- def test_resolve_vcodec_auto_returns_valid(self):
- """Test that resolve_vcodec('auto') returns a known valid codec."""
- result = resolve_vcodec("auto")
- assert result in VALID_VIDEO_CODECS
-
- def test_hw_encoder_names_accepted_in_validation(self):
- """Test that HW encoder names pass validation in VALID_VIDEO_CODECS."""
- assert "auto" in VALID_VIDEO_CODECS
- assert "h264_videotoolbox" in VALID_VIDEO_CODECS
- assert "h264_nvenc" in VALID_VIDEO_CODECS
-
- def test_resolve_vcodec_invalid_raises(self):
- """Test that resolve_vcodec raises ValueError for invalid codecs."""
- with pytest.raises(ValueError, match="Invalid vcodec"):
- resolve_vcodec("not_a_real_codec")
+# Cross-codec validation tests only fire when the target codec is present
+# in the local FFmpeg build; on other platforms validate() is a no-op.
+_has_videotoolbox = get_codec("h264_videotoolbox") is not None
+_videotoolbox_only = pytest.mark.skipif(
+ not _has_videotoolbox, reason="h264_videotoolbox not in local FFmpeg build"
+)
# ─── _CameraEncoderThread tests ───
@@ -150,14 +57,13 @@ class TestCameraEncoderThread:
result_queue: queue.Queue = queue.Queue(maxsize=1)
stop_event = threading.Event()
+ enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
encoder_thread = _CameraEncoderThread(
video_path=video_path,
fps=fps,
- vcodec="libsvtav1",
- pix_fmt="yuv420p",
- g=2,
- crf=30,
- preset=13,
+ vcodec=enc_cfg.vcodec,
+ pix_fmt=enc_cfg.pix_fmt,
+ codec_options=enc_cfg.get_codec_options(as_strings=True),
frame_queue=frame_queue,
result_queue=result_queue,
stop_event=stop_event,
@@ -202,14 +108,13 @@ class TestCameraEncoderThread:
result_queue: queue.Queue = queue.Queue(maxsize=1)
stop_event = threading.Event()
+ enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
encoder_thread = _CameraEncoderThread(
video_path=video_path,
fps=fps,
- vcodec="libsvtav1",
- pix_fmt="yuv420p",
- g=2,
- crf=30,
- preset=13,
+ vcodec=enc_cfg.vcodec,
+ pix_fmt=enc_cfg.pix_fmt,
+ codec_options=enc_cfg.get_codec_options(as_strings=True),
frame_queue=frame_queue,
result_queue=result_queue,
stop_event=stop_event,
@@ -237,14 +142,13 @@ class TestCameraEncoderThread:
result_queue: queue.Queue = queue.Queue(maxsize=1)
stop_event = threading.Event()
+ enc_cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
encoder_thread = _CameraEncoderThread(
video_path=video_path,
fps=fps,
- vcodec="libsvtav1",
- pix_fmt="yuv420p",
- g=2,
- crf=30,
- preset=13,
+ vcodec=enc_cfg.vcodec,
+ pix_fmt=enc_cfg.pix_fmt,
+ codec_options=enc_cfg.get_codec_options(as_strings=True),
frame_queue=frame_queue,
result_queue=result_queue,
stop_event=stop_event,
@@ -266,11 +170,20 @@ class TestCameraEncoderThread:
class TestStreamingVideoEncoder:
+ def _make_encoder_config(self, **kwargs):
+ """Helper to build a VideoEncoderConfig."""
+ return VideoEncoderConfig(**kwargs)
+
def test_single_camera_episode(self, tmp_path):
"""Test encoding a single camera episode."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
-
video_keys = [f"{OBS_IMAGES}.laptop"]
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=self._make_encoder_config(
+ vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13
+ ),
+ )
+
encoder.start_episode(video_keys, tmp_path)
num_frames = 20
@@ -295,9 +208,11 @@ class TestStreamingVideoEncoder:
def test_multi_camera_episode(self, tmp_path):
"""Test encoding multiple cameras simultaneously."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
-
video_keys = [f"{OBS_IMAGES}.laptop", f"{OBS_IMAGES}.phone"]
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30),
+ )
encoder.start_episode(video_keys, tmp_path)
num_frames = 15
@@ -319,8 +234,11 @@ class TestStreamingVideoEncoder:
def test_sequential_episodes(self, tmp_path):
"""Test that multiple sequential episodes work correctly."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
video_keys = [f"{OBS_IMAGES}.cam"]
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30),
+ )
for ep in range(3):
encoder.start_episode(video_keys, tmp_path)
@@ -342,8 +260,11 @@ class TestStreamingVideoEncoder:
def test_cancel_episode(self, tmp_path):
"""Test that canceling an episode cleans up properly."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
video_keys = [f"{OBS_IMAGES}.cam"]
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30),
+ )
encoder.start_episode(video_keys, tmp_path)
@@ -365,28 +286,33 @@ class TestStreamingVideoEncoder:
def test_feed_without_start_raises(self, tmp_path):
"""Test that feeding frames without starting an episode raises."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
+ encoder = StreamingVideoEncoder(fps=30)
with pytest.raises(RuntimeError, match="No active episode"):
encoder.feed_frame("cam", np.zeros((64, 96, 3), dtype=np.uint8))
encoder.close()
def test_finish_without_start_raises(self, tmp_path):
"""Test that finishing without starting raises."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
+ encoder = StreamingVideoEncoder(fps=30)
with pytest.raises(RuntimeError, match="No active episode"):
encoder.finish_episode()
encoder.close()
def test_close_is_idempotent(self, tmp_path):
"""Test that close() can be called multiple times safely."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
+ encoder = StreamingVideoEncoder(fps=30)
encoder.close()
encoder.close() # Should not raise
def test_video_duration_matches_frame_count(self, tmp_path):
"""Test that encoded video duration matches num_frames / fps."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13)
video_keys = [f"{OBS_IMAGES}.cam"]
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=self._make_encoder_config(
+ vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13
+ ),
+ )
encoder.start_episode(video_keys, tmp_path)
num_frames = 90 # 3 seconds at 30fps
@@ -417,9 +343,11 @@ class TestStreamingVideoEncoder:
def test_multi_camera_start_episode_called_once(self, tmp_path):
"""Test that with multiple cameras, no frames are lost due to double start_episode."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30)
-
video_keys = [f"{OBS_IMAGES}.cam1", f"{OBS_IMAGES}.cam2"]
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=self._make_encoder_config(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30),
+ )
encoder.start_episode(video_keys, tmp_path)
num_frames = 30
@@ -446,17 +374,24 @@ class TestStreamingVideoEncoder:
def test_encoder_threads_passed_to_thread(self, tmp_path):
"""Test that encoder_threads is stored and passed through to encoder threads."""
- encoder = StreamingVideoEncoder(
- fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, encoder_threads=2
- )
- assert encoder.encoder_threads == 2
-
video_keys = [f"{OBS_IMAGES}.cam"]
+ cfg = VideoEncoderConfig(
+ vcodec="libsvtav1",
+ pix_fmt="yuv420p",
+ g=2,
+ crf=30,
+ )
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=cfg,
+ encoder_threads=2,
+ )
+ assert encoder._encoder_threads == 2
encoder.start_episode(video_keys, tmp_path)
- # Verify the thread received the encoder_threads value
+ # Verify codec options include thread tuning for libsvtav1 (lp=…)
thread = encoder._threads[f"{OBS_IMAGES}.cam"]
- assert thread.encoder_threads == 2
+ assert "svtav1-params" in thread.codec_options or "threads" in thread.codec_options
# Feed some frames and finish to ensure it works end-to-end
num_frames = 10
@@ -478,16 +413,20 @@ class TestStreamingVideoEncoder:
def test_encoder_threads_none_by_default(self, tmp_path):
"""Test that encoder_threads defaults to None (codec auto-detect)."""
- encoder = StreamingVideoEncoder(fps=30, vcodec="libsvtav1", pix_fmt="yuv420p")
- assert encoder.encoder_threads is None
+ encoder = StreamingVideoEncoder(fps=30)
+ assert encoder._encoder_threads is None
encoder.close()
def test_graceful_frame_dropping(self, tmp_path):
"""Test that full queue drops frames instead of crashing."""
- encoder = StreamingVideoEncoder(
- fps=30, vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13, queue_maxsize=1
- )
video_keys = [f"{OBS_IMAGES}.cam"]
+ encoder = StreamingVideoEncoder(
+ fps=30,
+ camera_encoder=self._make_encoder_config(
+ vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30, preset=13
+ ),
+ queue_maxsize=1,
+ )
encoder.start_episode(video_keys, tmp_path)
# Feed many frames quickly - with queue_maxsize=1, some will be dropped
diff --git a/tests/datasets/test_subtask_dataset.py b/tests/datasets/test_subtask_dataset.py
deleted file mode 100644
index bb77b77d1..000000000
--- a/tests/datasets/test_subtask_dataset.py
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Tests for subtask functionality in LeRobotDataset.
-
-These tests verify that:
-- Subtask information is correctly loaded from datasets that have subtask data
-- The __getitem__ method correctly adds subtask strings to returned items
-- Subtask handling gracefully handles missing data
-"""
-
-import pytest
-
-pytest.importorskip("pandas", reason="pandas is required (install lerobot[dataset])")
-
-import pandas as pd # noqa: E402
-import torch
-
-from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-
-class TestSubtaskDataset:
- """Tests for subtask handling in LeRobotDataset."""
-
- @pytest.fixture
- def subtask_dataset(self):
- """Load the test subtask dataset from the hub."""
- # Use lerobot/pusht-subtask dataset with episode 1
- return LeRobotDataset(
- repo_id="lerobot/pusht-subtask",
- episodes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
- )
-
- def test_subtask_dataset_loads(self, subtask_dataset):
- """Test that the subtask dataset loads successfully."""
- assert subtask_dataset is not None
- assert len(subtask_dataset) > 0
-
- def test_subtask_metadata_loaded(self, subtask_dataset):
- """Test that subtask metadata is loaded when present in dataset."""
- # The dataset should have subtasks metadata loaded
- assert subtask_dataset.meta.subtasks is not None
- assert isinstance(subtask_dataset.meta.subtasks, pd.DataFrame)
-
- def test_subtask_index_in_features(self, subtask_dataset):
- """Test that subtask_index is a feature when dataset has subtasks."""
- assert "subtask_index" in subtask_dataset.features
-
- def test_getitem_returns_subtask_string(self, subtask_dataset):
- """Test that __getitem__ correctly adds subtask string to returned item."""
- item = subtask_dataset[0]
-
- # Subtask should be present in the returned item
- assert "subtask" in item
- assert isinstance(item["subtask"], str)
- assert len(item["subtask"]) > 0 # Should not be empty
-
- def test_getitem_has_subtask_index(self, subtask_dataset):
- """Test that __getitem__ includes subtask_index."""
- item = subtask_dataset[0]
-
- assert "subtask_index" in item
- assert isinstance(item["subtask_index"], torch.Tensor)
-
- def test_subtask_index_maps_to_valid_subtask(self, subtask_dataset):
- """Test that subtask_index correctly maps to a subtask in metadata."""
- item = subtask_dataset[0]
-
- subtask_idx = item["subtask_index"].item()
- subtask_from_metadata = subtask_dataset.meta.subtasks.iloc[subtask_idx].name
-
- assert item["subtask"] == subtask_from_metadata
-
- def test_all_items_have_subtask(self, subtask_dataset):
- """Test that all items in the dataset have subtask information."""
- for i in range(min(len(subtask_dataset), 5)): # Check first 5 items
- item = subtask_dataset[i]
- assert "subtask" in item
- assert isinstance(item["subtask"], str)
-
- def test_task_and_subtask_coexist(self, subtask_dataset):
- """Test that both task and subtask are present in returned items."""
- item = subtask_dataset[0]
-
- # Both task and subtask should be present
- assert "task" in item
- assert "subtask" in item
- assert isinstance(item["task"], str)
- assert isinstance(item["subtask"], str)
-
-
-class TestSubtaskDatasetMissing:
- """Tests for graceful handling when subtask data is missing."""
-
- @pytest.fixture
- def dataset_without_subtasks(self, tmp_path, empty_lerobot_dataset_factory):
- """Create a dataset without subtask information."""
- features = {"state": {"dtype": "float32", "shape": (2,), "names": None}}
- dataset = empty_lerobot_dataset_factory(root=tmp_path / "no_subtask", features=features)
-
- # Add some frames and save
- for _ in range(5):
- dataset.add_frame({"state": torch.randn(2), "task": "Test task"})
- dataset.save_episode()
- dataset.finalize()
-
- # Reload the dataset
- return LeRobotDataset(dataset.repo_id, root=dataset.root)
-
- def test_no_subtask_in_features(self, dataset_without_subtasks):
- """Test that subtask_index is not in features when not provided."""
- assert "subtask_index" not in dataset_without_subtasks.features
-
- def test_getitem_without_subtask(self, dataset_without_subtasks):
- """Test that __getitem__ works when subtask is not present."""
- item = dataset_without_subtasks[0]
-
- # Item should still be retrievable
- assert item is not None
- assert "state" in item
- assert "task" in item
-
- # Subtask should NOT be present
- assert "subtask" not in item
-
- def test_subtasks_metadata_is_none(self, dataset_without_subtasks):
- """Test that subtasks metadata is None when not present."""
- assert dataset_without_subtasks.meta.subtasks is None
-
-
-class TestSubtaskEdgeCases:
- """Edge case tests for subtask handling."""
-
- def test_subtask_with_multiple_episodes(self):
- """Test subtask handling with multiple episodes if available."""
- try:
- dataset = LeRobotDataset(
- repo_id="lerobot/pusht-subtask",
- episodes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
- )
- except Exception:
- pytest.skip("Could not load test-subtask dataset")
-
- # Check first and last items have valid subtasks
- first_item = dataset[0]
- last_item = dataset[len(dataset) - 1]
-
- assert "subtask" in first_item
- assert "subtask" in last_item
- assert isinstance(first_item["subtask"], str)
- assert isinstance(last_item["subtask"], str)
-
- def test_subtask_index_consistency(self):
- """Test that same subtask_index returns same subtask string."""
- try:
- dataset = LeRobotDataset(
- repo_id="lerobot/pusht-subtask",
- episodes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
- )
- except Exception:
- pytest.skip("Could not load test-subtask dataset")
-
- if len(dataset) < 2:
- pytest.skip("Dataset too small for this test")
-
- # Collect subtask_index to subtask mappings
- subtask_map = {}
- for i in range(min(len(dataset), 10)):
- item = dataset[i]
- idx = item["subtask_index"].item()
- subtask = item["subtask"]
-
- if idx in subtask_map:
- # Same index should always return same subtask
- assert subtask_map[idx] == subtask, (
- f"Inconsistent subtask for index {idx}: '{subtask_map[idx]}' vs '{subtask}'"
- )
- else:
- subtask_map[idx] = subtask
diff --git a/tests/datasets/test_video_decoder_cache.py b/tests/datasets/test_video_decoder_cache.py
new file mode 100644
index 000000000..6e69f8403
--- /dev/null
+++ b/tests/datasets/test_video_decoder_cache.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for ``lerobot.datasets.video_utils.VideoDecoderCache``.
+
+These cover the LRU bounding + file-handle release behaviour added to prevent
+unbounded growth when iterating over datasets with many distinct video files
+(observed: ~35 GB anon-rss per DataLoader worker on an 8 k-file dataset).
+"""
+
+import shutil
+from pathlib import Path
+
+import pytest
+
+pytest.importorskip("torchcodec", reason="torchcodec is required (install lerobot[dataset])")
+
+from lerobot.datasets.video_utils import VideoDecoderCache # noqa: E402
+
+TEST_ARTIFACTS_DIR = Path(__file__).resolve().parent.parent / "artifacts" / "encoded_videos"
+SRC_CLIP = TEST_ARTIFACTS_DIR / "clip_4frames.mp4"
+
+
+def _make_distinct_clips(tmp_path: Path, n: int) -> list[Path]:
+ """Copy the small reference mp4 to ``n`` distinct paths.
+
+ The cache keys on absolute path, so distinct paths force distinct cache entries
+ even though the file contents are identical.
+ """
+ assert SRC_CLIP.exists(), f"missing test artifact {SRC_CLIP}"
+ paths = []
+ for i in range(n):
+ dst = tmp_path / f"clip_{i:04d}.mp4"
+ shutil.copyfile(SRC_CLIP, dst)
+ paths.append(dst)
+ return paths
+
+
+class TestVideoDecoderCacheBounded:
+ def test_default_cache_is_bounded(self):
+ """The default cache must have a finite ``max_size`` to bound RSS growth."""
+ cache = VideoDecoderCache()
+ assert cache.max_size is not None, "default cache must be bounded"
+ assert cache.max_size > 0
+
+ def test_size_capped_at_max_size(self, tmp_path):
+ """``get_decoder`` for >``max_size`` distinct paths must NOT grow without bound."""
+ paths = _make_distinct_clips(tmp_path, n=5)
+ cache = VideoDecoderCache(max_size=2)
+ for p in paths:
+ cache.get_decoder(p)
+ assert cache.size() == 2
+
+ def test_evicts_least_recently_used(self, tmp_path):
+ """Re-accessing an entry must promote it; the LRU entry is the one evicted."""
+ paths = _make_distinct_clips(tmp_path, n=3)
+ cache = VideoDecoderCache(max_size=2)
+
+ cache.get_decoder(paths[0])
+ cache.get_decoder(paths[1])
+ cache.get_decoder(paths[0]) # promote paths[0] to MRU; paths[1] is now LRU
+ cache.get_decoder(paths[2]) # should evict paths[1]
+
+ assert str(paths[0]) in cache # MRU stays
+ assert str(paths[1]) not in cache # LRU evicted
+ assert str(paths[2]) in cache # newest stays
+
+ def test_eviction_closes_file_handle(self, tmp_path):
+ """Evicting an entry must close its fsspec file handle (otherwise we leak FDs)."""
+ paths = _make_distinct_clips(tmp_path, n=2)
+ cache = VideoDecoderCache(max_size=1)
+
+ cache.get_decoder(paths[0])
+        # Reach into the cache to capture the handle before it is evicted. Reading
+        # the private ``_cache`` mapping is the most direct way to prove that the
+        # file descriptor is actually released.
+ evicted_handle = cache._cache[str(paths[0])][1]
+ assert evicted_handle.closed is False
+
+ cache.get_decoder(paths[1]) # forces eviction of paths[0]
+
+ assert evicted_handle.closed is True
+
+ def test_clear_closes_all_file_handles(self, tmp_path):
+ """``clear()`` must close every cached file handle."""
+ paths = _make_distinct_clips(tmp_path, n=3)
+ cache = VideoDecoderCache(max_size=10)
+
+ for p in paths:
+ cache.get_decoder(p)
+ handles = [entry[1] for entry in cache._cache.values()]
+ assert all(not h.closed for h in handles)
+
+ cache.clear()
+
+ assert cache.size() == 0
+ assert all(h.closed for h in handles)
+
+ def test_hit_does_not_reopen_or_evict(self, tmp_path):
+ """A cache hit must return the same decoder instance without touching the cap."""
+ paths = _make_distinct_clips(tmp_path, n=1)
+ cache = VideoDecoderCache(max_size=2)
+
+ first = cache.get_decoder(paths[0])
+ second = cache.get_decoder(paths[0])
+
+ assert first is second
+ assert cache.size() == 1
+
+ def test_unbounded_when_max_size_none(self, tmp_path):
+ """``max_size=None`` preserves the legacy unbounded behaviour."""
+ paths = _make_distinct_clips(tmp_path, n=4)
+ cache = VideoDecoderCache(max_size=None)
+ for p in paths:
+ cache.get_decoder(p)
+ assert cache.size() == 4
+
+ def test_env_var_overrides_default(self, tmp_path, monkeypatch):
+ """``LEROBOT_VIDEO_DECODER_CACHE_SIZE`` env var sets the default ``max_size``."""
+ monkeypatch.setenv("LEROBOT_VIDEO_DECODER_CACHE_SIZE", "3")
+ cache = VideoDecoderCache()
+ assert cache.max_size == 3
+
+ paths = _make_distinct_clips(tmp_path, n=5)
+ for p in paths:
+ cache.get_decoder(p)
+ assert cache.size() == 3
diff --git a/tests/datasets/test_video_encoding.py b/tests/datasets/test_video_encoding.py
new file mode 100644
index 000000000..1af61e9f9
--- /dev/null
+++ b/tests/datasets/test_video_encoding.py
@@ -0,0 +1,626 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for ``lerobot.datasets.video_utils`` encoding functions and ``lerobot.configs.video.VideoEncoderConfig`` config class."""
+
+import json
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+pytest.importorskip("av", reason="av is required (install lerobot[dataset])")
+
+import av # noqa: E402
+
+from lerobot.configs import VALID_VIDEO_CODECS, VideoEncoderConfig
+from lerobot.datasets.image_writer import write_image
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.datasets.pyav_utils import get_codec
+from lerobot.datasets.utils import INFO_PATH
+from lerobot.datasets.video_utils import (
+ concatenate_video_files,
+ encode_video_frames,
+ get_video_info,
+ reencode_video,
+)
+from tests.fixtures.constants import DUMMY_VIDEO_INFO
+
+
+# Per-codec skip markers — validation tests only fire when the codec is available
+def _require_encoder(vcodec: str) -> pytest.MarkDecorator:
+    """Return a ``skipif`` marker that skips a test when ``get_codec(vcodec)`` yields ``None``."""
+    return pytest.mark.skipif(get_codec(vcodec) is None, reason=f"{vcodec!r} not in local FFmpeg build")
+
+
+require_libsvtav1 = _require_encoder("libsvtav1")
+require_h264 = _require_encoder("h264")
+require_videotoolbox = _require_encoder("h264_videotoolbox")
+require_nvenc = _require_encoder("h264_nvenc")
+require_vaapi = _require_encoder("h264_vaapi")
+require_qsv = _require_encoder("h264_qsv")
+
+
+# ─── VideoEncoderConfig / codec options ──────────────────────────────
+
+
+class TestCodecOptions:
+ @require_libsvtav1
+    def test_libsvtav1_defaults(self):
+        """A no-argument config emits ``g=2``, ``crf=30`` and ``preset=12`` as codec options."""
+        default_opts = VideoEncoderConfig().get_codec_options()
+        assert default_opts["g"] == 2
+        assert default_opts["crf"] == 30
+        assert default_opts["preset"] == 12
+
+ @require_libsvtav1
+    def test_libsvtav1_custom_preset(self):
+        custom_opts = VideoEncoderConfig(preset=8).get_codec_options()  # explicit preset is forwarded
+        assert custom_opts["preset"] == 8
+
+ @require_h264
+    def test_h264_options(self):
+        """h264 forwards ``g`` and ``crf`` unchanged and emits no ``preset`` key when it is ``None``."""
+        opts = VideoEncoderConfig(vcodec="h264", g=10, crf=23, preset=None).get_codec_options()
+        assert opts["g"] == 10
+        assert opts["crf"] == 23
+        assert "preset" not in opts
+
+ @require_videotoolbox
+    def test_videotoolbox_options(self):
+        """videotoolbox: with ``crf=30`` the options carry ``q:v == 40`` and no ``crf`` key."""
+        opts = VideoEncoderConfig(vcodec="h264_videotoolbox", g=2, crf=30, preset=None).get_codec_options()
+        assert opts["g"] == 2
+        assert opts["q:v"] == 40
+        assert "crf" not in opts
+
+    @require_nvenc
+    def test_nvenc_options(self):
+        """nvenc: ``crf=25`` is emitted as ``qp`` alongside ``rc == 0``; no ``crf`` key; ``g`` is kept."""
+        opts = VideoEncoderConfig(vcodec="h264_nvenc", g=2, crf=25, preset=None).get_codec_options()
+        assert opts["rc"] == 0
+        assert opts["qp"] == 25
+        assert "crf" not in opts
+        assert opts["g"] == 2
+
+    @require_vaapi
+    def test_vaapi_options(self):
+        cfg = VideoEncoderConfig(vcodec="h264_vaapi", crf=28, preset=None)
+        assert cfg.get_codec_options()["qp"] == 28  # vaapi: the crf value is emitted under "qp"
+
+    @require_qsv
+    def test_qsv_options(self):
+        cfg = VideoEncoderConfig(vcodec="h264_qsv", crf=25, preset=None)
+        assert cfg.get_codec_options()["global_quality"] == 25  # qsv: crf lands in "global_quality"
+
+ @require_h264
+    def test_no_g_no_crf(self):
+        """``g=None`` / ``crf=None`` leave the corresponding keys out of the codec options."""
+        opts = VideoEncoderConfig(vcodec="h264", g=None, crf=None, preset=None).get_codec_options()
+        assert "g" not in opts
+        assert "crf" not in opts
+
+ @require_libsvtav1
+ def test_encoder_threads_libsvtav1(self):
+ cfg = VideoEncoderConfig(fast_decode=0)
+ opts = cfg.get_codec_options(encoder_threads=4)
+ assert "lp=4" in opts.get("svtav1-params", "")
+
+ @require_h264
+ def test_encoder_threads_h264(self):
+ cfg = VideoEncoderConfig(vcodec="h264", preset=None)
+ assert cfg.get_codec_options(encoder_threads=2)["threads"] == 2
+
+ @require_libsvtav1
+ def test_fast_decode_libsvtav1(self):
+ cfg = VideoEncoderConfig(fast_decode=1)
+ opts = cfg.get_codec_options()
+ assert "fast-decode=1" in opts.get("svtav1-params", "")
+
+ @require_libsvtav1
+    def test_libsvtav1_fast_decode_clamped_to_svt_range(self):
+        """``fast_decode`` outside [0, 2] (SVT-AV1 FastDecode) is clamped when written to svtav1-params."""
+        # (requested value, fragment expected in the rendered params string)
+        for requested, fragment in ((100, "fast-decode=2"), (-5, "fast-decode=0")):
+            opts = VideoEncoderConfig(fast_decode=requested).get_codec_options()
+            assert fragment in opts.get("svtav1-params", "")
+
+ @require_h264
+ def test_fast_decode_h264(self):
+ cfg = VideoEncoderConfig(vcodec="h264", fast_decode=1, preset=None)
+ assert cfg.get_codec_options()["tune"] == "fastdecode"
+
+ @require_libsvtav1
+ def test_pix_fmt_unsupported_raises(self):
+ """Passing an unsupported pix_fmt is a hard error."""
+ with pytest.raises(ValueError, match="pix_fmt"):
+ VideoEncoderConfig(pix_fmt="yuv444p") # libsvtav1 only supports yuv420p variants
+
+ @require_libsvtav1
+ @require_h264
+ def test_preset_default_behaviour(self):
+ """Empty constructor picks preset=12 (libsvtav1 path); other codecs stay None."""
+ assert VideoEncoderConfig().preset == 12
+ assert VideoEncoderConfig(vcodec="libsvtav1").preset == 12
+ assert VideoEncoderConfig(vcodec="h264").preset is None
+ assert VideoEncoderConfig(vcodec="h264", preset=None).preset is None
+
+ @require_h264
+ def test_preset_string_on_h264(self):
+ """h264 accepts string presets and forwards them to FFmpeg."""
+ cfg = VideoEncoderConfig(vcodec="h264", preset="slow")
+ assert cfg.get_codec_options()["preset"] == "slow"
+
+ @require_videotoolbox
+ def test_preset_on_videotoolbox_not_set(self):
+ """videotoolbox has no preset option at all."""
+ cfg = VideoEncoderConfig(vcodec="h264_videotoolbox", preset="slow")
+ assert "preset" not in cfg.get_codec_options()
+
+ @require_libsvtav1
+    def test_libsvtav1_preset_out_of_range_raises(self):
+        """Presets on either side of the [-2, 13] window PyAV exposes for libsvtav1 are rejected."""
+        # 100 overshoots the upper bound, -3 undershoots the lower one.
+        for bad_preset in (100, -3):
+            with pytest.raises(ValueError, match="out of range"):
+                VideoEncoderConfig(vcodec="libsvtav1", preset=bad_preset)
+
+ @require_libsvtav1
+ def test_libsvtav1_crf_out_of_range_raises(self):
+ """libsvtav1 crf must sit in [0, 63]."""
+ with pytest.raises(ValueError, match="crf.*out of range"):
+ VideoEncoderConfig(vcodec="libsvtav1", crf=64)
+
+ @require_libsvtav1
+ def test_libsvtav1_crf_rejects_python_float(self):
+ """libsvtav1 exposes ``crf`` as an INT AVOption; Python float must not pass validation."""
+ with pytest.raises(ValueError, match="float values are not allowed"):
+ VideoEncoderConfig(vcodec="libsvtav1", crf=2.5)
+
+ @require_libsvtav1
+ def test_libsvtav1_extra_crf_rejects_fractional_string(self):
+ """INT options reject fractional values even when supplied only via ``extra_options``."""
+ with pytest.raises(ValueError, match="float values are not allowed"):
+ VideoEncoderConfig(
+ vcodec="libsvtav1",
+ crf=None,
+ extra_options={"crf": "2.5"},
+ )
+
+ @require_libsvtav1
+ def test_libsvtav1_extra_crf_rejects_float(self):
+ with pytest.raises(ValueError, match="float values are not allowed"):
+ VideoEncoderConfig(
+ vcodec="libsvtav1",
+ crf=None,
+ extra_options={"crf": 2.5},
+ )
+
+ @require_h264
+    def test_h264_crf_accepts_float_and_int(self):
+        """x264 types ``crf`` as FLOAT, so an int (23) and a float (23.5) both pass through unchanged."""
+        for crf in (23, 23.5):
+            assert VideoEncoderConfig(vcodec="h264", crf=crf).get_codec_options()["crf"] == crf
+
+ @require_libsvtav1
+ def test_validate_is_rerunnable(self):
+ """After mutating a field, validate() re-checks and surfaces new issues."""
+ cfg = VideoEncoderConfig(vcodec="libsvtav1")
+ cfg.preset = 100 # now out of range
+ with pytest.raises(ValueError, match="out of range"):
+ cfg.validate()
+
+
+class TestExtraOptions:
+ @require_libsvtav1
+ def test_default_is_empty_dict(self):
+ cfg = VideoEncoderConfig()
+ assert cfg.extra_options == {}
+
+ @require_libsvtav1
+ def test_unknown_key_passes_through(self):
+ """Keys not published as AVOptions are forwarded to FFmpeg."""
+ cfg = VideoEncoderConfig(extra_options={"totally_made_up_option": "value"})
+ assert cfg.extra_options == {"totally_made_up_option": "value"}
+
+ @require_libsvtav1
+ def test_numeric_value_in_range_ok(self):
+ """libsvtav1 exposes ``qp`` as INT in [0, 63]."""
+ cfg = VideoEncoderConfig(extra_options={"qp": 30})
+ assert cfg.extra_options == {"qp": 30}
+
+ @require_libsvtav1
+ def test_numeric_out_of_range_raises(self):
+ with pytest.raises(ValueError, match=r"qp=.*out of range"):
+ VideoEncoderConfig(extra_options={"qp": 999})
+
+ @require_libsvtav1
+ def test_numeric_string_accepted_in_range(self):
+ """Numeric strings are accepted for numeric options (mirrors FFmpeg)."""
+ cfg = VideoEncoderConfig(extra_options={"qp": "18"})
+ assert cfg.extra_options == {"qp": "18"}
+
+ @require_libsvtav1
+ def test_numeric_string_out_of_range_raises(self):
+ with pytest.raises(ValueError, match=r"qp=.*out of range"):
+ VideoEncoderConfig(extra_options={"qp": "999"})
+
+ @require_libsvtav1
+ def test_non_numeric_string_on_numeric_option_raises(self):
+ with pytest.raises(ValueError, match=r"qp=.*not numeric"):
+ VideoEncoderConfig(extra_options={"qp": "medium"})
+
+ @require_libsvtav1
+ def test_bool_on_numeric_option_raises(self):
+ """``bool`` is explicitly rejected for numeric options."""
+ with pytest.raises(ValueError, match=r"qp=.*not numeric"):
+ VideoEncoderConfig(extra_options={"qp": True})
+
+ @require_h264
+ def test_string_option_passes_through_unchecked(self):
+ """String-typed AVOptions are NOT enum-checked (too many accept freeform)."""
+ cfg = VideoEncoderConfig(vcodec="h264", preset=None, extra_options={"tune": "some-future-tune"})
+ assert cfg.extra_options == {"tune": "some-future-tune"}
+
+ @require_libsvtav1
+    def test_merged_into_codec_options_and_stringified(self):
+        """An extra option keeps its Python type by default; ``as_strings=True`` renders it as ``str``."""
+        cfg = VideoEncoderConfig(extra_options={"qp": 20})
+        typed_qp = cfg.get_codec_options()["qp"]
+        assert typed_qp == 20
+        assert isinstance(typed_qp, int)
+        assert cfg.get_codec_options(as_strings=True)["qp"] == "20"
+
+ @require_libsvtav1
+    def test_structured_fields_win_on_collision(self):
+        """``crf`` set both as a field (30) and via ``extra_options`` (18): the field value is emitted."""
+        merged = VideoEncoderConfig(crf=30, extra_options={"crf": 18}).get_codec_options()
+        assert merged["crf"] == 30
+
+
+class TestEncoderDetection:
+ @require_h264
+ def test_explicit_codec_kept_when_available(self):
+ cfg = VideoEncoderConfig(vcodec="h264")
+ assert cfg.vcodec == "h264"
+
+ @require_videotoolbox
+ def test_auto_picks_videotoolbox_when_available(self):
+ """``h264_videotoolbox`` sits at the top of ``HW_VIDEO_CODECS`` so it wins when present."""
+ cfg = VideoEncoderConfig(vcodec="auto")
+ assert cfg.vcodec == "h264_videotoolbox"
+
+ def test_invalid_codec_raises(self):
+ with pytest.raises(ValueError, match="Invalid vcodec"):
+ VideoEncoderConfig(vcodec="not_a_real_codec")
+
+    def test_hw_encoder_names_listed_as_valid(self):
+        """``auto`` and the hardware encoder names are members of ``VALID_VIDEO_CODECS``."""
+        for name in ("auto", "h264_videotoolbox", "h264_nvenc"):
+            assert name in VALID_VIDEO_CODECS
+
+
+TEST_ARTIFACTS_DIR = Path(__file__).parent.parent / "artifacts" / "encoded_videos"
+
+# Default video feature set used by persistence tests.
+VIDEO_FEATURES = {
+ "observation.images.cam": {
+ "dtype": "video",
+ "shape": (64, 96, 3),
+ "names": ["height", "width", "channels"],
+ },
+ "action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]},
+}
+VIDEO_KEY = "observation.images.cam"
+
+
+def _write_frames(imgs_dir: Path, num_frames: int = 4, height: int = 64, width: int = 96) -> None:
+    imgs_dir.mkdir(parents=True, exist_ok=True)
+    for frame_idx in range(num_frames):  # one random uint8 (height, width, 3) image per frame
+        pixels = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
+        write_image(pixels, imgs_dir / f"frame-{frame_idx:06d}.png")
+
+
+def _encode_video(
+    path: Path, num_frames: int = 4, fps: int = 30, cfg: VideoEncoderConfig | None = None
+) -> Path:
+    frames_dir = path.parent / f"imgs_{path.stem}"  # scratch PNG directory next to the output file
+    _write_frames(frames_dir, num_frames=num_frames)
+    encode_video_frames(frames_dir, path, fps=fps, camera_encoder=cfg, overwrite=True)
+    return path
+
+
+def _read_feature_info(dataset: LeRobotDataset) -> dict:
+    with (dataset.root / INFO_PATH).open() as info_file:  # parse the info file fresh from disk
+        return json.load(info_file)["features"][VIDEO_KEY]["info"]
+
+
+def _add_frames(dataset: LeRobotDataset, num_frames: int, video_keys: list[str] | None = None) -> None:
+    """Append ``num_frames`` synthetic frames: random uint8 for video keys, float32 zeros otherwise."""
+    from lerobot.utils.constants import DEFAULT_FEATURES
+
+    camera_keys = dataset.meta.video_keys if video_keys is None else video_keys
+    for _ in range(num_frames):
+        frame: dict = {"task": "test"}
+        for name, spec in dataset.meta.features.items():
+            if name in DEFAULT_FEATURES:
+                continue  # entries listed in DEFAULT_FEATURES are not filled in by this helper
+            shape = spec["shape"]
+            if name in camera_keys:
+                frame[name] = np.random.randint(0, 256, shape, dtype=np.uint8)
+            else:
+                frame[name] = np.zeros(shape, dtype=np.float32)
+        dataset.add_frame(frame)
+
+
+class TestGetVideoInfo:
+    def test_returns_all_stream_fields(self):
+        """Without an encoder config only stream-derived keys are reported, no ``video.g/crf/preset``."""
+        info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4")
+
+        assert (info["video.height"], info["video.width"]) == (64, 96)
+        assert info["video.pix_fmt"] == "yuv420p"
+        assert info["video.fps"] == 30
+        assert info["video.channels"] == 3
+        assert info["video.is_depth_map"] is False
+        assert info["has_audio"] is False
+        # Encoder settings must be absent when no ``camera_encoder`` was supplied.
+        for encoder_key in ("video.g", "video.crf", "video.preset"):
+            assert encoder_key not in info
+
+ @require_libsvtav1
+ def test_merges_encoder_config_as_video_prefixed_entries(self):
+ cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
+
+ info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg)
+
+ assert info["video.g"] == 2
+ assert info["video.crf"] == 30
+ assert info["video.preset"] == 12
+ assert info["video.fast_decode"] == 0
+ assert info["video.video_backend"] == "pyav"
+ assert info["video.extra_options"] == {}
+
+ @require_libsvtav1
+    def test_stream_derived_keys_take_precedence_over_config(self):
+        """Keys readable from the stream are reported from the stream, not from the encoder config."""
+        cfg = VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p")
+        info = get_video_info(TEST_ARTIFACTS_DIR / "clip_4frames.mp4", camera_encoder=cfg)
+
+        assert info["video.codec"] == "av1"  # the stream's codec name, not cfg.vcodec ("libsvtav1")
+        assert info["video.pix_fmt"] == "yuv420p"
+
+
+class TestEncodeVideoFrames:
+ @require_libsvtav1
+ def test_produces_readable_mp4(self, tmp_path):
+ video_path = _encode_video(tmp_path / "out.mp4")
+
+ assert video_path.exists()
+ info = get_video_info(video_path)
+ assert info["video.height"] == 64
+ assert info["video.width"] == 96
+
+ @require_libsvtav1
+    def test_frame_count_and_duration_match_input(self, tmp_path):
+        """Ten frames at 30 fps decode back to ten frames lasting num_frames / fps seconds (within 0.1)."""
+        num_frames, fps = 10, 30
+        video_path = _encode_video(tmp_path / "out.mp4", num_frames=num_frames, fps=fps)
+
+        with av.open(str(video_path)) as container:
+            stream = container.streams.video[0]
+            decoded_count = sum(1 for _ in container.decode(stream))
+            # Prefer the stream's own duration; fall back to the container-level one when it is unset.
+            if stream.duration is not None:
+                seconds = float(stream.duration * stream.time_base)
+            else:
+                seconds = float(container.duration / av.time_base)
+
+        assert decoded_count == num_frames
+        assert abs(seconds - num_frames / fps) < 0.1
+
+    def test_overwrite_false_skips_existing_file(self, tmp_path):
+        """With ``overwrite=False`` an already-present output file is left byte-for-byte untouched."""
+        frames_dir, target = tmp_path / "imgs", tmp_path / "out.mp4"
+        _write_frames(frames_dir)
+        original_bytes = b"pre-existing content"
+        target.write_bytes(original_bytes)
+
+        encode_video_frames(frames_dir, target, fps=30, overwrite=False)
+
+        assert target.read_bytes() == original_bytes
+
+ @require_libsvtav1
+ def test_overwrite_true_replaces_existing_file(self, tmp_path):
+ imgs_dir = tmp_path / "imgs"
+ _write_frames(imgs_dir)
+ video_path = tmp_path / "out.mp4"
+ video_path.write_bytes(b"stale content")
+
+ encode_video_frames(imgs_dir, video_path, fps=30, overwrite=True)
+
+ info = get_video_info(video_path)
+ assert info["video.height"] == 64
+
+ @require_libsvtav1
+ def test_custom_encoder_config_fields_stored_in_info(self, tmp_path):
+ """All stream-derived and encoder config fields are present after encoding."""
+ cfg = VideoEncoderConfig(vcodec="libsvtav1", g=4, crf=25, preset=10)
+ video_path = _encode_video(tmp_path / "out.mp4", num_frames=4, fps=30, cfg=cfg)
+
+ info = get_video_info(video_path, camera_encoder=cfg)
+
+ # Stream-derived
+ assert info["video.height"] == 64
+ assert info["video.width"] == 96
+ assert info["video.channels"] == 3
+ assert info["video.codec"] == "av1"
+ assert info["video.pix_fmt"] == "yuv420p"
+ assert info["video.fps"] == 30
+ assert info["video.is_depth_map"] is False
+ assert info["has_audio"] is False
+ # Encoder config
+ assert info["video.g"] == 4
+ assert info["video.crf"] == 25
+ assert info["video.preset"] == 10
+ assert info["video.fast_decode"] == 0
+ assert info["video.video_backend"] == "pyav"
+ assert info["video.extra_options"] == {}
+
+
+class TestReencodeVideo:
+ @require_libsvtav1
+ @require_h264
+    def test_reencode_video(self, tmp_path):
+        """Re-encode the 4-frame artifact as h264/yuv444p; check frame count, stream info, config keys."""
+        source = TEST_ARTIFACTS_DIR / "clip_4frames.mp4"
+        target = tmp_path / "reencoded.mp4"
+        cfg = VideoEncoderConfig(vcodec="h264", g=6, crf=23, pix_fmt="yuv444p")
+        reencode_video(source, target, camera_encoder=cfg, overwrite=True)
+
+        assert target.exists()
+        with av.open(str(target)) as container:
+            decoded_count = sum(1 for _ in container.decode(video=0))
+        assert decoded_count == 4
+
+        info = get_video_info(target, camera_encoder=cfg)
+        assert info["video.codec"] == "h264"
+        assert info["video.pix_fmt"] == "yuv444p"
+        assert (info["video.height"], info["video.width"]) == (64, 96)
+        assert info["video.fps"] == 30
+        assert info["video.g"] == 6
+        assert info["video.crf"] == 23
+
+
+class TestConcatenateVideoFiles:
+    def test_two_clips_frame_count(self, tmp_path):
+        """Joining the 6-frame and 4-frame artifacts produces a video that decodes to 10 frames."""
+        joined = tmp_path / "out.mp4"
+        inputs = [TEST_ARTIFACTS_DIR / "clip_6frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"]
+        concatenate_video_files(inputs, joined)
+
+        with av.open(str(joined)) as container:
+            frame_total = sum(1 for _ in container.decode(video=0))
+
+        assert frame_total == 10
+
+    def test_three_clips_frame_count(self, tmp_path):
+        """The same 5-frame artifact concatenated three times decodes to 15 frames."""
+        joined = tmp_path / "out.mp4"
+        concatenate_video_files([TEST_ARTIFACTS_DIR / "clip_5frames.mp4"] * 3, joined)
+
+        with av.open(str(joined)) as container:
+            frame_total = sum(1 for _ in container.decode(video=0))
+        assert frame_total == 15
+
+ @require_libsvtav1
+ def test_geometry_preserved(self, tmp_path):
+ """Output resolution, fps, codec and pixel format must match the inputs."""
+ out = tmp_path / "out.mp4"
+ concatenate_video_files(
+ [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_4frames.mp4"], out
+ )
+
+ info = get_video_info(out)
+ assert info["video.height"] == 64
+ assert info["video.width"] == 96
+ assert info["video.fps"] == 30
+ assert info["video.codec"] == "av1"
+ assert info["video.pix_fmt"] == "yuv420p"
+
+ def test_compatibility_check_raises_on_different_codec(self, tmp_path):
+ with pytest.raises(ValueError):
+ concatenate_video_files(
+ [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_h264.mp4"],
+ tmp_path / "out.mp4",
+ compatibility_check=True,
+ )
+
+ def test_compatibility_check_raises_on_different_resolution(self, tmp_path):
+ with pytest.raises(ValueError):
+ concatenate_video_files(
+ [TEST_ARTIFACTS_DIR / "clip_4frames.mp4", TEST_ARTIFACTS_DIR / "clip_32x48.mp4"],
+ tmp_path / "out.mp4",
+ compatibility_check=True,
+ )
+
+
+class TestEncoderConfigPersistence:
+ """Encoder config must be stored as ``video.`` entries in
+ ``info["features"][key]["info"]`` when the first episode is saved.
+ """
+
+ @require_libsvtav1
+ def test_first_episode_save_persists_encoder_config(self, tmp_path, empty_lerobot_dataset_factory):
+ cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
+ dataset = empty_lerobot_dataset_factory(
+ root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg
+ )
+
+ _add_frames(dataset, num_frames=4)
+ dataset.save_episode()
+ dataset.finalize()
+
+ info = _read_feature_info(dataset)
+
+ assert info["video.height"] == 64
+ assert info["video.width"] == 96
+ assert info["video.fps"] == 30
+ assert info["video.g"] == 2
+ assert info["video.crf"] == 30
+ assert info["video.preset"] == 12
+ assert info["video.fast_decode"] == 0
+ assert info["video.video_backend"] == "pyav"
+ assert info["video.extra_options"] == {}
+
+ @require_libsvtav1
+    def test_second_episode_does_not_overwrite_encoder_fields(self, tmp_path, empty_lerobot_dataset_factory):
+        """The per-feature info written after episode one is unchanged after episode two and finalize."""
+        cfg = VideoEncoderConfig(vcodec="libsvtav1", g=2, crf=30, preset=12)
+        dataset = empty_lerobot_dataset_factory(
+            root=tmp_path / "ds", features=VIDEO_FEATURES, use_videos=True, camera_encoder=cfg
+        )
+        _add_frames(dataset, num_frames=4)
+        dataset.save_episode()
+        info_after_first = _read_feature_info(dataset)  # parsed fresh from disk, so already a snapshot
+
+        _add_frames(dataset, num_frames=4)
+        dataset.save_episode()
+        dataset.finalize()
+
+        assert _read_feature_info(dataset) == info_after_first
+
+
+class TestFromVideoInfo:
+ """``VideoEncoderConfig.from_video_info`` reconstructs an encoder config
+ from the ``video.*`` keys persisted in a dataset's ``info.json``.
+ """
+
+ @require_libsvtav1
+ def test_reconstructs_from_dummy_video_info(self):
+ cfg = VideoEncoderConfig.from_video_info(DUMMY_VIDEO_INFO)
+
+ # Canonical stream codec ``"av1"`` is aliased to the encoder name.
+ assert cfg.vcodec == "libsvtav1"
+ assert cfg.pix_fmt == DUMMY_VIDEO_INFO["video.pix_fmt"]
+ assert cfg.g == DUMMY_VIDEO_INFO["video.g"]
+ assert cfg.crf == DUMMY_VIDEO_INFO["video.crf"]
+ assert cfg.preset == DUMMY_VIDEO_INFO["video.preset"]
+ assert cfg.fast_decode == DUMMY_VIDEO_INFO["video.fast_decode"]
+ assert cfg.video_backend == DUMMY_VIDEO_INFO["video.video_backend"]
+ # ``{}`` placeholder (typical after a merge with disagreeing sources)
+ # must not leak into the reconstructed config.
+ assert cfg.extra_options == VideoEncoderConfig().extra_options
diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py
index 35d8776ce..4d578b503 100644
--- a/tests/fixtures/constants.py
+++ b/tests/fixtures/constants.py
@@ -28,17 +28,23 @@ DUMMY_MOTOR_FEATURES = {
"names": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"],
},
}
-DUMMY_CAMERA_FEATURES = {
- "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None},
- "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": None},
-}
DEFAULT_FPS = 30
DUMMY_VIDEO_INFO = {
"video.fps": DEFAULT_FPS,
"video.codec": "av1",
"video.pix_fmt": "yuv420p",
+ "video.video_backend": "pyav",
+ "video.extra_options": {},
+ "video.g": 2,
+ "video.crf": 30,
+ "video.preset": 12,
+ "video.fast_decode": 0,
"video.is_depth_map": False,
"has_audio": False,
}
+DUMMY_CAMERA_FEATURES = {  # per-camera shallow copies: editing one "info" leaves DUMMY_VIDEO_INFO intact
+    "laptop": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": {**DUMMY_VIDEO_INFO}},
+    "phone": {"shape": (64, 96, 3), "names": ["height", "width", "channels"], "info": {**DUMMY_VIDEO_INFO}},
+}
DUMMY_CHW = (3, 96, 128)
DUMMY_HWC = (96, 128, 3)
diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py
index 48128a8d0..a6e349778 100644
--- a/tests/fixtures/dataset_factories.py
+++ b/tests/fixtures/dataset_factories.py
@@ -46,7 +46,6 @@ from tests.fixtures.constants import (
DUMMY_MOTOR_FEATURES,
DUMMY_REPO_ID,
DUMMY_ROBOT_TYPE,
- DUMMY_VIDEO_INFO,
)
@@ -134,9 +133,7 @@ def features_factory():
use_videos: bool = True,
) -> dict:
if use_videos:
- camera_ft = {
- key: {"dtype": "video", **ft, **DUMMY_VIDEO_INFO} for key, ft in camera_features.items()
- }
+ camera_ft = {key: {"dtype": "video", **ft} for key, ft in camera_features.items()}
else:
camera_ft = {key: {"dtype": "image", **ft} for key, ft in camera_features.items()}
return {
diff --git a/tests/processor/test_render_messages_processor.py b/tests/processor/test_render_messages_processor.py
new file mode 100644
index 000000000..f96e3c0ab
--- /dev/null
+++ b/tests/processor/test_render_messages_processor.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import pytest
+
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+
+import torch # noqa: E402
+
+from lerobot.configs.recipe import MessageTurn, TrainingRecipe # noqa: E402
+from lerobot.processor.converters import create_transition # noqa: E402
+from lerobot.processor.render_messages_processor import RenderMessagesStep # noqa: E402
+from lerobot.types import TransitionKey # noqa: E402
+
+
+def test_render_messages_step_noops_without_language_columns():
+    """A transition whose complementary data holds only ``task`` comes back equal to the input."""
+    user_turn = MessageTurn(role="user", content="${task}", stream="high_level")
+    assistant_turn = MessageTurn(role="assistant", content="${subtask}", stream="low_level", target=True)
+    recipe = TrainingRecipe(messages=[user_turn, assistant_turn])
+    transition = create_transition(complementary_data={"task": "do it"})
+
+    rendered = RenderMessagesStep(recipe)(transition)
+
+    assert rendered == transition
+
+
+def test_render_messages_step_renders_and_drops_raw_language():
+ recipe = TrainingRecipe(
+ messages=[
+ MessageTurn(role="user", content="${task}", stream="high_level"),
+ MessageTurn(role="assistant", content="${subtask}", stream="low_level", target=True),
+ ]
+ )
+ transition = create_transition(
+ complementary_data={
+ "task": "do it",
+ "timestamp": torch.tensor(0.0),
+ "index": torch.tensor(7),
+ "language_persistent": [
+ {
+ "role": "assistant",
+ "content": "reach carefully",
+ "style": "subtask",
+ "timestamp": 0.0,
+ "camera": None,
+ "tool_calls": None,
+ }
+ ],
+ "language_events": [],
+ }
+ )
+
+ out = RenderMessagesStep(recipe)(transition)
+ data = out[TransitionKey.COMPLEMENTARY_DATA]
+
+ assert "language_persistent" not in data
+ assert "language_events" not in data
+ assert data["messages"][-1]["content"] == "reach carefully"
+ assert data["message_streams"] == ["high_level", "low_level"]
+ assert data["target_message_indices"] == [1]
diff --git a/tests/robots/test_rebot_b601_follower.py b/tests/robots/test_rebot_b601_follower.py
new file mode 100644
index 000000000..553675be0
--- /dev/null
+++ b/tests/robots/test_rebot_b601_follower.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from lerobot.robots.bi_rebot_b601_follower import BiRebotB601Follower, BiRebotB601FollowerConfig
+from lerobot.robots.rebot_b601_follower import (
+ RebotB601Follower,
+ RebotB601FollowerConfig,
+ RebotB601FollowerRobotConfig,
+)
+
+_MODULE = "lerobot.robots.rebot_b601_follower.rebot_b601_follower"
+
+
+def _make_motor_mock(position_rad: float = 0.0) -> MagicMock:
+    """Fake motor whose ``get_state()`` returns an object with ``pos == position_rad``."""
+    fake_motor = MagicMock(name="MotorMock")
+    reported_state = MagicMock(pos=position_rad)
+    fake_motor.get_state.return_value = reported_state
+    return fake_motor
+
+
+def _make_bus_mock() -> MagicMock:
+    """Fake controller: the k-th ``add_damiao_motor`` call yields a motor reporting ``radians(k)``."""
+    fake_bus = MagicMock(name="MotorBridgeControllerMock")
+    fake_bus._motor_count = 0  # number of motors handed out so far
+
+    def _next_motor(_send_id, _recv_id, _model):
+        fake_bus._motor_count = ordinal = fake_bus._motor_count + 1
+        return _make_motor_mock(position_rad=math.radians(ordinal))
+
+    fake_bus.add_damiao_motor.side_effect = _next_motor
+    return fake_bus
+
+
+@pytest.fixture
+def follower():
+    """Connected ``RebotB601Follower`` whose controller class is mocked; disconnected on teardown."""
+    fake_bus = _make_bus_mock()
+    with (
+        patch(f"{_MODULE}.require_package", lambda *a, **kw: None),
+        patch(f"{_MODULE}.MotorBridgeController") as controller_cls,
+        patch(f"{_MODULE}.MotorBridgeMode", MagicMock()),
+    ):
+        controller_cls.from_dm_serial.return_value = fake_bus
+        robot = RebotB601Follower(RebotB601FollowerRobotConfig(port="/dev/null"))
+        robot.connect(calibrate=False)
+        yield robot
+        if robot.is_connected:
+            robot.disconnect()
+
+
+def test_features_match_joints():
+    with patch(f"{_MODULE}.require_package", lambda *a, **kw: None):
+        robot = RebotB601Follower(RebotB601FollowerRobotConfig(port="/dev/null"))
+    joint_keys = {f"{name}.pos" for name in robot.motor_names}  # one "<motor>.pos" entry per motor
+    assert "gripper.pos" in joint_keys
+    assert set(robot.action_features) == joint_keys
+    assert set(robot.observation_features) == joint_keys
+
+
+def test_connect_disconnect(follower):
+ assert follower.is_connected
+ follower.disconnect()
+ assert not follower.is_connected
+
+
+def test_get_observation_converts_to_degrees(follower):
+    observation = follower.get_observation()
+    assert set(observation) == {f"{name}.pos" for name in follower.motor_names}
+    # The k-th motor created by the bus mock reports radians(k), so k degrees is expected back.
+    for ordinal, name in enumerate(follower.motor_names, start=1):
+        assert observation[f"{name}.pos"] == pytest.approx(math.degrees(math.radians(ordinal)))
+
+
+def test_send_action_clips_to_joint_limits(follower):
+ # shoulder_pan limit is (-145, 145); request beyond the upper bound.
+ returned = follower.send_action({"shoulder_pan.pos": 999.0})
+ assert returned["shoulder_pan.pos"] == 145.0
+ follower.motors["shoulder_pan"].send_pos_vel.assert_called_once()
+
+
+def test_send_action_routes_gripper_to_force_pos(follower):
+ follower.send_action({"gripper.pos": -10.0})
+ follower.motors["gripper"].send_force_pos.assert_called_once()
+ follower.motors["gripper"].send_pos_vel.assert_not_called()
+
+
+def test_bimanual_prefixes_features():
+    """Bimanual action features carry ``left_`` / ``right_`` prefixes, including both grippers."""
+    with patch(f"{_MODULE}.require_package", lambda *a, **kw: None):
+        left_cfg = RebotB601FollowerConfig(port="/dev/null0")
+        right_cfg = RebotB601FollowerConfig(port="/dev/null1")
+        cfg = BiRebotB601FollowerConfig(left_arm_config=left_cfg, right_arm_config=right_cfg)
+        robot = BiRebotB601Follower(cfg)
+    for side_prefix in ("left_", "right_"):
+        assert any(k.startswith(side_prefix) for k in robot.action_features)
+    assert "left_gripper.pos" in robot.action_features
+    assert "right_gripper.pos" in robot.action_features
diff --git a/tests/teleoperators/test_rebot_102_leader.py b/tests/teleoperators/test_rebot_102_leader.py
new file mode 100644
index 000000000..bea10e131
--- /dev/null
+++ b/tests/teleoperators/test_rebot_102_leader.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from lerobot.teleoperators.bi_rebot_102_leader import BiRebotArm102Leader, BiRebotArm102LeaderConfig
+from lerobot.teleoperators.rebot_102_leader import (
+ RebotArm102Leader,
+ RebotArm102LeaderConfig,
+ RebotArm102LeaderTeleopConfig,
+)
+
+_MODULE = "lerobot.teleoperators.rebot_102_leader.rebot_102_leader"
+
+
+def _make_bus_mock(joint_ids: dict[str, int]) -> MagicMock:
+ bus = MagicMock(name="FashionStarServoMock")
+ bus.ping.return_value = True
+
+ def _sync_monitor(ids):
+ # Report a raw angle of 5 degrees for every requested servo.
+ monitors = {}
+ for servo_id in ids:
+ monitor = MagicMock()
+ monitor.angle_deg = 5.0
+ monitors[servo_id] = monitor
+ return monitors
+
+ bus.sync_monitor.side_effect = _sync_monitor
+ return bus
+
+
+@pytest.fixture
+def leader():
+ cfg = RebotArm102LeaderTeleopConfig(port="/dev/null")
+ bus_mock = _make_bus_mock(cfg.joint_ids)
+ with (
+ patch(f"{_MODULE}.require_package", lambda *a, **kw: None),
+ patch(f"{_MODULE}.FashionStarServo", return_value=bus_mock),
+ ):
+ teleop = RebotArm102Leader(cfg)
+ teleop.connect(calibrate=False)
+ yield teleop
+ if teleop.is_connected:
+ teleop.disconnect()
+
+
+def test_action_features_match_joints():
+ with patch(f"{_MODULE}.require_package", lambda *a, **kw: None):
+ teleop = RebotArm102Leader(RebotArm102LeaderTeleopConfig(port="/dev/null"))
+ assert set(teleop.action_features) == {f"{m}.pos" for m in teleop.motor_names}
+ assert teleop.feedback_features == {}
+
+
+def test_connect_disconnect(leader):
+ assert leader.is_connected
+ leader.disconnect()
+ assert not leader.is_connected
+
+
+def test_get_action_applies_direction_and_clamp(leader):
+ action = leader.get_action()
+ assert set(action) == {f"{m}.pos" for m in leader.motor_names}
+ # shoulder_pan has direction -1, so a +5deg raw reading flips to -5deg.
+ assert action["shoulder_pan.pos"] == pytest.approx(-5.0)
+ # Every joint stays within its configured range.
+ for motor, value in action.items():
+ lo, hi = leader.config.joint_ranges[motor.removesuffix(".pos")]
+ assert lo <= value <= hi
+
+
+def test_send_feedback_not_implemented(leader):
+ with pytest.raises(NotImplementedError):
+ leader.send_feedback({})
+
+
+def test_bimanual_prefixes_features():
+ with patch(f"{_MODULE}.require_package", lambda *a, **kw: None):
+ cfg = BiRebotArm102LeaderConfig(
+ left_arm_config=RebotArm102LeaderConfig(port="/dev/null0"),
+ right_arm_config=RebotArm102LeaderConfig(port="/dev/null1"),
+ )
+ teleop = BiRebotArm102Leader(cfg)
+ assert any(k.startswith("left_") for k in teleop.action_features)
+ assert any(k.startswith("right_") for k in teleop.action_features)
+ assert "left_gripper.pos" in teleop.action_features
+ assert "right_gripper.pos" in teleop.action_features
diff --git a/tests/utils/test_collate.py b/tests/utils/test_collate.py
new file mode 100644
index 000000000..2b23b3180
--- /dev/null
+++ b/tests/utils/test_collate.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+import pytest
+
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+
+import torch # noqa: E402
+
+from lerobot.utils.collate import lerobot_collate_fn # noqa: E402
+
+
+def test_lerobot_collate_preserves_messages_and_drops_raw_language():
+ batch = [
+ {
+ "index": torch.tensor(0),
+ "messages": [{"role": "assistant", "content": "a"}],
+ "message_streams": ["low_level"],
+ "target_message_indices": [0],
+ "language_persistent": [{"content": "raw"}],
+ "language_events": [],
+ },
+ {
+ "index": torch.tensor(1),
+ "messages": [{"role": "assistant", "content": "b"}],
+ "message_streams": ["low_level"],
+ "target_message_indices": [0],
+ "language_persistent": [{"content": "raw"}],
+ "language_events": [],
+ },
+ ]
+
+ out = lerobot_collate_fn(batch)
+
+ assert out["index"].tolist() == [0, 1]
+ assert out["messages"][0][0]["content"] == "a"
+ assert out["messages"][1][0]["content"] == "b"
+ assert out["message_streams"] == [["low_level"], ["low_level"]]
+ assert out["target_message_indices"] == [[0], [0]]
+ assert "language_persistent" not in out
+ assert "language_events" not in out
+
+
+def test_lerobot_collate_passes_through_standard_batch():
+ """On a non-language batch, the collate must match ``default_collate``.
+
+ Guards against silent regressions: ``lerobot_train.py`` only opts into
+ ``lerobot_collate_fn`` when the dataset declares language columns, but
+ if a future change ever wires it in unconditionally we want the
+ behavior to remain a transparent pass-through for ordinary tensor
+ batches.
+ """
+ from torch.utils.data._utils.collate import default_collate
+
+ batch = [
+ {
+ "observation.image": torch.zeros(3, 4, 4),
+ "action": torch.tensor([0.0, 1.0]),
+ "index": torch.tensor(0),
+ },
+ {
+ "observation.image": torch.ones(3, 4, 4),
+ "action": torch.tensor([2.0, 3.0]),
+ "index": torch.tensor(1),
+ },
+ ]
+
+ custom = lerobot_collate_fn(batch)
+ expected = default_collate(batch)
+
+ assert custom.keys() == expected.keys()
+ for key in expected:
+ assert torch.equal(custom[key], expected[key]), f"key={key} diverged"
+
+
+def test_lerobot_collate_drops_none_samples():
+ """Recipes that yield no target message return ``None`` — those samples
+ must be filtered out, and an all-``None`` batch must collapse to ``None``.
+ """
+ batch = [None, {"index": torch.tensor(0)}, None]
+ out = lerobot_collate_fn(batch)
+ assert out is not None
+ assert out["index"].tolist() == [0]
+
+ assert lerobot_collate_fn([None, None]) is None
diff --git a/uv.lock b/uv.lock
index 408a9a351..7092f780a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
version = 1
-revision = 2
+revision = 3
requires-python = ">=3.12"
resolution-markers = [
"(python_full_version >= '3.15' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'x86_64' and sys_platform == 'linux')",
@@ -1142,7 +1142,7 @@ name = "decord"
version = "0.6.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
- { name = "numpy", marker = "(platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "numpy", marker = "(platform_machine != 'arm64' and platform_machine != 's390x' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 's390x' and sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" },
@@ -2710,6 +2710,8 @@ all = [
{ name = "matplotlib" },
{ name = "metaworld" },
{ name = "mock-serial", marker = "sys_platform != 'win32'" },
+ { name = "motorbridge" },
+ { name = "motorbridge-smart-servo" },
{ name = "mypy" },
{ name = "num2words" },
{ name = "pandas" },
@@ -2913,6 +2915,12 @@ metaworld = [
{ name = "scipy" },
{ name = "torchcodec", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'win32'" },
]
+motorbridge-dep = [
+ { name = "motorbridge" },
+]
+motorbridge-smart-servo-dep = [
+ { name = "motorbridge-smart-servo" },
+]
multi-task-dit = [
{ name = "diffusers" },
{ name = "transformers" },
@@ -2972,6 +2980,10 @@ qwen-vl-utils-dep = [
reachy2 = [
{ name = "reachy2-sdk" },
]
+rebot = [
+ { name = "motorbridge" },
+ { name = "motorbridge-smart-servo" },
+]
robstride = [
{ name = "python-can" },
]
@@ -3045,7 +3057,7 @@ requires-dist = [
{ name = "av", marker = "extra == 'av-dep'", specifier = ">=15.0.0,<16.0.0" },
{ name = "cmake", specifier = ">=3.29.0.1,<4.2.0" },
{ name = "contourpy", marker = "extra == 'matplotlib-dep'", specifier = ">=1.3.0,<2.0.0" },
- { name = "datasets", marker = "extra == 'dataset'", specifier = ">=4.0.0,<5.0.0" },
+ { name = "datasets", marker = "extra == 'dataset'", specifier = ">=4.7.0,<5.0.0" },
{ name = "debugpy", marker = "extra == 'dev'", specifier = ">=1.8.1,<1.9.0" },
{ name = "decord", marker = "(platform_machine == 'AMD64' and extra == 'groot') or (platform_machine == 'x86_64' and extra == 'groot')", specifier = ">=0.6.0,<1.0.0" },
{ name = "deepdiff", marker = "extra == 'deepdiff-dep'", specifier = ">=7.0.1,<9.0.0" },
@@ -3116,6 +3128,8 @@ requires-dist = [
{ name = "lerobot", extras = ["matplotlib-dep"], marker = "extra == 'sarm'" },
{ name = "lerobot", extras = ["matplotlib-dep"], marker = "extra == 'unitree-g1'" },
{ name = "lerobot", extras = ["metaworld"], marker = "extra == 'all'" },
+ { name = "lerobot", extras = ["motorbridge-dep"], marker = "extra == 'rebot'" },
+ { name = "lerobot", extras = ["motorbridge-smart-servo-dep"], marker = "extra == 'rebot'" },
{ name = "lerobot", extras = ["multi-task-dit"], marker = "extra == 'all'" },
{ name = "lerobot", extras = ["notebook"], marker = "extra == 'dev'" },
{ name = "lerobot", extras = ["openarms"], marker = "extra == 'all'" },
@@ -3142,6 +3156,7 @@ requires-dist = [
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" },
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" },
{ name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" },
+ { name = "lerobot", extras = ["rebot"], marker = "extra == 'all'" },
{ name = "lerobot", extras = ["robstride"], marker = "extra == 'all'" },
{ name = "lerobot", extras = ["sarm"], marker = "extra == 'all'" },
{ name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'aloha'" },
@@ -3174,6 +3189,8 @@ requires-dist = [
{ name = "meshcat", marker = "extra == 'unitree-g1'", specifier = ">=0.3.0,<0.4.0" },
{ name = "metaworld", marker = "extra == 'metaworld'", specifier = "==3.0.0" },
{ name = "mock-serial", marker = "sys_platform != 'win32' and extra == 'test'", specifier = ">=0.0.1,<0.1.0" },
+ { name = "motorbridge", marker = "extra == 'motorbridge-dep'", specifier = ">=0.3.2,<0.4.0" },
+ { name = "motorbridge-smart-servo", marker = "extra == 'motorbridge-smart-servo-dep'", specifier = ">=0.0.4,<0.1.0" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.19.1" },
{ name = "ninja", marker = "extra == 'groot'", specifier = ">=1.11.1,<2.0.0" },
{ name = "num2words", marker = "extra == 'smolvla'", specifier = ">=0.5.14,<0.6.0" },
@@ -3227,7 +3244,7 @@ requires-dist = [
{ name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
{ name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
]
-provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
+provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
[[package]]
name = "librt"
@@ -3653,6 +3670,35 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/98/c2/8c1e6bf77cf62a10203a107179e34e0965fc5369386e0b7034a247ed054d/mock_serial-0.0.1-py3-none-any.whl", hash = "sha256:b6b8cc10c302354bf3ca270a3d4d6bf199c4bbe41478c65046db8f30ea967675", size = 6080, upload-time = "2021-11-23T09:34:51.108Z" },
]
+[[package]]
+name = "motorbridge"
+version = "0.3.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/f2/b824ac4d611c71020dccdb72fc50606e543c77c68455ea824b26d9a6de03/motorbridge-0.3.2.tar.gz", hash = "sha256:5cf85dd22c46c7f3c5e6981e90b1034af2deb1bc4e7d74c13074d1d4a7b75ceb", size = 30158, upload-time = "2026-05-18T07:13:17.239Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2c/1a/7d367039a8325c0e2796c14a1503dfc563e7b244c815b26e079114244b4b/motorbridge-0.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8ad158928e93fafd2a7814eaffe8e6ecbec4686f64c2df85f80d7979dfc82532", size = 1108065, upload-time = "2026-05-18T07:13:04.669Z" },
+ { url = "https://files.pythonhosted.org/packages/fe/d6/fafa2b8a3635a6fe7f6e8129e140a68d30f4d6438350a86e51b8198b7834/motorbridge-0.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2adde5f26ea4e37d05da6b41b03b637efa6c80db4676bc6dbdb91ac6e811e54a", size = 1184657, upload-time = "2026-05-18T07:13:06.081Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/30/aca01e81ec523d37b98a1ce6e41688d31827625eb15ecf0cf0485d91d62c/motorbridge-0.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a03b6dc0be80db7b47d3f190f8c6f4fc43b0b4089235283f53763153a6d4e58c", size = 1201394, upload-time = "2026-05-18T07:13:07.476Z" },
+ { url = "https://files.pythonhosted.org/packages/70/eb/97b2f93682a1ce67bad50e9b598af889be4a3156ebcec129ebb41fa44e5b/motorbridge-0.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:b0657d47aa94f8535d0663538be4a86c46e314303fba513122d17612b584c6e6", size = 839087, upload-time = "2026-05-18T07:13:08.664Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/b0/03246c25ae67c2b33bd19b5d11bae668bb8baa7d9cbd75b035a8bef61d62/motorbridge-0.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f305a69c7c3c91dca19c43084beb4cd30a93fd85ff35c712cc3fb0ae33a5c7d3", size = 1108065, upload-time = "2026-05-18T07:13:10.032Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/40/b82d86fbfcc6b18946567f15a7d76d1c673d43bc0c8d268b668506811981/motorbridge-0.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:778fdde2b12df20184fb8c8f4c7665919d969bd582589a267c7956d4c57336ad", size = 1184657, upload-time = "2026-05-18T07:13:11.812Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/3e/90e41d798814db89605d9a021e0c182608aec3d40eef2be211427e2bb863/motorbridge-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eac3a2d27ca387e8d537ec148bea0c28b9517ff4fb9ea0b12f6e78c1e9a7faa4", size = 1201393, upload-time = "2026-05-18T07:13:13.396Z" },
+ { url = "https://files.pythonhosted.org/packages/34/75/3c9ba7514fd0ec330c1fe0b4d76dedfd221abc1b750fe063b6e3f9a88075/motorbridge-0.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:d7d1eb76ae29e8673a320fd1a86b944fb0869129fd4114f0983e43cd48f67372", size = 839087, upload-time = "2026-05-18T07:13:14.555Z" },
+ { url = "https://files.pythonhosted.org/packages/87/33/6787dd22914291a640c2821f175abc7cbb9a1e0fe6c1143f92d7ac362903/motorbridge-0.3.2-cp314-cp314-win_amd64.whl", hash = "sha256:c5f05e36c6607d2145f38fb6f1f11090bb01dbd1012e8251b0d2ae4d60fa4f50", size = 870167, upload-time = "2026-05-18T07:13:15.898Z" },
+]
+
+[[package]]
+name = "motorbridge-smart-servo"
+version = "0.0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e6/56/45af87189dc49abbe46157b792b7c71f502a5f819f04e7485de0cfa52d9b/motorbridge_smart_servo-0.0.4.tar.gz", hash = "sha256:fb65f3f6e765e6b1915071c255caaf112fad3796fa1761aeee0132d15b8a0989", size = 20415, upload-time = "2026-05-08T09:24:57.563Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e9/ee/bec4b3acf55cd18e7db83a6d951caccf699533dbd038c1f0b5f2d16d5208/motorbridge_smart_servo-0.0.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8bc1f034fa9f96e23229a834db6e7cfe1368dba7b9a2a6f6dbd316448c4390dc", size = 304384, upload-time = "2026-05-08T09:24:52.619Z" },
+ { url = "https://files.pythonhosted.org/packages/3f/d2/71c87063b826433553ce8869b99df3e4f191b107710dd5c905e637512b10/motorbridge_smart_servo-0.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:348cef6a647e5c7f9cc8e8ce1f3c806af4522e1087172bac2f8a1a0daa3592b6", size = 345668, upload-time = "2026-05-08T09:24:53.735Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/6b/e65e7227a510236c6334cf054c501d3de2cbd463f4c594e42c6e965d5143/motorbridge_smart_servo-0.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c1982643c496c9f425fa9238f9a92ba601d77f4f2279df68c6868e7b997cbe1", size = 348123, upload-time = "2026-05-08T09:24:55.191Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/fa/539ea123a5660c22c5e5cdad62d7bc5e931c816a0ffd402ae6e4623ab45b/motorbridge_smart_servo-0.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:ea3baa9ba25bcec5541f3d86d73a3406ba2fcffe5dbf900c22e058638fc31ab0", size = 194130, upload-time = "2026-05-08T09:24:56.369Z" },
+]
+
[[package]]
name = "mpmath"
version = "1.3.0"