Compare commits

..

32 Commits

Author SHA1 Message Date
Jade Choghari e29e89e4ed improve script, time saving subtask array
Signed-off-by: Jade Choghari <chogharijade@gmail.com>
2026-03-06 17:07:44 +03:00
root 3d55c5e484 add qwen 3.5 and fix video extraction 2026-03-04 12:22:41 +00:00
Jade Choghari 51b3b31927 more annotation changes 2026-02-12 12:46:45 +00:00
Jade Choghari 4503019d18 clean subtask 2026-02-09 10:55:22 +01:00
Jade Choghari 6aa0cc267f merge branch main 2026-02-09 08:55:11 +01:00
Jade Choghari 6629b454b2 Merge branch 'feat/add-pi05' of github.com:huggingface/lerobot into feat/add-pi05 2026-02-09 08:34:01 +01:00
Jade Choghari 0059ca7924 add cached subtask inference 2026-02-09 07:33:12 +00:00
Jade Choghari 6c94fcd1b1 add KI optional 2026-02-02 15:58:47 +00:00
Jade Choghari 092f4617ca more changes 2026-02-02 09:04:55 +00:00
Jade Choghari 6380c0d0dd example change 2026-01-29 11:21:03 +00:00
Jade Choghari 0947111edd Merge branch 'feat/add-pi05' of github.com:huggingface/lerobot into feat/add-pi05 2026-01-28 21:39:40 +01:00
Jade Choghari 477204d485 add eos to subtask token 2026-01-28 12:32:13 +00:00
Jade Choghari 4eb912da30 Merge remote-tracking branch 'origin/main' into feat/add-pi05 2026-01-27 17:48:22 +01:00
Jade Choghari 99dbbd56c2 add generation inference for subtask 2026-01-27 16:21:44 +00:00
Jade Choghari 6a6912ec37 revert .clone 2026-01-27 16:00:40 +00:00
Jade Choghari 2bf6359d24 more changes 2026-01-27 11:14:22 +00:00
Jade Choghari 4c694e20c7 comments 2026-01-26 09:19:14 +00:00
Jade Choghari 5e609426fd add knowledge insulation 2026-01-26 09:14:39 +00:00
Jade Choghari d0b6a66f34 update subtask annotate 2026-01-21 13:59:16 +00:00
Jade Choghari dc85e9b742 remove brkp 2026-01-20 23:05:44 +00:00
Jade Choghari 90d9698c7e Merge remote-tracking branch 'origin/main' into feat/add-pi05 2026-01-20 11:05:38 +00:00
Jade Choghari bbef8bb077 more 2026-01-20 10:02:59 +00:00
Jade Choghari 80417111d3 handle failed annotations 2026-01-19 16:11:32 +00:00
Jade Choghari d44f3a3bd9 update 2026-01-19 15:48:14 +00:00
Jade Choghari b864c13dfb add docs 2026-01-19 10:36:25 +00:00
Jade Choghari fd917e4fa0 add high/low/normal level annotation 2026-01-15 17:21:52 +00:00
Jade Choghari 966fedfeef add more 2026-01-15 16:35:58 +00:00
Jade Choghari 6e88d6f387 make it work- runnning example 2026-01-15 13:21:17 +00:00
Jade Choghari 83276eeb2f loss naming 2026-01-14 14:53:18 +00:00
Jade Choghari 72b0af4ed7 add three losses: flow_mse, subtask_ce, action_ce 2026-01-14 14:52:32 +00:00
Jade Choghari b57504b89e run inference, attention mask 2026-01-14 11:52:31 +00:00
Jade Choghari 72f7aaedb5 add annotation pipeline 2026-01-13 11:05:26 +00:00
219 changed files with 12462 additions and 9285 deletions
+1 -8
View File
@@ -44,7 +44,7 @@ permissions:
# Sets up the environment variables
env:
UV_VERSION: "0.8.0"
PYTHON_VERSION: "3.12"
PYTHON_VERSION: "3.10"
# Ensures that only the latest commit for a PR or branch is built, canceling older runs.
concurrency:
@@ -61,7 +61,6 @@ jobs:
MUJOCO_GL: egl
HF_HOME: /mnt/cache/.cache/huggingface
HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
steps:
- uses: actions/checkout@v6
with:
@@ -90,11 +89,5 @@ jobs:
- name: Install lerobot with test extras
run: uv sync --extra "test"
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
uv run hf auth whoami
- name: Run pytest
run: uv run pytest tests -vv --maxfail=10
+4 -21
View File
@@ -37,7 +37,7 @@ permissions:
# Sets up the environment variables
env:
UV_VERSION: "0.8.0"
PYTHON_VERSION: "3.12"
PYTHON_VERSION: "3.10"
DOCKER_IMAGE_NAME: huggingface/lerobot-gpu
# Ensures that only the latest action is built, canceling older runs.
@@ -60,7 +60,6 @@ jobs:
MUJOCO_GL: egl
HF_HOME: /mnt/cache/.cache/huggingface
HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
steps:
- uses: actions/checkout@v6
with:
@@ -88,12 +87,6 @@ jobs:
- name: Install lerobot with all extras
run: uv sync --extra all # TODO(Steven): Make flash-attn optional
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
uv run hf auth whoami
- name: Run pytest (all extras)
run: uv run pytest tests -vv --maxfail=10
@@ -108,11 +101,9 @@ jobs:
runs-on:
group: aws-general-8-plus
if: |
github.repository == 'huggingface/lerobot' && (
(github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && github.event.pull_request.head.repo.fork == false) ||
github.event_name == 'push' ||
github.event_name == 'workflow_dispatch'
)
(github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && github.event.pull_request.head.repo.fork == false) ||
github.event_name == 'push' ||
github.event_name == 'workflow_dispatch'
outputs:
image_tag: ${{ steps.set_tag.outputs.image_tag }}
env:
@@ -169,7 +160,6 @@ jobs:
HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
TORCH_HOME: /home/user_lerobot/.cache/torch
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
container:
image: ${{ needs.build-and-push-docker.outputs.image_tag }} # zizmor: ignore[unpinned-images]
options: --gpus all --shm-size "16gb"
@@ -181,13 +171,6 @@ jobs:
shell: bash
working-directory: /lerobot
steps:
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
hf auth whoami
- name: Fix ptxas permissions
run: chmod +x /lerobot/.venv/lib/python3.12/site-packages/triton/backends/nvidia/bin/ptxas
- name: Run pytest on GPU
run: pytest tests -vv --maxfail=10
- name: Run end-to-end tests
+4 -20
View File
@@ -28,7 +28,7 @@ on:
# Sets up the environment variables
env:
UV_VERSION: "0.8.0"
PYTHON_VERSION: "3.12"
PYTHON_VERSION: "3.10"
DOCKER_IMAGE_NAME_CPU: huggingface/lerobot-cpu:latest
DOCKER_IMAGE_NAME_GPU: huggingface/lerobot-gpu:latest
@@ -119,7 +119,6 @@ jobs:
HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
TORCH_HOME: /home/user_lerobot/.cache/torch
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
container:
image: ${{ needs.build-docker-cpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
options: --shm-size "16gb"
@@ -131,11 +130,6 @@ jobs:
shell: bash
working-directory: /lerobot
steps:
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
hf auth whoami
- name: Run pytest on CPU
run: pytest tests -vv --maxfail=10
- name: Run end-to-end tests
@@ -152,7 +146,6 @@ jobs:
HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
TORCH_HOME: /home/user_lerobot/.cache/torch
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
container:
image: ${{ needs.build-docker-gpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
options: --gpus all --shm-size "16gb"
@@ -164,11 +157,6 @@ jobs:
shell: bash
working-directory: /lerobot
steps:
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
hf auth whoami
- name: Run pytest on GPU
run: pytest tests -vv --maxfail=10
- name: Run end-to-end tests
@@ -186,7 +174,6 @@ jobs:
TORCH_HOME: /home/user_lerobot/.cache/torch
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
CUDA_VISIBLE_DEVICES: "0,1,2,3"
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
container:
image: ${{ needs.build-docker-gpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
options: --gpus all --shm-size "16gb"
@@ -198,15 +185,12 @@ jobs:
shell: bash
working-directory: /lerobot
steps:
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
hf auth whoami
- name: Verify GPU availability
run: |
nvidia-smi
python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'Number of GPUs: {torch.cuda.device_count()}')"
- name: Run multi-GPU training tests
run: pytest -vv tests/training/
# TODO(Steven): Investigate why motors tests are failing in multi-GPU setup
run: pytest tests -vv --maxfail=10 --ignore=tests/motors/
timeout-minutes: 10
+1 -1
View File
@@ -50,7 +50,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
python-version: '3.10'
- name: Run pre-commit hooks
uses: pre-commit/action@v3.0.1 # zizmor: ignore[unpinned-uses]
+10 -2
View File
@@ -22,7 +22,7 @@ on:
# Sets up the environment variables
env:
UV_VERSION: "0.8.0"
PYTHON_VERSION: "3.12"
PYTHON_VERSION: "3.10"
jobs:
# This job builds the Python package and publishes it to PyPI
@@ -45,7 +45,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
python-version: '3.10'
- name: Extract Version
id: extract_info
@@ -83,6 +83,14 @@ jobs:
exit 1
fi
- name: Remove Tags with Git dependencies
# TODO(Steven): Temporary patch to remove pi from PyPI 0.4.0 release due to its reliance on git dependencies.
run: |
echo "::info:: Checking for Git dependencies to remove from pyproject.toml..."
grep -E '@ git\+https|lerobot\[pi\]' pyproject.toml | sed 's/^/::warning:: Removing line: /' || true
sed -E -i '/@ git\+https|lerobot\[pi\]/d' pyproject.toml
echo "::info:: Git dependencies removed. Proceeding with build."
- name: Install build dependencies
run: python -m pip install build
+2 -14
View File
@@ -29,7 +29,7 @@ permissions:
# Sets up the environment variables
env:
UV_VERSION: "0.8.0"
PYTHON_VERSION: "3.12"
PYTHON_VERSION: "3.10"
DOCKER_IMAGE_NAME: huggingface/lerobot-gpu:unbound
# Ensures that only the latest action is built, canceling older runs.
@@ -48,7 +48,6 @@ jobs:
MUJOCO_GL: egl
HF_HOME: /mnt/cache/.cache/huggingface
HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
steps:
- uses: actions/checkout@v6
with:
@@ -80,11 +79,7 @@ jobs:
- name: Install lerobot with all extras
run: uv sync --extra all # TODO(Steven): Make flash-attn optional
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
uv run hf auth whoami
- name: Run pytest (all extras)
run: uv run pytest tests -vv
@@ -96,7 +91,6 @@ jobs:
name: Build and Push Docker
runs-on:
group: aws-general-8-plus
if: github.repository == 'huggingface/lerobot'
outputs:
image_tag: ${{ env.DOCKER_IMAGE_NAME }}
env:
@@ -142,7 +136,6 @@ jobs:
HF_LEROBOT_HOME: /home/user_lerobot/.cache/huggingface/lerobot
TORCH_HOME: /home/user_lerobot/.cache/torch
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
container:
image: ${{ needs.build-and-push-docker.outputs.image_tag }} # zizmor: ignore[unpinned-images]
options: --gpus all --shm-size "16gb"
@@ -154,11 +147,6 @@ jobs:
shell: bash
working-directory: /lerobot
steps:
- name: Login to Hugging Face
if: env.HF_USER_TOKEN != ''
run: |
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
hf auth whoami
- name: Run pytest on GPU
run: pytest tests -vv
- name: Run end-to-end tests
+2 -2
View File
@@ -13,7 +13,7 @@
# limitations under the License.
default_language_version:
python: python3.12
python: python3.10
exclude: "tests/artifacts/.*\\.safetensors$"
@@ -55,7 +55,7 @@ repos:
rev: v3.21.0
hooks:
- id: pyupgrade
args: [--py312-plus]
args: [--py310-plus]
##### Markdown Quality #####
- repo: https://github.com/rbubley/mirrors-prettier
-25
View File
@@ -1,25 +0,0 @@
# AI Usage Policy
The LeRobot project welcomes contributions from everyone, and we have a few guidelines regarding AI usage to ensure high code quality, clear communication, and a healthy open-source ecosystem:
- **Please disclose significant AI assistance.** If you used AI tools (e.g., Copilot, Claude, Cursor, ChatGPT) to generate a substantial portion of your code or text, let us know in your PR description. Transparency helps us review your changes more effectively.
- **Own your code (The Human-in-the-Loop).** You must fully understand all the changes you are proposing. If you cannot explain what your AI-assisted code does or how it interacts with LeRobot's broader architecture, please take the time to learn and test it before submitting.
- **Keep issues and discussions focused.** You are welcome to use AI to help draft issues or PR descriptions, but please review and edit them carefully before posting. AI can often be overly verbose; trimming the noise and getting straight to the point helps our maintainers address your needs faster.
Our core maintainers also use AI tools to aid their workflows, but they do so while bringing deep contextual knowledge of the LeRobot codebase to validate the output. We ask all contributors to apply that same level of rigor.
## Remember the Human Maintainers
Please remember that LeRobot is maintained by a dedicated team of humans.
Every discussion, issue, and pull request is read and reviewed by real people. While AI tools can generate thousands of lines of code in seconds, reviewing that code still takes human time and energy. Submitting unverified or low-effort AI output puts an unfair burden on our maintainers.
Today, the quality of the AI output still heavily depends on the developer driving the tool. We ask that you respect our maintainers' time by thoroughly vetting, testing, and refining your submissions.
## AI is Welcome Here
LeRobot operates at the cutting edge of AI and robotics, and many of our maintainers actively embrace AI coding assistants as valuable productivity tools. We are a pro-AI project!
Our reason for having an AI policy is not an anti-AI stance. Rather, it exists to ensure that AI is used to enhance human contributions, not replace them with unverified noise. It's about how the tools are used, not the tools themselves.
We value the unique human insight you bring to the LeRobot community. Let AI empower your workflow, but always let your own judgment take the wheel.
+1 -1
View File
@@ -2,7 +2,7 @@
Everyone is welcome to contribute, and we value everybody's contribution. Code is not the only way to help the community. Answering questions, helping others, reaching out, and improving the documentation are immensely valuable.
Whichever way you choose to contribute, please be mindful to respect our [code of conduct](./CODE_OF_CONDUCT.md) and our [AI policy](./AI_POLICY.md).
Whichever way you choose to contribute, please be mindful to respect our [code of conduct](./CODE_OF_CONDUCT.md).
## Ways to Contribute
-1
View File
@@ -1,3 +1,2 @@
include src/lerobot/templates/lerobot_modelcard_template.md
include src/lerobot/datasets/card_template.md
include src/lerobot/envs/metaworld_config.json
+1 -18
View File
@@ -135,7 +135,7 @@ Learn how to implement your own simulation environment or benchmark and distribu
## Citation
If you use LeRobot in your project, please cite the GitHub repository to acknowledge the ongoing development and contributors:
If you use LeRobot in your research, please cite:
```bibtex
@misc{cadene2024lerobot,
@@ -146,23 +146,6 @@ If you use LeRobot in your project, please cite the GitHub repository to acknowl
}
```
If you are referencing our research or the academic paper, please also cite our ICLR publication:
<details>
<summary><b>ICLR 2026 Paper</b></summary>
```bibtex
@inproceedings{cadenelerobot,
title={LeRobot: An Open-Source Library for End-to-End Robot Learning},
author={Cadene, Remi and Alibert, Simon and Capuano, Francesco and Aractingi, Michel and Zouitine, Adil and Kooijmans, Pepijn and Choghari, Jade and Russi, Martino and Pascal, Caroline and Palma, Steven and Shukor, Mustafa and Moss, Jess and Soare, Alexander and Aubakirova, Dana and Lhoest, Quentin and Gallou\'edec, Quentin and Wolf, Thomas},
booktitle={The Fourteenth International Conference on Learning Representations},
year={2026},
url={https://arxiv.org/abs/2602.22818}
}
```
</details>
## Contribute
We welcome contributions from everyone in the community! To get started, please read our [CONTRIBUTING.md](./CONTRIBUTING.md) guide. Whether you're adding a new feature, improving documentation, or fixing a bug, your help and feedback are invaluable. We're incredibly excited about the future of open-source robotics and can't wait to work with you on what's next—thank you for your support!
+42 -42
View File
@@ -28,9 +28,9 @@ We don't expect the same optimal settings for a dataset of images from a simulat
For these reasons, we run this benchmark on four representative datasets:
- `lerobot/pusht_image`: (96 x 96 pixels) simulation with simple geometric shapes, fixed camera.
- `lerobot/aloha_mobile_shrimp_image`: (480 x 640 pixels) real-world indoor, moving camera.
- `lerobot/paris_street`: (720 x 1280 pixels) real-world outdoor, moving camera.
- `lerobot/kitchen`: (1080 x 1920 pixels) real-world indoor, fixed camera.
- `aliberts/aloha_mobile_shrimp_image`: (480 x 640 pixels) real-world indoor, moving camera.
- `aliberts/paris_street`: (720 x 1280 pixels) real-world outdoor, moving camera.
- `aliberts/kitchen`: (1080 x 1920 pixels) real-world indoor, fixed camera.
Note: The datasets used for this benchmark need to be image datasets, not video datasets.
@@ -179,7 +179,7 @@ python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
lerobot/aloha_mobile_shrimp_image \
aliberts/aloha_mobile_shrimp_image \
--vcodec libx264 libx265 \
--pix-fmt yuv444p yuv420p \
--g 2 20 None \
@@ -203,9 +203,9 @@ python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
lerobot/aloha_mobile_shrimp_image \
lerobot/paris_street \
lerobot/kitchen \
aliberts/aloha_mobile_shrimp_image \
aliberts/paris_street \
aliberts/kitchen \
--vcodec libx264 libx265 \
--pix-fmt yuv444p yuv420p \
--g 1 2 3 4 5 6 10 15 20 40 None \
@@ -221,9 +221,9 @@ python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
lerobot/aloha_mobile_shrimp_image \
lerobot/paris_street \
lerobot/kitchen \
aliberts/aloha_mobile_shrimp_image \
aliberts/paris_street \
aliberts/kitchen \
--vcodec libsvtav1 \
--pix-fmt yuv420p \
--g 1 2 3 4 5 6 10 15 20 40 None \
@@ -252,37 +252,37 @@ Since we're using av1 encoding, we're choosing the `pyav` decoder as `video_read
These tables show the results for `g=2` and `crf=30`, using `timestamps-modes=6_frames` and `backend=pyav`
| video_images_size_ratio | vcodec | pix_fmt | | | |
| --------------------------------- | ---------- | ------- | --------- | --------- | --------- |
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | **16.97%** | 17.58% | 18.57% | 18.86% | 22.06% |
| lerobot/aloha_mobile_shrimp_image | 2.14% | 2.11% | 1.38% | **1.37%** | 5.59% |
| lerobot/paris_street | 2.12% | 2.13% | **1.54%** | **1.54%** | 4.43% |
| lerobot/kitchen | 1.40% | 1.39% | **1.00%** | **1.00%** | 2.52% |
| video_images_size_ratio | vcodec | pix_fmt | | | |
| ---------------------------------- | ---------- | ------- | --------- | --------- | --------- |
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | **16.97%** | 17.58% | 18.57% | 18.86% | 22.06% |
| aliberts/aloha_mobile_shrimp_image | 2.14% | 2.11% | 1.38% | **1.37%** | 5.59% |
| aliberts/paris_street | 2.12% | 2.13% | **1.54%** | **1.54%** | 4.43% |
| aliberts/kitchen | 1.40% | 1.39% | **1.00%** | **1.00%** | 2.52% |
| video_images_load_time_ratio | vcodec | pix_fmt | | | |
| --------------------------------- | ------- | ------- | -------- | ------- | --------- |
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | 6.45 | 5.19 | **1.90** | 2.12 | 2.47 |
| lerobot/aloha_mobile_shrimp_image | 11.80 | 7.92 | 0.71 | 0.85 | **0.48** |
| lerobot/paris_street | 2.21 | 2.05 | 0.36 | 0.49 | **0.30** |
| lerobot/kitchen | 1.46 | 1.46 | 0.28 | 0.51 | **0.26** |
| video_images_load_time_ratio | vcodec | pix_fmt | | | |
| ---------------------------------- | ------- | ------- | -------- | ------- | --------- |
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | 6.45 | 5.19 | **1.90** | 2.12 | 2.47 |
| aliberts/aloha_mobile_shrimp_image | 11.80 | 7.92 | 0.71 | 0.85 | **0.48** |
| aliberts/paris_street | 2.21 | 2.05 | 0.36 | 0.49 | **0.30** |
| aliberts/kitchen | 1.46 | 1.46 | 0.28 | 0.51 | **0.26** |
| | | vcodec | pix_fmt | | | |
| --------------------------------- | -------- | -------- | ------------ | -------- | --------- | ------------ |
| | | libx264 | | libx265 | | libsvtav1 |
| repo_id | metric | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | avg_mse | 2.90E-04 | **2.03E-04** | 3.13E-04 | 2.29E-04 | 2.19E-04 |
| | avg_psnr | 35.44 | 37.07 | 35.49 | **37.30** | 37.20 |
| | avg_ssim | 98.28% | **98.85%** | 98.31% | 98.84% | 98.72% |
| lerobot/aloha_mobile_shrimp_image | avg_mse | 2.76E-04 | 2.59E-04 | 3.17E-04 | 3.06E-04 | **1.30E-04** |
| | avg_psnr | 35.91 | 36.21 | 35.88 | 36.09 | **40.17** |
| | avg_ssim | 95.19% | 95.18% | 95.00% | 95.05% | **97.73%** |
| lerobot/paris_street | avg_mse | 6.89E-04 | 6.70E-04 | 4.03E-03 | 4.02E-03 | **3.09E-04** |
| | avg_psnr | 33.48 | 33.68 | 32.05 | 32.15 | **35.40** |
| | avg_ssim | 93.76% | 93.75% | 89.46% | 89.46% | **95.46%** |
| lerobot/kitchen | avg_mse | 2.50E-04 | 2.24E-04 | 4.28E-04 | 4.18E-04 | **1.53E-04** |
| | avg_psnr | 36.73 | 37.33 | 36.56 | 36.75 | **39.12** |
| | avg_ssim | 95.47% | 95.58% | 95.52% | 95.53% | **96.82%** |
| | | vcodec | pix_fmt | | | |
| ---------------------------------- | -------- | -------- | ------------ | -------- | --------- | ------------ |
| | | libx264 | | libx265 | | libsvtav1 |
| repo_id | metric | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | avg_mse | 2.90E-04 | **2.03E-04** | 3.13E-04 | 2.29E-04 | 2.19E-04 |
| | avg_psnr | 35.44 | 37.07 | 35.49 | **37.30** | 37.20 |
| | avg_ssim | 98.28% | **98.85%** | 98.31% | 98.84% | 98.72% |
| aliberts/aloha_mobile_shrimp_image | avg_mse | 2.76E-04 | 2.59E-04 | 3.17E-04 | 3.06E-04 | **1.30E-04** |
| | avg_psnr | 35.91 | 36.21 | 35.88 | 36.09 | **40.17** |
| | avg_ssim | 95.19% | 95.18% | 95.00% | 95.05% | **97.73%** |
| aliberts/paris_street | avg_mse | 6.89E-04 | 6.70E-04 | 4.03E-03 | 4.02E-03 | **3.09E-04** |
| | avg_psnr | 33.48 | 33.68 | 32.05 | 32.15 | **35.40** |
| | avg_ssim | 93.76% | 93.75% | 89.46% | 89.46% | **95.46%** |
| aliberts/kitchen | avg_mse | 2.50E-04 | 2.24E-04 | 4.28E-04 | 4.18E-04 | **1.53E-04** |
| | avg_psnr | 36.73 | 37.33 | 36.56 | 36.75 | **39.12** |
| | avg_ssim | 95.47% | 95.58% | 95.52% | 95.53% | **96.82%** |
+1 -3
View File
@@ -24,7 +24,7 @@ ARG OS_VERSION=22.04
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
# Define Python version argument
ARG PYTHON_VERSION=3.12
ARG PYTHON_VERSION=3.10
# Configure environment variables
ENV DEBIAN_FRONTEND=noninteractive \
@@ -85,8 +85,6 @@ RUN if [ "$UNBOUND_DEPS" = "true" ]; then \
RUN uv pip install --no-cache ".[all]"
RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
# Copy the rest of the application source code
# Make sure to have the git-LFS files for testing
COPY --chown=user_lerobot:user_lerobot . .
+1 -1
View File
@@ -19,7 +19,7 @@
# docker run -it --rm lerobot-user
# Configure the base image
ARG PYTHON_VERSION=3.12
ARG PYTHON_VERSION=3.10
FROM python:${PYTHON_VERSION}-slim
# Configure environment variables
+2 -4
View File
@@ -27,12 +27,10 @@
title: Porting Large Datasets
- local: using_dataset_tools
title: Using the Dataset Tools
- local: annotation_tools
title: Using the Annotation Tools
- local: dataset_subtask
title: Using Subtasks in the Dataset
- local: streaming_video_encoding
title: Streaming Video Encoding
- local: multi_dataset_training
title: Multi-Dataset Training
title: "Datasets"
- sections:
- local: act
-3
View File
@@ -88,8 +88,5 @@ lerobot-record \
--dataset.repo_id=${HF_USER}/eval_act_your_dataset \
--dataset.num_episodes=10 \
--dataset.single_task="Your task description" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--policy.path=${HF_USER}/act_policy
```
+425
View File
@@ -0,0 +1,425 @@
# Dataset Annotation Tools
This guide explains how to use the automatic annotation tools to add skill labels and synthetic dialogue to your LeRobot datasets.
## Overview
The annotation pipeline consists of two main components:
1. **Subtask Annotation** (`subtask_annotate.py`): Automatically segments robot demonstrations into atomic skills using Vision-Language Models (VLMs)
2. **High-Level Annotation** (`high_level_annotate.py`): Generates synthetic user prompts and robot utterances for hierarchical policy training
These tools enable you to transform raw robot demonstration data into richly annotated datasets suitable for training hierarchical policies.
## Installation Requirements
Before using the annotation tools, ensure you have the required dependencies:
```bash
pip install transformers qwen-vl-utils opencv-python rich pandas pyarrow
```
You'll also need FFmpeg for video processing:
```bash
# Ubuntu/Debian
sudo apt-get install ffmpeg
# macOS
brew install ffmpeg
```
## Part 1: Subtask Annotation
### What It Does
The subtask annotator segments each episode into short atomic manipulation skills (1-3 seconds each). For example, a "pick and place" episode might be segmented into:
- "reach towards object" (0.0s - 1.2s)
- "grasp object" (1.2s - 2.1s)
- "lift object" (2.1s - 3.5s)
- "move to target" (3.5s - 5.0s)
- "release object" (5.0s - 6.2s)
### Usage
#### Basic Example
```bash
python src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
--repo-id your-username/your-dataset \
--video-key observation.images.base \
--output-dir /path/to/output
```
#### With Local Dataset
```bash
python src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
--data-dir /path/to/local/dataset \
--video-key observation.images.base \
--output-dir /path/to/output
```
#### Advanced Options
```bash
python src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
--repo-id your-username/your-dataset \
--video-key observation.images.base \
--model Qwen/Qwen2-VL-7B-Instruct \
--batch-size 16 \
--output-dir /path/to/output \
--push-to-hub
```
### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--repo-id` | HuggingFace Hub dataset ID | Required (or use --data-dir) |
| `--data-dir` | Path to local dataset | Required (or use --repo-id) |
| `--video-key` | Video observation key | Required |
| `--model` | VLM model to use | `Qwen/Qwen2-VL-7B-Instruct` |
| `--device` | Device to run model on | `cuda` |
| `--dtype` | Model dtype | `bfloat16` |
| `--batch-size` | Episodes per batch | `8` |
| `--episodes` | Specific episodes to annotate | All episodes |
| `--output-dir` | Output directory | Auto-generated |
| `--push-to-hub` | Push to HuggingFace Hub | `False` |
### Supported Models
- **Qwen2-VL**: `Qwen/Qwen2-VL-2B-Instruct`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`
- **Qwen3-VL**: `Qwen/Qwen3-VL-30B-A3B-Instruct`
### Output Files
The subtask annotation creates the following files in your dataset:
1. **`meta/subtasks.parquet`**: DataFrame with unique subtask names
```python
# Structure:
# Index: subtask name (string)
# Column: subtask_index (int64)
```
2. **`meta/skills.json`**: Raw skill annotations with timestamps
```json
{
"coarse_description": "Pick and place the object",
"skill_to_subtask_index": {
"reach towards object": 0,
"grasp object": 1,
...
},
"episodes": {
"0": {
"episode_index": 0,
"description": "Pick and place the object",
"skills": [
{"name": "reach towards object", "start": 0.0, "end": 1.2},
{"name": "grasp object", "start": 1.2, "end": 2.1},
...
]
}
}
}
```
3. **`subtask_index` feature**: Added to each frame in the dataset
- Type: `int64`
- Shape: `(1,)`
- Maps each frame to its corresponding subtask
### Accessing Subtask Annotations
```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
# Load annotated dataset
dataset = LeRobotDataset(repo_id="your/dataset_with_subtasks")
# Get a frame
frame = dataset[100]
# Get the subtask for this frame
subtask_idx = frame["subtask_index"].item()
subtask_name = dataset.meta.subtasks.iloc[subtask_idx].name
print(f"Frame 100 is performing: {subtask_name}")
# Load all subtasks
subtasks_df = dataset.meta.subtasks
print(subtasks_df)
```
## Part 2: High-Level Annotation
### What It Does
The high-level annotator generates synthetic dialogue for hierarchical policy training. For each skill, it creates:
- **User Prompt** (`_t`): A natural language request from the user
- **Robot Utterance** (`u_t`): A natural language response from the robot
This enables training policies that can understand and respond to human instructions in natural dialogue.
### Prerequisites
**Important**: You must run subtask annotation first! High-level annotation requires the `meta/skills.json` file generated by subtask annotation.
### Usage
#### Image Mode (Default)
Samples frames at regular intervals and passes images to the VLM:
```bash
python src/lerobot/policies/pi05_full/annotate/high_level_annotate.py \
--repo-id your/dataset_with_subtasks \
--model Qwen/Qwen2-VL-7B-Instruct \
--image-key observation.images.base \
--output-dir /path/to/output
```
#### Video Mode
Passes entire episode videos to the VLM for better temporal understanding:
```bash
python src/lerobot/policies/pi05_full/annotate/high_level_annotate.py \
--repo-id your/dataset_with_subtasks \
--model Qwen/Qwen2-VL-7B-Instruct \
--video-mode \
--video-key observation.images.base \
--video-batch-size 4 \
--output-dir /path/to/output
```
### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--repo-id` | HuggingFace Hub dataset ID | Required (or use --data-dir) |
| `--data-dir` | Path to local dataset | Required (or use --repo-id) |
| `--model` | VLM model to use | `Qwen/Qwen2-VL-7B-Instruct` |
| `--image-key` | Image observation key (image mode) | First camera key |
| `--video-mode` | Use video instead of images | `False` |
| `--video-key` | Video observation key (video mode) | Auto-detected |
| `--video-batch-size` | Episodes per batch (video mode) | `1` |
| `--sample-interval` | Sampling interval in seconds | `1.0` |
| `--temperature` | Sampling temperature | `0.7` |
| `--output-dir` | Output directory | Auto-generated |
| `--push-to-hub` | Push to HuggingFace Hub | `False` |
### Output Files
The high-level annotation creates:
1. **`meta/tasks_high_level.parquet`**: DataFrame with high-level tasks
```python
# Structure:
# Index: task string (concatenated user_prompt | robot_utterance)
# Columns:
# - task_index: int64
# - user_prompt: string
# - robot_utterance: string
# - skill: string (associated subtask)
# - scenario_type: string
# - response_type: string
```
2. **`meta/syn_annotations.jsonl`**: Debug annotations (JSONL format)
```json
{"episode_id": 0, "timestamp": 1.5, "skill_current": "grasp object", "user_prompt": "Can you pick that up?", "robot_utterance": "Sure, I'll grasp it now", ...}
```
3. **`task_index_high_level` feature**: Added to each frame
- Type: `int64`
- Shape: `(1,)`
- Maps each frame to its high-level task
### Dialogue Types Generated
The system generates diverse interaction types:
**Scenario Types:**
- `specific_object`: "Pick up the red block"
- `negative_task`: "Don't touch the blue one"
- `situated_correction`: "Actually, move to the other box instead"
- `implicit_request`: "I need something red for the tower"
- `constraint_based`: "Make sure to handle it gently"
**Response Types:**
- `confirmation`: "OK, I'll pick it up"
- `clarification`: "Just to confirm, you want me to pick up the red block?"
- `acknowledgment`: "Got it, picking up the red block"
- `constraint_acknowledgment`: "Sure, I'll pick it up gently"
### Accessing High-Level Annotations
```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
import pandas as pd
# Load annotated dataset
dataset = LeRobotDataset(repo_id="your/dataset_with_high_level_tasks")
# Get a frame
frame = dataset[100]
# Get the high-level task
task_idx = frame["task_index_high_level"].item()
# Load tasks metadata
tasks_df = pd.read_parquet(dataset.root / "meta" / "tasks_high_level.parquet")
task_row = tasks_df[tasks_df["task_index"] == task_idx].iloc[0]
print(f"User: {task_row['user_prompt']}")
print(f"Robot: {task_row['robot_utterance']}")
print(f"Skill: {task_row['skill']}")
# Use in a DataLoader
import torch
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
batch = next(iter(dataloader))
print(f"Task indices: {batch['task_index_high_level']}")
print(f"User prompts: {batch['user_prompt'][0]}")
print(f"Robot utterances: {batch['robot_utterance'][0]}")
```
## Complete Pipeline Example
Here's how to run both annotation stages:
```bash
#!/bin/bash
REPO_ID="your-username/your-dataset"
MODEL="Qwen/Qwen2-VL-7B-Instruct"
OUTPUT_DIR="/path/to/output"
# Step 1: Subtask Annotation
python src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
--repo-id "$REPO_ID" \
--video-key observation.images.base \
--model "$MODEL" \
--batch-size 8 \
--output-dir "${OUTPUT_DIR}/subtasks"
# Step 2: High-Level Annotation (Image Mode)
python src/lerobot/policies/pi05_full/annotate/high_level_annotate.py \
--data-dir "${OUTPUT_DIR}/subtasks" \
--model "$MODEL" \
--image-key observation.images.base \
--sample-interval 1.0 \
--output-dir "${OUTPUT_DIR}/final"
# Or Step 2: High-Level Annotation (Video Mode - Recommended)
python src/lerobot/policies/pi05_full/annotate/high_level_annotate.py \
--data-dir "${OUTPUT_DIR}/subtasks" \
--model "$MODEL" \
--video-mode \
--video-key observation.images.base \
--video-batch-size 4 \
--output-dir "${OUTPUT_DIR}/final"
```
## Performance Tips
### For Faster Processing
1. **Increase batch size**: Use `--batch-size 16` or higher (subtask annotation)
2. **Increase video batch size**: Use `--video-batch-size 8` (high-level annotation in video mode)
3. **Larger sampling interval**: Use `--sample-interval 5.0` for testing (samples every 5 seconds instead of 1)
4. **Use smaller models**: `Qwen/Qwen2-VL-2B-Instruct` is faster than `Qwen/Qwen2-VL-7B-Instruct`
5. **Process specific episodes**: Use `--episodes 0 1 2 3` to annotate only a subset
### For Better Quality
1. **Use larger models**: `Qwen/Qwen3-VL-30B-A3B-Instruct` or `Qwen/Qwen2-VL-72B-Instruct`
2. **Use video mode**: Provides better temporal context
3. **Smaller sampling intervals**: `--sample-interval 0.5` for dense annotations
4. **Adjust temperature**: Use `--temperature 0.9` for more diverse dialogue
## Memory Requirements
| Model | GPU Memory | Recommended Batch Size |
|-------|------------|------------------------|
| Qwen2-VL-2B | ~8 GB | 16-32 |
| Qwen2-VL-7B | ~16 GB | 8-16 |
| Qwen2-VL-72B | ~80 GB | 1-2 |
| Qwen3-VL-30B | ~40 GB | 4-8 |
## Troubleshooting
### "FFmpeg not found"
```bash
# Install FFmpeg
sudo apt-get install ffmpeg # Ubuntu/Debian
brew install ffmpeg # macOS
```
### "CUDA out of memory"
- Reduce batch size: `--batch-size 1` or `--video-batch-size 1`
- Use smaller model: `Qwen/Qwen2-VL-2B-Instruct`
- Use CPU: `--device cpu` (much slower)
### "No skills.json found"
Run subtask annotation first before high-level annotation.
### "Video key not found"
List available keys:
```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
dataset = LeRobotDataset(repo_id="your/dataset")
print("Video keys:", dataset.meta.video_keys)
print("Camera keys:", dataset.meta.camera_keys)
```
## Dataset Structure After Annotation
```
your_dataset_with_high_level_tasks/
├── meta/
│ ├── info.json # Original metadata
│ ├── tasks.parquet # Original tasks (preserved)
│ ├── subtasks.parquet # NEW: Subtask names and indices
│ ├── skills.json # NEW: Raw skill annotations with timestamps
│ ├── tasks_high_level.parquet # NEW: High-level tasks with dialogue
│ └── syn_annotations.jsonl # NEW: Debug annotations
├── data/
│ └── chunk-000/
│ ├── observation.images.base.mp4
│ ├── action.safetensors
│ ├── subtask_index.safetensors # NEW: Subtask per frame
│ └── task_index_high_level.safetensors # NEW: High-level task per frame
└── videos/
└── ...
```
## Citation
If you use these annotation tools in your research, please cite:
```bibtex
@article{lerobot2024,
title={LeRobot: State-of-the-art Machine Learning for Real-World Robotics},
author={LeRobot Contributors},
year={2024},
url={https://github.com/huggingface/lerobot}
}
```
## Next Steps
After annotation, you can:
1. Train hierarchical policies using the subtask and high-level annotations
2. Use the synthetic dialogue for instruction-following policy training
3. Analyze skill distributions and dialogue patterns
4. Share your annotated dataset on HuggingFace Hub with `--push-to-hub`
For training examples, see the [training documentation](../training/).
+1 -1
View File
@@ -48,7 +48,7 @@ python -m lerobot.async_inference.robot_client \
--task="dummy" \ # POLICY: The task to run the policy on (`Fold my t-shirt`). Not necessarily defined for all policies, such as `act`
--policy_type=your_policy_type \ # POLICY: the type of policy to run (smolvla, act, etc)
--pretrained_name_or_path=user/model \ # POLICY: the model name/path on server to the checkpoint to run (e.g., lerobot/smolvla_base)
--policy_device=mps \ # POLICY: the device to run the policy on, on the server (cuda, mps, xpu, cpu)
--policy_device=mps \ # POLICY: the device to run the policy on, on the server
--actions_per_chunk=50 \ # POLICY: the number of actions to output at once
--chunk_size_threshold=0.5 \ # CLIENT: the threshold for the chunk size before sending a new observation to the server
--aggregate_fn_name=weighted_average \ # CLIENT: the function to aggregate actions on overlapping portions
+4 -4
View File
@@ -32,7 +32,7 @@ version = "0.1.0"
dependencies = [
# your policy-specific dependencies
]
requires-python = ">= 3.12"
requires-python = ">= 3.11"
[build-system]
build-backend = # your-build-backend
@@ -82,7 +82,7 @@ Create your policy implementation by inheriting from LeRobot's base `PreTrainedP
# modeling_my_custom_policy.py
import torch
import torch.nn as nn
from typing import Any
from typing import Dict, Any
from lerobot.policies.pretrained import PreTrainedPolicy
from .configuration_my_custom_policy import MyCustomPolicyConfig
@@ -91,7 +91,7 @@ class MyCustomPolicy(PreTrainedPolicy):
config_class = MyCustomPolicyConfig
name = "my_custom_policy"
def __init__(self, config: MyCustomPolicyConfig, dataset_stats: dict[str, Any] = None):
def __init__(self, config: MyCustomPolicyConfig, dataset_stats: Dict[str, Any] = None):
super().__init__(config, dataset_stats)
...
```
@@ -102,7 +102,7 @@ Create processor functions:
```python
# processor_my_custom_policy.py
from typing import Any
from typing import Dict, Any
import torch
+4 -7
View File
@@ -13,7 +13,7 @@ The EarthRover Mini Plus is a fully open source mobile robot that connects throu
### Hardware
- EarthRover Mini robot
- Computer with Python 3.12 or newer
- Computer with Python 3.10 or newer
- Internet connection
### Setting Up the Frodobots SDK
@@ -170,13 +170,13 @@ Once you can drive the robot well, you can start recording data to train AI mode
We use Hugging Face to store your data online. First, log in with your token from [Hugging Face settings](https://huggingface.co/settings/tokens):
```bash
hf auth login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
```
Store your Hugging Face username:
```bash
HF_USER=$(hf auth whoami | awk -F': *' 'NR==1 {print $2}')
HF_USER=$(huggingface-cli whoami | head -n 1)
echo $HF_USER
```
@@ -185,16 +185,13 @@ echo $HF_USER
Use the standard recording command:
```bash
lerobot-record \
python src/lerobot/scripts/lerobot_record.py \
--robot.type=earthrover_mini_plus \
--teleop.type=keyboard_rover \
--dataset.repo_id=your_username/dataset_name \
--dataset.num_episodes=2 \
--dataset.fps=10 \
--dataset.single_task="Navigate around obstacles" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--display_data=true
```
+2 -2
View File
@@ -155,10 +155,10 @@ Upload your repository to Hugging Face:
pip install huggingface_hub
# Login to Hugging Face
hf auth login
huggingface-cli login
# Create a new repository
hf repo create my-org/my-custom-env
huggingface-cli repo create my-custom-env --type space --org my-org
# Initialize git and push
git init
+3 -6
View File
@@ -120,12 +120,9 @@ lerobot-record \
--display_data=true \
--dataset.repo_id=<user>/eval_groot-bimanual \
--dataset.num_episodes=10 \
--dataset.single_task="Grab and handover the red cube to the other arm" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--policy.path=<user>/groot-bimanual \ # your trained model
--dataset.episode_time_s=30 \
--dataset.single_task="Grab and handover the red cube to the other arm" \
--policy.path=<user>/groot-bimanual \
--dataset.episode_time_s=30 \
--dataset.reset_time_s=10 # --policy.path is your trained model
```
+5 -11
View File
@@ -224,15 +224,12 @@ lerobot-record \
--teleop.port=/dev/tty.usbmodem1201 \
--teleop.id=right \
--teleop.side=right \
--dataset.repo_id=<USER>/hand_record_test_with_video_data \
--dataset.repo_id=nepyope/hand_record_test_with_video_data \
--dataset.single_task="Hand recording test with video data" \
--dataset.num_episodes=1 \
--dataset.episode_time_s=5 \
--dataset.push_to_hub=true \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--display_data=true
```
@@ -244,7 +241,7 @@ lerobot-replay \
--robot.port=/dev/tty.usbmodem58760432281 \
--robot.id=right \
--robot.side=right \
--dataset.repo_id=<USER>/hand_record_test_with_camera \
--dataset.repo_id=nepyope/hand_record_test_with_camera \
--dataset.episode=0
```
@@ -252,13 +249,13 @@ lerobot-replay \
```bash
lerobot-train \
--dataset.repo_id=<USER>/hand_record_test_with_video_data \
--dataset.repo_id=nepyope/hand_record_test_with_video_data \
--policy.type=act \
--output_dir=outputs/train/hopejr_hand \
--job_name=hopejr \
--policy.device=mps \
--wandb.enable=true \
--policy.repo_id=<USER>/hand_test_policy
--policy.repo_id=nepyope/hand_test_policy
```
### Evaluate
@@ -273,11 +270,8 @@ lerobot-record \
--robot.side=right \
--robot.cameras='{"main": {"type": "opencv", "index_or_path": 0, "width": 640, "height": 480, "fps": 30}}' \
--display_data=false \
--dataset.repo_id=<USER>/eval_hopejr \
--dataset.repo_id=nepyope/eval_hopejr \
--dataset.single_task="Evaluate hopejr hand policy" \
--dataset.num_episodes=10 \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--policy.path=outputs/train/hopejr_hand/checkpoints/last/pretrained_model
```
+6 -12
View File
@@ -159,13 +159,13 @@ We use the Hugging Face hub features for uploading your dataset. If you haven't
Add your token to the CLI by running this command:
```bash
hf auth login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
```
Then store your Hugging Face repository name in a variable:
```bash
HF_USER=$(hf auth whoami | awk -F': *' 'NR==1 {print $2}')
HF_USER=$(hf auth whoami | head -n 1)
echo $HF_USER
```
@@ -185,10 +185,7 @@ lerobot-record \
--display_data=true \
--dataset.repo_id=${HF_USER}/record-test \
--dataset.num_episodes=5 \
--dataset.single_task="Grab the black cube" \
--dataset.streaming_encoding=true \
# --dataset.vcodec=auto \
--dataset.encoder_threads=2
--dataset.single_task="Grab the black cube"
```
</hfoption>
<hfoption id="API example">
@@ -327,7 +324,7 @@ You can look for other LeRobot datasets on the hub by searching for `LeRobot` [t
You can also push your local dataset to the Hub manually, running:
```bash
hf upload ${HF_USER}/record-test ~/.cache/huggingface/lerobot/{repo-id} --repo-type dataset
huggingface-cli upload ${HF_USER}/record-test ~/.cache/huggingface/lerobot/{repo-id} --repo-type dataset
```
#### Record function
@@ -491,7 +488,7 @@ If your local computer doesn't have a powerful GPU you could utilize Google Cola
Once training is done, upload the latest checkpoint with:
```bash
hf upload ${HF_USER}/act_so101_test \
huggingface-cli upload ${HF_USER}/act_so101_test \
outputs/train/act_so101_test/checkpoints/last/pretrained_model
```
@@ -499,7 +496,7 @@ You can also upload intermediate checkpoints with:
```bash
CKPT=010000
hf upload ${HF_USER}/act_so101_test${CKPT} \
huggingface-cli upload ${HF_USER}/act_so101_test${CKPT} \
outputs/train/act_so101_test/checkpoints/${CKPT}/pretrained_model
```
@@ -518,9 +515,6 @@ lerobot-record \
--display_data=false \
--dataset.repo_id=${HF_USER}/eval_so100 \
--dataset.single_task="Put lego brick into the transparent box" \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
# <- Teleop optional if you want to teleoperate in between episodes \
# --teleop.type=so100_leader \
# --teleop.port=/dev/ttyACM0 \
+14 -70
View File
@@ -1,57 +1,30 @@
# Installation
This guide uses `conda` (via miniforge) to manage environments (recommended). If you prefer another environment manager (e.g. `uv`, `venv`), ensure you have Python >=3.12 and `ffmpeg` installed with the `libsvtav1` encoder, then skip ahead to [Environment Setup](#step-2-environment-setup).
## Step 1 (`conda` only): Install [`miniforge`](https://conda-forge.org/download/)
## Install [`miniforge`](https://conda-forge.org/download/)
```bash
wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash Miniforge3-$(uname)-$(uname -m).sh
```
## Step 2: Environment Setup
## Environment Setup
Create a virtual environment with Python 3.12:
Create a virtual environment with Python 3.10, using conda:
<!-- prettier-ignore-start -->
<hfoptions id="create_venv">
<hfoption id="conda">
```bash
conda create -y -n lerobot python=3.12
conda create -y -n lerobot python=3.10
```
</hfoption>
<hfoption id="uv">
Then activate your conda environment. You have to do this each time you open a shell to use lerobot:
```bash
uv python install 3.12
uv venv --python 3.12
```
</hfoption>
</hfoptions>
<!-- prettier-ignore-end -->
Then activate your virtual environment. You have to do this each time you open a shell to use lerobot:
<!-- prettier-ignore-start -->
<hfoptions id="activate_venv">
<hfoption id="conda">```bash
conda activate lerobot
```</hfoption>
<hfoption id="uv">
```bash
# Linux/macOS
source .venv/bin/activate
# Windows PowerShell
.venv\Scripts\Activate.ps1
```
</hfoption>
</hfoptions>
<!-- prettier-ignore-end -->
When using `conda`, install `ffmpeg` in your environment:
```bash
conda install ffmpeg -c conda-forge
ffmpeg -version # ffmpeg 8.X is not yet supported!
```
> [!TIP]
@@ -65,17 +38,7 @@ ffmpeg -version # ffmpeg 8.X is not yet supported !
>
> - _[On Linux only]_ If you want to bring your own ffmpeg: Install [ffmpeg build dependencies](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#GettheDependencies) and [compile ffmpeg from source with libsvtav1](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#libsvtav1), and make sure you use the corresponding ffmpeg binary to your install with `which ffmpeg`.
> [!NOTE]
> When installing LeRobot inside WSL (Windows Subsystem for Linux), make sure to install `evdev` with the following command:
>
> ```bash
> conda install evdev -c conda-forge
> ```
> [!IMPORTANT]
> If you are using `uv` you will have to install `ffmpeg` system-wide (outside of the virtual environment). You rely on `uv` and `torchcodec` ability to dynamically link to the system `ffmpeg`.
## Step 3: Install LeRobot 🤗
## Install LeRobot 🤗
### From Source
@@ -88,45 +51,23 @@ cd lerobot
Then, install the library in editable mode. This is useful if you plan to contribute to the code.
<!-- prettier-ignore-start -->
<hfoptions id="install_lerobot_src">
<hfoption id="conda">
```bash
pip install -e .
```
</hfoption>
<hfoption id="uv">
```bash
uv pip install -e .
```
</hfoption>
</hfoptions>
<!-- prettier-ignore-end -->
### Installation from PyPI
**Core Library:**
Install the base package with:
<!-- prettier-ignore-start -->
<hfoptions id="install_lerobot_pypi">
<hfoption id="conda">
```bash
pip install lerobot
```
</hfoption>
<hfoption id="uv">
```bash
uv pip install lerobot
```
</hfoption>
</hfoptions>
<!-- prettier-ignore-end -->
_This installs only the default dependencies._
**Extra Features:**
To install additional functionality, use one of the following (If you are using `uv`, replace `pip install` with `uv pip install` in the commands below.):
To install additional functionality, use one of the following:
```bash
pip install 'lerobot[all]' # All available features
@@ -140,10 +81,13 @@ _Replace `[...]` with your desired features._
For a full list of optional dependencies, see:
https://pypi.org/project/lerobot/
> [!NOTE]
> For lerobot 0.4.0, if you want to install pi, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`
### Troubleshooting
If you encounter build errors, you may need to install additional dependencies: `cmake`, `build-essential`, and `ffmpeg libs`.
To install these for Linux run:
To install these for linux run:
```bash
sudo apt-get install cmake build-essential python3-dev pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libswscale-dev libswresample-dev libavfilter-dev
@@ -153,7 +97,7 @@ For other systems, see: [Compiling PyAV](https://pyav.org/docs/develop/overview/
## Optional dependencies
LeRobot provides optional extras for specific functionalities. Multiple extras can be combined (e.g., `.[aloha,feetech]`). For all available extras, refer to `pyproject.toml`. If you are using `uv`, replace `pip install` with `uv pip install` in the commands below.
LeRobot provides optional extras for specific functionalities. Multiple extras can be combined (e.g., `.[aloha,feetech]`). For all available extras, refer to `pyproject.toml`.
### Simulations
+2 -2
View File
@@ -279,13 +279,13 @@ We use the Hugging Face hub features for uploading your dataset. If you haven't
Add your token to the CLI by running this command:
```bash
hf auth login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
```
Then store your Hugging Face repository name in a variable:
```bash
HF_USER=$(hf auth whoami | awk -F': *' 'NR==1 {print $2}')
HF_USER=$(huggingface-cli whoami | head -n 1)
echo $HF_USER
```
+1 -4
View File
@@ -41,10 +41,7 @@ lerobot-record \
--display_data=true \
--dataset.repo_id=${HF_USER}/record-test \
--dataset.num_episodes=5 \
--dataset.single_task="Grab the black cube" \
--dataset.streaming_encoding=true \
# --dataset.vcodec=auto \
--dataset.encoder_threads=2
--dataset.single_task="Grab the black cube"
```
See the [recording guide](./il_robots#record-a-dataset) for more details.
-232
View File
@@ -1,232 +0,0 @@
# Multi-Dataset Training
This guide covers how to train a single policy on multiple heterogeneous datasets using `MultiLeRobotDataset`.
## Overview
Real-world robot learning datasets come from different environments, robots, and camera setups. A RoboCasa dataset might have three cameras named `robot0_agentview_left`, `robot0_agentview_right`, and `robot0_eye_in_hand`, while a LIBERO dataset uses `observation.images.front` and `observation.images.wrist`, and a RoboMME dataset uses bare `image` and `wrist_image` keys. State and action dimensions also differ.
`MultiLeRobotDataset` lets you train on all of them jointly by:
- **Mapping** each dataset's feature keys into a shared namespace
- **Padding** features that a dataset doesn't have with zeros
- **Weighting** how often each dataset is sampled
- **Transforming** samples per-dataset (e.g. padding actions to a common dimension)
- **Aggregating** statistics across all sub-datasets for normalization
## Configuration
Multi-dataset training is configured via `MultiDatasetConfig` in a YAML config file. Instead of a single `dataset.repo_id`, you provide a `datasets` list where each entry is a `SubDatasetConfig`.
### SubDatasetConfig fields
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `repo_id` | `str` | required | HuggingFace repo ID or local dataset name |
| `root` | `str \| None` | `None` | Local root directory for the dataset |
| `episodes` | `list[int] \| None` | `None` | Subset of episode indices to use |
| `revision` | `str \| None` | `None` | Dataset version / revision |
| `video_backend` | `str` | auto | Video decoding backend (`pyav`, `torchcodec`, etc.) |
| `weight` | `float` | `1.0` | Relative sampling weight for this dataset |
| `feature_map` | `dict[str, str]` | `{}` | Maps dataset keys to unified policy keys |
| `transforms` | `list` | `None` | Per-dataset transform steps (applied per sample) |
### Example: Three-dataset config
```yaml
dataset:
type: multi
use_imagenet_stats: true
datasets:
# RoboCasa: 3 cameras, state(16), action(12)
- repo_id: pepijn223/robocasa_PrepareCoffee
root: /data/robocasa_PrepareCoffee
weight: 1.0
feature_map:
observation.images.robot0_agentview_left: observation.images.front_left
observation.images.robot0_agentview_right: observation.images.front_right
observation.images.robot0_eye_in_hand: observation.images.wrist
# LIBERO-plus: 2 cameras, state(8), action(7)
- repo_id: pepijn223/libero_plus_lerobot
root: /data/libero_plus_lerobot
weight: 0.5
feature_map:
observation.images.front: observation.images.front_left
observation.images.wrist: observation.images.wrist
transforms:
- type: pad_action
kwargs: {target_dim: 12}
- type: pad_state
kwargs: {target_dim: 16}
# RoboMME: 2 cameras (non-standard keys), state(8), action(8)
- repo_id: pepijn223/robomme_data_lerobot
root: /data/robomme_data_lerobot
weight: 0.3
feature_map:
image: observation.images.front_left
wrist_image: observation.images.wrist
state: observation.state
actions: action
transforms:
- type: pad_action
kwargs: {target_dim: 12}
- type: pad_state
kwargs: {target_dim: 16}
```
## Feature Mapping
The `feature_map` dictionary renames dataset-local keys into a shared namespace. Keys not listed pass through unchanged. In the example above, all three datasets end up with the same camera key names (`observation.images.front_left`, `observation.images.wrist`) even though they use different conventions internally.
After mapping, the **union** of all features across datasets defines the unified schema. If a feature exists in some datasets but not others, it is automatically zero-padded for datasets that lack it, and a boolean `{key}_is_pad` flag is added to the sample so the policy can optionally mask padded features.
## Automatic Padding
When a sub-dataset doesn't have a feature that exists in the unified schema:
- **Images/videos**: padded with a black frame (zeros) matching the expected resolution
- **Float tensors** (state, action): padded with zeros
- **Integer/bool tensors**: padded with zeros / False
A companion `{key}_is_pad = True` tensor is added so the model can distinguish real data from padding.
## Per-Dataset Transforms
Each sub-dataset can have its own `transforms` pipeline that runs after feature renaming but before cross-dataset padding. This is useful for making shapes compatible before PyTorch's collate function stacks the batch.
### Built-in transforms
| Name | Description | Parameters |
|------|-------------|------------|
| `pad_action` | Zero-pad `action` to a target dimension | `target_dim: int` |
| `pad_state` | Zero-pad `observation.state` to a target dimension | `target_dim: int` |
| `resize_images` | Resize all `observation.images.*` tensors | `height: int`, `width: int` |
### Custom transforms
You can register your own transforms in `lerobot/datasets/transforms.py`:
```python
from lerobot.datasets.transforms import DatasetTransformStep, register_dataset_transform
@register_dataset_transform("my_transform")
class MyTransform(DatasetTransformStep):
def __init__(self, some_param: int):
self.some_param = some_param
def __call__(self, sample: dict) -> dict:
# Modify sample in-place or return a new dict
sample["action"] = sample["action"] * self.some_param
return sample
```
Then reference it in the config:
```yaml
transforms:
- type: my_transform
kwargs: {some_param: 2}
```
## Weighted Sampling
The `weight` field on each sub-dataset controls how often it is sampled during training. Weights are relative and automatically normalized to probabilities. For example, with weights `[1.0, 0.5, 0.3]`, the first dataset is sampled roughly 56% of the time, the second 28%, and the third 17%.
This uses `WeightedEpisodeAwareSampler`, which respects episode boundaries (so `drop_n_last_frames` and similar policy settings work correctly) while sampling across datasets proportionally.
## Stats Aggregation
Normalization statistics (mean, std, min, max, quantiles) are automatically aggregated across all sub-datasets using the mapped feature keys. The aggregation uses a weighted parallel variance algorithm so that datasets with more frames contribute proportionally to the global statistics.
The aggregated stats are used by the standard LeRobot preprocessor for normalization during training.
## Training
Launch training the same way as single-dataset training. The factory and training script automatically detect `MultiDatasetConfig` and set up the weighted sampler:
```bash
python -m lerobot.scripts.lerobot_train \
--config_path path/to/multi_dataset_config.yaml
```
## Architecture
The data flow during training with `MultiLeRobotDataset`:
```
┌─────────────────────────────────────────────────────────┐
│ MultiLeRobotDataset.__getitem__(global_idx) │
│ │
│ 1. Map global_idx → (dataset_idx, local_idx) │
│ 2. Fetch sample from sub-dataset │
│ 3. Rename keys via feature_map │
│ 4. Apply per-dataset transforms (pad_action, etc.) │
│ 5. Zero-pad missing features + add _is_pad flags │
│ 6. Add dataset_index tag │
└─────────────────────┬───────────────────────────────────┘
┌────────────▼────────────┐
│ PyTorch DataLoader │
│ (collates into batch) │
└────────────┬────────────┘
┌────────────▼────────────┐
│ LeRobot Preprocessor │
│ (normalize, tokenize) │
└────────────┬────────────┘
┌────────────▼────────────┐
│ Policy forward + loss │
└─────────────────────────┘
```
## API Reference
### `NewMultiLeRobotDataset`
```python
from lerobot.datasets.multi_dataset import NewMultiLeRobotDataset
dataset = NewMultiLeRobotDataset(
configs=[...], # list[SubDatasetConfig]
image_transforms=None, # optional image augmentation
delta_timestamps=None, # optional temporal neighbors
tolerance_s=1e-4, # timestamp tolerance
)
dataset.num_frames # total frames across all sub-datasets
dataset.num_episodes # total episodes
dataset.meta # MultiDatasetMeta (stats, features, episodes)
dataset.dataset_weights # list of per-dataset weights
dataset.features # unified feature dict (union of all mapped features)
dataset.camera_keys # unified camera key list
```
### `WeightedEpisodeAwareSampler`
```python
from lerobot.datasets.sampler import WeightedEpisodeAwareSampler
sampler = WeightedEpisodeAwareSampler(
dataset_from_indices=dataset.meta.episodes["dataset_from_index"],
dataset_to_indices=dataset.meta.episodes["dataset_to_index"],
dataset_membership=dataset.meta.episodes["dataset_source"],
dataset_weights=dataset.dataset_weights,
shuffle=True,
)
```
### `DatasetTransformPipeline`
```python
from lerobot.datasets.transforms import DatasetTransformPipeline, DatasetTransformStepConfig
pipeline = DatasetTransformPipeline([
DatasetTransformStepConfig(type="pad_action", kwargs={"target_dim": 12}),
DatasetTransformStepConfig(type="pad_state", kwargs={"target_dim": 16}),
])
sample = pipeline(sample) # modifies the sample dict
```
+5 -9
View File
@@ -66,13 +66,12 @@ Run on of the examples scripts to teleoperate, record a dataset, replay a datase
All scripts assume you configured your robot (e.g., SO-100 follower) and set the correct serial port.
Additionally you need to **copy the URDF of the robot into the examples folder**. For the examples in this tutorial (using SO100/SO101), copy the `SO101` folder from the [SO-ARM100 repo](https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101) into the `examples/phone_to_so100/` directory, so that the URDF file path becomes `examples/phone_to_so100/SO101/so101_new_calib.urdf`.
Additionally, you need to **copy the URDF of the robot to the examples folder**. For the examples in this tutorial (using SO100/SO101), it is highly recommended to use the URDF in the [SO-ARM100 repo](https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101/so101_new_calib.urdf).
- Run this example to teleoperate:
```bash
cd examples/phone_to_so100
python teleoperate.py
python examples/phone_to_so100/teleoperate.py
```
After running the example:
@@ -85,22 +84,19 @@ Additionally you can customize mapping or safety limits by editing the processor
- Run this example to record a dataset, which saves absolute end effector observations and actions:
```bash
cd examples/phone_to_so100
python record.py
python examples/phone_to_so100/record.py
```
- Run this example to replay recorded episodes:
```bash
cd examples/phone_to_so100
python replay.py
python examples/phone_to_so100/replay.py
```
- Run this example to evaluate a pretrained policy:
```bash
cd examples/phone_to_so100
python evaluate.py
python examples/phone_to_so100/evaluate.py
```
### Important pipeline steps and options
+6 -1
View File
@@ -34,6 +34,11 @@ As described by Physical Intelligence, while AI has achieved remarkable success
pip install -e ".[pi]"
```
> [!NOTE]
> For lerobot 0.4.0, if you want to install the pi tag, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
>
> This will be solved in the next patch release.
## Training Data and Capabilities
π₀ is trained on the largest robot interaction dataset to date, combining three key data sources:
@@ -55,7 +60,7 @@ policy.type=pi0
For training π₀, you can use the standard LeRobot training script with the appropriate configuration:
```bash
lerobot-train \
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your_dataset \
--policy.type=pi0 \
--output_dir=./outputs/pi0_training \
+6 -1
View File
@@ -36,6 +36,11 @@ This diverse training mixture creates a "curriculum" that enables generalization
pip install -e ".[pi]"
```
> [!NOTE]
> For lerobot 0.4.0, if you want to install the pi tag, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
>
> This will be solved in the next patch release.
## Usage
To use π₀.₅ in your LeRobot configuration, specify the policy type as:
@@ -51,7 +56,7 @@ policy.type=pi05
Here's a complete training command for finetuning the base π₀.₅ model on your own dataset:
```bash
lerobot-train \
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your_dataset \
--policy.type=pi05 \
--output_dir=./outputs/pi05_training \
+15 -10
View File
@@ -43,11 +43,16 @@ This approach can transform **any existing VLM** into a VLA by training it to pr
pip install -e ".[pi]"
```
> [!NOTE]
> For lerobot 0.4.0, if you want to install the pi tag, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
>
> This will be solved in the next patch release.
## Training a Custom FAST Tokenizer
You have two options for the FAST tokenizer:
1. **Use the pre-trained tokenizer**: The `lerobot/fast-action-tokenizer` tokenizer was trained on 1M+ real robot action sequences and works as a general-purpose tokenizer.
1. **Use the pre-trained tokenizer**: The `physical-intelligence/fast` tokenizer was trained on 1M+ real robot action sequences and works as a general-purpose tokenizer.
2. **Train your own tokenizer**: For maximum performance on your specific dataset, you can finetune the tokenizer on your own data.
@@ -109,15 +114,15 @@ lerobot-train \
### Key Training Parameters
| Parameter | Description | Default |
| -------------------------------------- | -------------------------------------------------- | ------------------------------- |
| `--policy.gradient_checkpointing=true` | Reduces memory usage significantly during training | `false` |
| `--policy.dtype=bfloat16` | Use mixed precision training for efficiency | `float32` |
| `--policy.chunk_size` | Number of action steps to predict (action horizon) | `50` |
| `--policy.n_action_steps` | Number of action steps to execute | `50` |
| `--policy.max_action_tokens` | Maximum number of FAST tokens per action chunk | `256` |
| `--policy.action_tokenizer_name` | FAST tokenizer to use | `lerobot/fast-action-tokenizer` |
| `--policy.compile_model=true` | Enable torch.compile for faster training | `false` |
| Parameter | Description | Default |
| -------------------------------------- | -------------------------------------------------- | ---------------------------- |
| `--policy.gradient_checkpointing=true` | Reduces memory usage significantly during training | `false` |
| `--policy.dtype=bfloat16` | Use mixed precision training for efficiency | `float32` |
| `--policy.chunk_size` | Number of action steps to predict (action horizon) | `50` |
| `--policy.n_action_steps` | Number of action steps to execute | `50` |
| `--policy.max_action_tokens` | Maximum number of FAST tokens per action chunk | `256` |
| `--policy.action_tokenizer_name` | FAST tokenizer to use | `physical-intelligence/fast` |
| `--policy.compile_model=true` | Enable torch.compile for faster training | `false` |
## Inference
-6
View File
@@ -159,9 +159,6 @@ lerobot-record \
--dataset.fps=15 \
--dataset.push_to_hub=true \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--display_data=true
```
@@ -201,9 +198,6 @@ lerobot-record \
--dataset.fps=15 \
--dataset.push_to_hub=true \
--dataset.private=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
--display_data=true
```
+4 -4
View File
@@ -269,7 +269,7 @@ This generates visualizations showing video frames with subtask boundaries overl
Train with **no annotations** - uses linear progress from 0 to 1:
```bash
lerobot-train \
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your-username/your-dataset \
--policy.type=sarm \
--policy.annotation_mode=single_stage \
@@ -288,7 +288,7 @@ lerobot-train \
Train with **dense annotations only** (sparse auto-generated):
```bash
lerobot-train \
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your-username/your-dataset \
--policy.type=sarm \
--policy.annotation_mode=dense_only \
@@ -307,7 +307,7 @@ lerobot-train \
Train with **both sparse and dense annotations**:
```bash
lerobot-train \
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your-username/your-dataset \
--policy.type=sarm \
--policy.annotation_mode=dual \
@@ -468,7 +468,7 @@ This script:
Once you have the progress file, train your policy with RA-BC weighting. The progress file is auto-detected from the dataset path (`sarm_progress.parquet`). Currently PI0, PI0.5 and SmolVLA are supported with RA-BC:
```bash
lerobot-train \
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your-username/your-dataset \
--policy.type=pi0 \
--use_rabc=true \
-3
View File
@@ -106,9 +106,6 @@ lerobot-record \
--dataset.repo_id=${HF_USER}/eval_DATASET_NAME_test \ # <- This will be the dataset name on HF Hub
--dataset.episode_time_s=50 \
--dataset.num_episodes=10 \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2 \
# --dataset.vcodec=auto \
# <- Teleop optional if you want to teleoperate in between episodes \
# --teleop.type=so100_leader \
# --teleop.port=/dev/ttyACM0 \
-155
View File
@@ -1,155 +0,0 @@
# Streaming Video Encoding Guide
## 1. Overview
Streaming video encoding eliminates the traditional PNG round-trip during video dataset recording. Instead of:
1. Capture frame -> write PNG to disk -> (at episode end) read PNGs -> encode to MP4 -> delete PNGs
Frames can be encoded in real-time during capture:
1. Capture frame -> queue to encoder thread -> encode to MP4 directly
This makes `save_episode()` near-instant (the video is already encoded by the time the episode ends) and removes the blocking wait that previously occurred between episodes, especially with multiple cameras in long episodes.
## 2. Tuning Parameters
| Parameter | CLI Flag | Type | Default | Description |
| ----------------------- | --------------------------------- | ------------- | ------------- | ----------------------------------------------------------------- |
| `streaming_encoding` | `--dataset.streaming_encoding` | `bool` | `True` | Enable real-time encoding during capture |
| `vcodec` | `--dataset.vcodec` | `str` | `"libsvtav1"` | Video codec. `"auto"` detects best HW encoder |
| `encoder_threads` | `--dataset.encoder_threads` | `int \| None` | `None` (auto) | Threads per encoder instance. `None` lets the codec decide |
| `encoder_queue_maxsize` | `--dataset.encoder_queue_maxsize` | `int` | `60` | Max buffered frames per camera (~2s at 30fps). Consumes RAM |
## 3. Performance Considerations
Streaming encoding means the CPU is encoding video **during** the capture loop, not after. This creates a CPU budget that must be shared between:
- **Control loop** (reading cameras, controlling the robot, writing non-video data)
- **Encoder threads** (one pool per camera)
- **Rerun visualization** (if enabled)
- **OS and other processes**
### Resolution & Number of Cameras Impact
| Setup | Throughput (px/sec) | CPU Encoding Load | Notes |
| ------------------------- | ------------------- | ----------------- | ------------------------------ |
| 2camsx 640x480x3 @30fps | 55M | Low | Works on most systems |
| 2camsx 1280x720x3 @30fps | 165M | Moderate | Comfortable on modern systems |
| 2camsx 1920x1080x3 @30fps | 373M | High | Requires powerful high-end CPU |
### `encoder_threads` Tuning
This parameter controls how many threads each encoder instance uses internally:
- **Higher values** (e.g., 4-5): Faster encoding, but uses more CPU cores per camera. Good for high-end systems with many cores.
- **Lower values** (e.g., 1-2): Less CPU per camera, freeing cores for capture and visualization. Good for low-res images and capable CPUs.
- **`None` (default)**: Lets the codec decide. Information available in the codec logs.
### Backpressure and Frame Dropping
Each camera has a bounded queue (`encoder_queue_maxsize`, default 60 frames). When the encoder can't keep up:
1. The queue fills up (consuming RAM)
2. New frames are **dropped** (not blocked) — the capture loop continues uninterrupted
3. A warning is logged: `"Encoder queue full for {camera}, dropped N frame(s)"`
4. At episode end, total dropped frames per camera are reported
### Symptoms of Encoder Falling Behind
- **System feels laggy and freezes**: all CPUs are at 100%
- **Dropped frame warnings** in the log or lower frames/FPS than expected in the recorded dataset
- **Choppy robot movement**: If CPU is severely overloaded, even the capture loop may be affected
- **Accumulated rerun lag**: Visualization falls behind real-time
## 4. Hardware-Accelerated Encoding
### When to Use
Use HW encoding when:
- CPU is the bottleneck (dropped frames, choppy robot, rerun lag)
- You have compatible hardware (GPU or dedicated encoder)
- You're recording at high throughput (high resolution or with many cameras)
### Choosing a Codec
| Codec | CPU Usage | File Size | Quality | Notes |
| --------------------- | --------- | -------------- | ------- | ---------------------------------------------------------------- |
| `libsvtav1` (default) | High | Smallest | Best | Default. Best compression but most CPU-intensive |
| `h264` | Medium | ~30-50% larger | Good | Software H.264. Lower CPU |
| HW encoders | Very Low | Largest | Good | Offloads to dedicated hardware. Best for CPU-constrained systems |
### Available HW Encoders
| Encoder | Platform | Hardware | CLI Value |
| ------------------- | ------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------ |
| `h264_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=h264_videotoolbox` |
| `hevc_videotoolbox` | macOS | Apple Silicon / Intel | `--dataset.vcodec=hevc_videotoolbox` |
| `h264_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=h264_nvenc` |
| `hevc_nvenc` | Linux/Windows | NVIDIA GPU | `--dataset.vcodec=hevc_nvenc` |
| `h264_vaapi` | Linux | Intel/AMD GPU | `--dataset.vcodec=h264_vaapi` |
| `h264_qsv` | Linux/Windows | Intel Quick Sync | `--dataset.vcodec=h264_qsv` |
| `auto` | Any | Probes the system for available HW encoders. Falls back to `libsvtav1` if no HW encoder is found | `--dataset.vcodec=auto` |
> [!NOTE]
> In order to use the HW accelerated encoders you might need to upgrade your GPU drivers.
> [!NOTE]
> `libsvtav1` is the default because it provides the best training performance; other vcodecs can reduce CPU usage and be faster, but they typically produce larger files and may affect training time.
## 5. Troubleshooting
| Symptom | Likely Cause | Fix |
| ------------------------------------------------------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| System freezes or choppy robot movement or Rerun visualization lag | CPU starved (100% load usage) | Close other apps, reduce encoding throughput, lower `encoder_threads`, use `h264`, use `display_data=False`. If the CPU continues to be at 100% then it might be insufficient for your setup, consider `--dataset.streaming_encoding=false` or HW encoding (`--dataset.vcodec=auto`) |
| "Encoder queue full" warnings or dropped frames in dataset | Encoder can't keep up (Queue overflow) | If CPU is not at 100%: Increase `encoder_threads`, increase `encoder_queue_maxsize` or use HW encoding (`--dataset.vcodec=auto`). |
| High RAM usage | Queue filling faster than encoding | `encoder_threads` too low or CPU insufficient. Reduce `encoder_queue_maxsize` or use HW encoding |
| Large video files | Using HW encoder or H.264 | Expected trade-off. Switch to `libsvtav1` if CPU allows |
| `save_episode()` still slow | `streaming_encoding` is `False` | Set `--dataset.streaming_encoding=true` |
| Encoder thread crash | Codec not available or invalid settings | Check `vcodec` is installed, try `--dataset.vcodec=auto` |
| Recorded dataset is missing frames | CPU/GPU starvation or occasional load spikes | If ~5% of frames are missing, your system is likely overloaded — follow the recommendations above. If fewer frames are missing (~2%), they are probably due to occasional transient load spikes (often at startup) and can be considered expected. |
## 6. Recommended Configurations
These estimates are conservative; we recommend testing them on your setup—start with a low load and increase it gradually.
### High-End Systems: modern 12+ cores (24+ threads)
A throughput between ~250-500M px/sec should be comfortable in CPU. For even better results try HW encoding if available.
```bash
# 3camsx 1280x720x3 @30fps: Defaults work well. Optionally increase encoder parallelism.
# 2camsx 1920x1080x3 @30fps: Defaults work well. Optionally increase encoder parallelism.
lerobot-record --dataset.encoder_threads=5 ...
# 3camsx 1920x1080x3 @30fps: Might require some tuning.
```
### Mid-Range Systems: modern 8+ cores (16+ threads) or Apple Silicon
A throughput between ~80-300M px/sec should be possible in CPU.
```bash
# 3camsx 640x480x3 @30fps: Defaults work well. Optionally decrease encoder parallelism.
# 2camsx 1280x720x3 @30fps: Defaults work well. Optionally decrease encoder parallelism.
lerobot-record --dataset.encoder_threads=2 ...
# 2camsx 1920x1080x3 @30fps: Might require some tuning.
```
### Low-Resource Systems: modern 4+ cores (8+ threads) or Raspberry Pi 5
On very constrained systems, streaming encoding may compete too heavily with the capture loop. Disabling it falls back to the PNG-based approach where encoding happens between episodes (blocking, but doesn't interfere with capture). Alternatively, record at a lower throughput to reduce both capture and encoding load. Consider also changing codec to `h264` and using batch encoding.
```bash
# 2camsx 640x480x3 @30fps: Requires some tuning.
# Use H.264, disable streaming, consider batching encoding
lerobot-record --dataset.vcodec=h264 --dataset.streaming_encoding=false ...
```
## 7. Closing note
Performance ultimately depends on your exact setup — frames-per-second, resolution, CPU cores and load, available memory, episode length, and the encoder you choose. Always test with your target workload, be mindful about your CPU & system capabilities and tune `encoder_threads`, `encoder_queue_maxsize`, and
`vcodec` reasonably. That said, a common practical configuration (for many applications) is three cameras at 640×480x3 @30fps; this usually runs fine with the default streaming video encoding settings in modern systems. Always verify your recorded dataset is healthy by comparing the video duration to the CLI episode duration and confirming the row count equals FPS × CLI duration.
+181 -141
View File
@@ -1,49 +1,23 @@
# Unitree G1
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/unitree_thumbnail.jpg"
alt="Unitree G1 locomanipulation demo"
style={{ width: "100%" }}
/>
This guide covers the complete setup process for the Unitree G1 humanoid, from initial connection to running gr00t_wbc locomotion.
The Unitree G1 humanoid is now supported in LeRobot! You can teleoperate, train locomanipulation policies, test in sim, and more. Both 29 and 23 DoF variants are supported.
## About
We support both the 29 and 23 DoF G1 EDU versions. We introduce:
- **`unitree g1` robot class, handling low level read/write from/to the humanoid**
- **ZMQ socket bridge** for remote communication and camera streaming, allowing for remote policy deployment over wlan, eth or directly on the robot
- **Locomotion policies** from NVIDIA gr00t and Amazon FAR Holosoma
- **Simulation mode** for testing policies without the physical robot in mujoco
---
## Part 1: Getting Started
## Connection guide
### Install LeRobot on Your Machine
### Step 1: Configure Ethernet Interface
```bash
conda create -y -n lerobot python=3.12
conda activate lerobot
git clone https://github.com/unitreerobotics/unitree_sdk2_python.git
cd unitree_sdk2_python && pip install -e .
git clone https://github.com/huggingface/lerobot.git
cd lerobot
pip install -e '.[unitree_g1]'
```
### Test the Installation (Simulation)
```bash
lerobot-teleoperate \
--robot.type=unitree_g1 \
--robot.is_simulation=true \
--teleop.type=unitree_g1 \
--teleop.id=wbc_unitree \
--robot.cameras='{"global_view": {"type": "zmq", "server_address": "localhost", "port": 5555, "camera_name": "head_camera", "width": 640, "height": 480, "fps": 30}}' \
--display_data=true
```
This will launch a [MuJoCo sim instance](https://huggingface.co/lerobot/unitree-g1-mujoco/tree/main) for the G1.
- Press `9` to release the robot
- Press `7` / `8` to increase / decrease waist height
### Connect to the Robot
The G1's Ethernet IP is fixed at `192.168.123.164`. Your machine must have a static IP on the same subnet: `192.168.123.x` where `x ≠ 164`.
Set a static IP on the same subnet as the robot:
```bash
# Replace 'enp131s0' with your ethernet interface name (check with `ip a`)
@@ -52,200 +26,266 @@ sudo ip addr add 192.168.123.200/24 dev enp131s0
sudo ip link set enp131s0 up
```
### SSH into the Robot
**Note**: The G1's Ethernet IP is fixed at `192.168.123.164`. Your computer must use `192.168.123.x` with x ≠ 164.
### Step 2: SSH into the Robot
```bash
ssh unitree@192.168.123.164
# Password: 123
```
### Install LeRobot on the G1
From the robot:
```bash
conda create -y -n lerobot python=3.12
conda activate lerobot
git clone https://github.com/unitreerobotics/unitree_sdk2_python.git
cd unitree_sdk2_python && pip install -e .
git clone https://github.com/huggingface/lerobot.git
cd lerobot
pip install -e '.[unitree_g1]'
```
> **Note:** The Unitree SDK requires CycloneDDS v0.10.2. See the [Unitree SDK docs](https://github.com/unitreerobotics/unitree_sdk2_python) for details.
You should now be connected to the G1's Orin.
---
## Part 2: Enable WiFi on the Robot
Wi-Fi connectivity is blocked by default on the G1. To activate:
Wlan0 is disabled by default on the G1. To enable it:
### Step 1: Enable WiFi Hardware
```bash
sudo rfkill unblock wifi
sudo rfkill unblock all
# Bring up wlan0
sudo ip link set wlan0 up
# Enable NetworkManager control of wlan0
sudo nmcli radio wifi on
sudo nmcli device set wlan0 managed yes
sudo systemctl restart NetworkManager
```
**On your laptop** (share internet via Ethernet):
### Step 2: Enable Internet Forwarding
**On your laptop:**
```bash
# Enable IP forwarding
sudo sysctl -w net.ipv4.ip_forward=1
# Replace wlp132s0f0 with your WiFi interface name
# Set up NAT (replace wlp132s0f0 with your WiFi interface)
sudo iptables -t nat -A POSTROUTING -o wlp132s0f0 -s 192.168.123.0/24 -j MASQUERADE
sudo iptables -A FORWARD -i wlp132s0f0 -o enp131s0 -m state --state RELATED,ESTABLISHED -j ACCEPT
sudo iptables -A FORWARD -i enp131s0 -o wlp132s0f0 -j ACCEPT
```
**On the G1** (set default route through your laptop):
**On the G1:**
```bash
# Add laptop as default gateway
sudo ip route del default 2>/dev/null || true
sudo ip route add default via 192.168.123.200 dev eth0
echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf
# Verify
# Test connection
ping -c 3 8.8.8.8
```
**Connect to a WiFi network:**
### Step 3: Connect to WiFi Network
```bash
# List available networks
nmcli device wifi list
# Connect to your WiFi (example)
sudo nmcli connection add type wifi ifname wlan0 con-name "YourNetwork" ssid "YourNetwork"
sudo nmcli connection modify "YourNetwork" wifi-sec.key-mgmt wpa-psk
sudo nmcli connection modify "YourNetwork" wifi-sec.psk "YourPassword"
sudo nmcli connection modify "YourNetwork" connection.autoconnect yes
sudo nmcli connection up "YourNetwork"
# Check WiFi IP address
ip a show wlan0
```
You can now SSH over WiFi:
### Step 4: SSH Over WiFi
Once connected to WiFi, note the robot's IP address and disconnect the Ethernet cable. You can now SSH over WiFi:
```bash
ssh unitree@<ROBOT_WIFI_IP>
ssh unitree@<YOUR_ROBOT_IP>
# Password: 123
```
Replace `<YOUR_ROBOT_IP>` with your robot's actual WiFi IP address.
---
## Part 3: Teleoperation & Locomotion
## Part 3: Robot Server Setup
### Run the Robot Server
### Step 1: Install LeRobot on the Orin
SSH into the robot and install LeRobot:
```bash
ssh unitree@<YOUR_ROBOT_IP>
conda create -y -n lerobot python=3.10
conda activate lerobot
git clone https://github.com/huggingface/lerobot.git
cd lerobot
pip install -e '.[unitree_g1]'
git clone https://github.com/unitreerobotics/unitree_sdk2_python.git
cd unitree_sdk2_python && pip install -e .
```
**Note**: The Unitree SDK requires CycloneDDS v0.10.2 to be installed. See the [Unitree SDK documentation](https://github.com/unitreerobotics/unitree_sdk2_python) for details.
### Step 2: Run the Robot Server
On the robot:
```bash
python src/lerobot/robots/unitree_g1/run_g1_server.py --camera
python src/lerobot/robots/unitree_g1/run_g1_server.py
```
### Run the Locomotion Policy
```bash
lerobot-teleoperate \
--robot.type=unitree_g1 \
--robot.is_simulation=false \
--robot.robot_ip=<ROBOT_IP> \
--teleop.type=unitree_g1 \
--teleop.id=wbc_unitree \
--robot.cameras='{"global_view": {"type": "zmq", "server_address": "<ROBOT_IP>", "port": 5555, "camera_name": "head_camera", "width": 640, "height": 480, "fps": 30}}' \
--display_data=true \
--robot.controller=HolosomaLocomotionController
```
We support both [HolosomaLocomotionController](https://github.com/amazon-far/holosoma) and [GrootLocomotionController](https://github.com/NVlabs/GR00T-WholeBodyControl).
**Important**: Keep this terminal running. The server must be active for remote control.
---
## Part 4: Loco-Manipulation with the Homunculus Exoskeleton
## Part 4: Controlling the robot
We provide a loco-manipulation solution via the Homunculus Exoskeleton — an open-source 7 DoF exoskeleton for whole-body control. Assembly instructions [here](https://github.com/nepyope/hmc_exo).
With the robot server running, you can now control the robot remotely. Let's launch a locomotion policy.
### Calibrate
### Step 1: Install LeRobot on your machine
```bash
conda create -y -n lerobot python=3.10
conda activate lerobot
git clone https://github.com/huggingface/lerobot.git
cd lerobot
pip install -e '.[unitree_g1]'
git clone https://github.com/unitreerobotics/unitree_sdk2_python.git
cd unitree_sdk2_python && pip install -e .
```
### Step 2: Update Robot IP in Config
Edit the config file to match your robot's WiFi IP:
```python
# In src/lerobot/robots/unitree_g1/config_unitree_g1.py
robot_ip: str = "<YOUR_ROBOT_IP>" # Replace with your robot's WiFi IP.
```
### Step 3: Run the Locomotion Policy
```bash
# Run GR00T locomotion controller
python examples/unitree_g1/gr00t_locomotion.py --repo-id "nepyope/GR00T-WholeBodyControl_g1"
# Run Holosoma locomotion controller
python examples/unitree_g1/holosoma_locomotion.py
```
Press `Ctrl+C` to stop the policy.
---
## Running in Simulation Mode (MuJoCo)
You can test policies before deploying on the physical robot using MuJoCo simulation. Set `is_simulation=True` in config or pass `--robot.is_simulation=true` via CLI.
### Calibrate Exoskeleton Teleoperator
```bash
lerobot-calibrate \
--teleop.type=unitree_g1 \
--teleop.left_arm_config.port=/dev/ttyACM1 \
--teleop.right_arm_config.port=/dev/ttyACM0 \
--teleop.id=exo
--teleop.type=unitree_g1 \
--teleop.left_arm_config.port=/dev/ttyACM1 \
--teleop.right_arm_config.port=/dev/ttyACM0 \
--teleop.id=exo
```
During calibration, move each joint through its entire range. After fitting, move the joint to a neutral position and press `n` to advance.
### Record a Dataset
### Teleoperate in Simulation
```bash
lerobot-record \
--robot.type=unitree_g1 \
--robot.is_simulation=true \
--robot.cameras='{"global_view": {"type": "zmq", "server_address": "localhost", "port": 5555, "camera_name": "head_camera", "width": 640, "height": 480, "fps": 30}}' \
--teleop.type=unitree_g1 \
--teleop.left_arm_config.port=/dev/ttyACM1 \
--teleop.right_arm_config.port=/dev/ttyACM0 \
--teleop.id=exo \
--dataset.repo_id=your-username/dataset-name \
--dataset.single_task="Test" \
--dataset.num_episodes=2 \
--dataset.episode_time_s=5 \
--dataset.reset_time_s=5 \
--dataset.push_to_hub=true \
--dataset.streaming_encoding=true \
--dataset.encoder_threads=2
lerobot-teleoperate \
--robot.type=unitree_g1 \
--robot.is_simulation=true \
--teleop.type=unitree_g1 \
--teleop.left_arm_config.port=/dev/ttyACM1 \
--teleop.right_arm_config.port=/dev/ttyACM0 \
--teleop.id=exo \
--fps=100
```
> **Note:** Omit `--teleop.left_arm_config.port` and `--teleop.right_arm_config.port` if you're only using the joystick.
### Record Dataset in Simulation
Example dataset: [nepyope/unitree_box_move_blue_full](https://huggingface.co/datasets/nepyope/unitree_box_move_blue_full)
```bash
python -m lerobot.scripts.lerobot_record \
--robot.type=unitree_g1 \
--robot.is_simulation=true \
--robot.cameras='{"global_view": {"type": "zmq", "server_address": "localhost", "port": 5555, "camera_name": "head_camera", "width": 640, "height": 480, "fps": 30}}' \
--teleop.type=unitree_g1 \
--teleop.left_arm_config.port=/dev/ttyACM1 \
--teleop.right_arm_config.port=/dev/ttyACM0 \
--teleop.id=exo \
--dataset.repo_id=your-username/dataset-name \
--dataset.single_task="Test" \
--dataset.num_episodes=2 \
--dataset.episode_time_s=5 \
--dataset.reset_time_s=5 \
--dataset.push_to_hub=true
```
Example simulation dataset: [nepyope/teleop_test_sim](https://huggingface.co/datasets/nepyope/teleop_test_sim)
---
## Part 5: Training & Inference
## Running on Real Robot
### Train
Once the robot server is running on the G1 (see Part 3), you can teleoperate and record on the real robot.
### Start the Camera Server
On the robot, start the ZMQ image server:
```bash
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your-username/dataset-name \
--policy.type=pi05 \
--output_dir=./outputs/pi05_training \
--job_name=pi05_training \
--policy.repo_id=your-username/your-repo-id \
--policy.pretrained_path=lerobot/pi05_base \
--policy.compile_model=true \
--policy.gradient_checkpointing=true \
--wandb.enable=true \
--policy.dtype=bfloat16 \
--policy.freeze_vision_encoder=false \
--policy.train_expert_only=false \
--steps=3000 \
--policy.device=cuda \
--batch_size=32
python src/lerobot/cameras/zmq/image_server.py
```
### Inference with RTC
Keep this running in a separate terminal for camera streaming during recording.
Once trained, we recommend deploying policies using inference-time RTC:
### Teleoperate Real Robot
```bash
python examples/rtc/eval_with_real_robot.py \
--policy.path=your-username/your-repo-id \
--policy.device=cuda \
--robot.type=unitree_g1 \
--robot.is_simulation=false \
--robot.controller=HolosomaLocomotionController \
--robot.cameras='{"global_view": {"type": "zmq", "server_address": "<ROBOT_IP>", "port": 5555, "camera_name": "head_camera", "width": 640, "height": 480, "fps": 30}}' \
--task="task_description" \
--duration=1000 \
--fps=30 \
--rtc.enabled=true
lerobot-teleoperate \
--robot.type=unitree_g1 \
--robot.is_simulation=false \
--teleop.type=unitree_g1 \
--teleop.left_arm_config.port=/dev/ttyACM1 \
--teleop.right_arm_config.port=/dev/ttyACM0 \
--teleop.id=exo \
--fps=100
```
### Record Dataset on Real Robot
```bash
python -m lerobot.scripts.lerobot_record \
--robot.type=unitree_g1 \
--robot.is_simulation=false \
--robot.cameras='{"global_view": {"type": "zmq", "server_address": "172.18.129.215", "port": 5555, "camera_name": "head_camera", "width": 640, "height": 480, "fps": 30}}' \
--teleop.type=unitree_g1 \
--teleop.left_arm_config.port=/dev/ttyACM1 \
--teleop.right_arm_config.port=/dev/ttyACM0 \
--teleop.id=exo \
--dataset.repo_id=your-username/dataset-name \
--dataset.single_task="Test" \
--dataset.num_episodes=2 \
--dataset.episode_time_s=5 \
--dataset.reset_time_s=5 \
--dataset.push_to_hub=true
```
**Note**: Update `server_address` to match your robot's camera server IP.
Example real robot dataset: [nepyope/teleop_test_real](https://huggingface.co/datasets/nepyope/teleop_test_real)
---
## Additional Resources
@@ -254,8 +294,8 @@ python examples/rtc/eval_with_real_robot.py \
- [GR00T-WholeBodyControl](https://github.com/NVlabs/GR00T-WholeBodyControl)
- [Holosoma](https://github.com/amazon-far/holosoma)
- [LeRobot Documentation](https://github.com/huggingface/lerobot)
- [Unitree IL LeRobot](https://github.com/unitreerobotics/unitree_IL_lerobot)
- [Unitree_IL_Lerobot](https://github.com/unitreerobotics/unitree_IL_lerobot)
---
_Last updated: March 2026_
_Last updated: December 2025_
-25
View File
@@ -12,7 +12,6 @@ LeRobot provides several utilities for manipulating datasets:
4. **Add Features** - Add new features to a dataset
5. **Remove Features** - Remove features from a dataset
6. **Convert to Video** - Convert image-based datasets to video format for efficient storage
7. **Show the Info of Datasets** - Show a summary of dataset information, such as the number of episodes.
The core implementation is in `lerobot.datasets.dataset_tools`.
An example script detailing how to use the tools API is available in `examples/dataset/use_dataset_tools.py`.
@@ -157,30 +156,6 @@ lerobot-edit-dataset \
**Note:** The resulting dataset will be a proper LeRobotDataset with all cameras encoded as videos in the `videos/` directory, with parquet files containing only metadata (no raw image data). All episodes, stats, and tasks are preserved.
### Show the information of datasets
Show dataset information such as the number of episodes, the number of frames, the file size, and so on.
No change will be made to the dataset.
```bash
# Show dataset information without feature details
lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
--operation.type info \
# Show dataset information with feature details
lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
--operation.type info \
--operation.show_features true
```
**Parameters:**
- `show_features`: Flag controlling whether feature details are included in the displayed dataset information (default: `false`).
### Push to Hub
Add the `--push_to_hub true` flag to any command to automatically upload the resulting dataset to the Hugging Face Hub:
+1 -1
View File
@@ -45,7 +45,7 @@ policy.type=wall_x
For training WallX, you can use the standard LeRobot training script with the appropriate configuration:
```bash
lerobot-train \
python src/lerobot/scripts/lerobot_train.py \
--dataset.repo_id=your_dataset \
--policy.type=wall_x \
--output_dir=./outputs/wallx_training \
+1 -1
View File
@@ -154,7 +154,7 @@ lerobot-train \
```bash
lerobot-train \
--dataset.repo_id=<USER>/bimanual-so100-handover-cube \
--dataset.repo_id=pepijn223/bimanual-so100-handover-cube \
--output_dir=./outputs/xvla_bimanual \
--job_name=xvla_so101_training \
--policy.path="lerobot/xvla-base" \
+2 -2
View File
@@ -22,7 +22,7 @@ lerobot-replay \
--robot.type=so100_follower \
--robot.port=/dev/tty.usbmodem58760431541 \
--robot.id=black \
--dataset.repo_id=<USER>/record-test \
--dataset.repo_id=aliberts/record-test \
--dataset.episode=2
```
"""
@@ -57,7 +57,7 @@ class DatasetReplayConfig:
repo_id: str
# Episode to replay.
episode: int
# Root directory where the dataset will be stored (e.g. 'dataset/path'). If None, defaults to $HF_LEROBOT_HOME/repo_id.
# Root directory where the dataset will be stored (e.g. 'dataset/path').
root: str | Path | None = None
# Limit the frames per second. By default, uses the policy fps.
fps: int = 30
-490
View File
@@ -1,490 +0,0 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SLURM-distributed SARM RA-BC annotation pipeline.
Computes SARM progress values for all frames in a dataset, distributed across
SLURM workers, then merges the shards into a single sarm_progress.parquet.
Two subcommands, each a separate SLURM submission:
compute N workers, each computes progress for a subset of episodes
aggregate 1 worker, merges N shards into sarm_progress.parquet, pushes to hub
Usage:
python slurm_compute_rabc.py compute \\
--repo-id user/dataset --reward-model-path user/sarm_model \\
--stride 10 --device cpu --workers 50 --partition cpu
python slurm_compute_rabc.py aggregate \\
--repo-id user/dataset --reward-model-path user/sarm_model \\
--partition cpu --push-to-hub
"""
import argparse
from pathlib import Path
from datatrove.executor import LocalPipelineExecutor
from datatrove.executor.slurm import SlurmPipelineExecutor
from datatrove.pipeline.base import PipelineStep
class ComputeProgressShards(PipelineStep):
    """Pipeline step in which each worker computes SARM progress for its share of episodes.

    Episodes are distributed round-robin over ranks (``rank``, ``rank + world_size``, ...).
    Every worker writes its rows to ``<shard_dir>/shard_<rank>.parquet``; the reward
    model path is stored in the parquet schema metadata of each shard.

    Args:
        repo_id: Dataset identifier, forwarded to ``load_sarm_resources``.
        reward_model_path: Reward model location, forwarded to ``load_sarm_resources``.
        stride: Run the reward model only on every ``stride``-th frame of an episode
            (the last frame is always included); other frames are filled in with
            ``interpolate_progress``. Must be >= 1.
        head_mode: Which reward head(s) to evaluate: ``"sparse"``, ``"dense"`` or ``"both"``.
        device: Device the extracted features are moved to before inference.
        shard_dir: Directory that receives the per-rank parquet shard.

    Raises:
        ValueError: If ``stride`` is smaller than 1 or ``head_mode`` is not a known mode.
    """

    def __init__(
        self, repo_id, reward_model_path, stride=1, head_mode="sparse", device="cpu", shard_dir="rabc_shards"
    ):
        super().__init__()
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        # Fail fast: with an unknown mode and a dual-head model neither head would be
        # computed and the shard would contain rows without any progress column.
        valid_head_modes = ("sparse", "dense", "both")
        if head_mode not in valid_head_modes:
            raise ValueError(f"head_mode must be one of {valid_head_modes}, got {head_mode!r}")
        self.repo_id = repo_id
        self.reward_model_path = reward_model_path
        self.stride = stride
        self.head_mode = head_mode
        self.device = device
        self.shard_dir = shard_dir

    def run(self, data=None, rank: int = 0, world_size: int = 1):
        """Compute progress values for this rank's episodes and write one parquet shard.

        Args:
            data: Unused; present to match the pipeline step call signature.
            rank: Index of this worker; selects the episodes ``rank::world_size``.
            world_size: Total number of workers.
        """
        # Heavy dependencies are imported lazily so that they are only needed on the worker.
        import logging
        from pathlib import Path

        import numpy as np
        import pyarrow as pa
        import pyarrow.parquet as pq
        import torch
        from tqdm import tqdm

        from lerobot.policies.sarm.compute_rabc_weights import (
            generate_all_frame_indices,
            interpolate_progress,
            load_sarm_resources,
        )
        from lerobot.utils.utils import init_logging

        init_logging()

        dataset, reward_model, preprocess = load_sarm_resources(
            self.repo_id,
            self.reward_model_path,
            self.device,
        )

        # Put the preprocessor and each of its steps into eval mode where supported.
        if hasattr(preprocess, "eval"):
            preprocess.eval()
        for step in preprocess.steps:
            if hasattr(step, "eval"):
                step.eval()

        image_key = reward_model.config.image_key
        state_key = reward_model.config.state_key
        frame_gap = reward_model.config.frame_gap
        center_idx = reward_model.config.n_obs_steps // 2
        dual_mode = reward_model.config.uses_dual_heads

        # A single-head model always reports its value in the sparse column.
        compute_sparse = self.head_mode in ("sparse", "both") or not dual_mode
        compute_dense = self.head_mode in ("dense", "both") and dual_mode

        def head_progress(mode, text_feats, video_feats, state_feats, lengths):
            """Return the reward of the centre observation step for one head."""
            rewards = reward_model.calculate_rewards(
                text_embeddings=text_feats,
                video_embeddings=video_feats,
                state_features=state_feats,
                lengths=lengths,
                return_all_frames=True,
                head_mode=mode,
            )
            return float(rewards[0, center_idx] if rewards.ndim == 2 else rewards[center_idx])

        my_episodes = list(range(dataset.num_episodes))[rank::world_size]
        if not my_episodes:
            logging.info(f"Rank {rank}: no episodes assigned")
            return
        logging.info(f"Rank {rank}: {len(my_episodes)} / {dataset.num_episodes} episodes")

        all_rows = []
        for ep_idx in tqdm(my_episodes, desc=f"Rank {rank}"):
            ep = dataset.meta.episodes[ep_idx]
            ep_start, ep_end = ep["dataset_from_index"], ep["dataset_to_index"]
            task = dataset[ep_start].get("task", "perform the task")

            all_ep_indices = generate_all_frame_indices(ep_start, ep_end, frame_gap)
            if self.stride > 1:
                # Every stride-th frame plus the last frame, so interpolation covers the episode end.
                strided = {i for i in all_ep_indices if (i - ep_start) % self.stride == 0}
                strided.add(ep_end - 1)
                compute_indices = sorted(strided)
            else:
                compute_indices = all_ep_indices

            frame_results = {}
            for qi in tqdm(compute_indices, desc=f" Ep {ep_idx}", leave=False):
                try:
                    sample = dataset[qi]
                    batch = {
                        image_key: sample[image_key],
                        "task": task,
                        "index": qi,
                        "episode_index": ep_idx,
                    }
                    if state_key in sample:
                        batch[state_key] = sample[state_key]

                    with torch.no_grad():
                        processed = preprocess(batch)
                        vf = processed["video_features"].to(self.device)
                        tf = processed["text_features"].to(self.device)
                        sf = processed.get("state_features")
                        if sf is not None:
                            sf = sf.to(self.device)
                        lengths = processed.get("lengths")

                        sparse_val = head_progress("sparse", tf, vf, sf, lengths) if compute_sparse else np.nan
                        dense_val = head_progress("dense", tf, vf, sf, lengths) if compute_dense else np.nan

                    frame_results[qi] = (sparse_val, dense_val)
                except Exception as e:
                    # Best effort: a failing frame is skipped so the rest of the shard is still produced.
                    logging.warning(f"Failed frame {qi}: {e}")

            if not frame_results:
                logging.warning(f"Episode {ep_idx}: all frames failed, skipping")
                continue

            # Interpolate to all frames in this episode
            computed_idx = np.array(sorted(frame_results.keys()))
            all_frame_arr = np.arange(ep_start, ep_end)
            sparse_vals = np.array([frame_results[i][0] for i in computed_idx]) if compute_sparse else None
            dense_vals = np.array([frame_results[i][1] for i in computed_idx]) if compute_dense else None

            if self.stride > 1 and len(computed_idx) > 1:
                if compute_sparse:
                    sparse_vals = interpolate_progress(computed_idx, sparse_vals, all_frame_arr)
                if compute_dense:
                    dense_vals = interpolate_progress(computed_idx, dense_vals, all_frame_arr)
                output_frames = all_frame_arr
            else:
                # Use only successfully computed frames to avoid indexing mismatch on failures
                output_frames = computed_idx

            for i, fi in enumerate(output_frames):
                row = {"index": int(fi), "episode_index": ep_idx, "frame_index": int(fi - ep_start)}
                if compute_sparse:
                    row["progress_sparse"] = float(sparse_vals[i])
                if compute_dense:
                    row["progress_dense"] = float(dense_vals[i])
                all_rows.append(row)

        if not all_rows:
            # Nothing to write; make the missing shard visible in the logs.
            logging.warning(f"Rank {rank}: no rows computed, no shard written")
            return

        import pandas as pd

        df = pd.DataFrame(all_rows).sort_values("index").reset_index(drop=True)
        table = pa.Table.from_pandas(df, preserve_index=False)
        table = table.replace_schema_metadata({b"reward_model_path": self.reward_model_path.encode()})

        shard_dir = Path(self.shard_dir)
        shard_dir.mkdir(parents=True, exist_ok=True)
        out = shard_dir / f"shard_{rank:05d}.parquet"
        pq.write_table(table, out)
        logging.info(f"Rank {rank}: saved {len(df)} rows to {out}")
class AggregateProgress(PipelineStep):
    """Merge all shard parquets into final sarm_progress.parquet.

    Args:
        repo_id: Dataset identifier; the merged file is written into that dataset's
            local root and optionally uploaded to the same dataset repo.
        reward_model_path: Reward model location, stored in the parquet schema metadata.
        shard_dir: Directory that contains the ``shard_*.parquet`` files.
        push_to_hub: When true, upload the merged parquet to the Hugging Face Hub.
    """

    def __init__(self, repo_id, reward_model_path, shard_dir="rabc_shards", push_to_hub=False):
        super().__init__()
        self.repo_id = repo_id
        self.reward_model_path = reward_model_path
        self.shard_dir = shard_dir
        self.push_to_hub = push_to_hub

    def run(self, data=None, rank: int = 0, world_size: int = 1):
        """Concatenate every shard, write ``sarm_progress.parquet`` and optionally upload it.

        Only rank 0 does any work; other ranks return immediately.

        Args:
            data: Unused; present to match the pipeline step call signature.
            rank: Index of this worker.
            world_size: Total number of workers (unused).

        Raises:
            FileNotFoundError: If ``shard_dir`` contains no ``shard_*.parquet`` file.
        """
        import datetime
        import logging
        import os
        from pathlib import Path

        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq

        from lerobot.datasets.lerobot_dataset import LeRobotDataset
        from lerobot.utils.utils import init_logging

        init_logging()

        if rank != 0:
            return

        shard_dir = Path(self.shard_dir)
        shards = sorted(shard_dir.glob("shard_*.parquet"))
        if not shards:
            raise FileNotFoundError(f"No shards found in {shard_dir}")

        # Log shard modification time range to help detect stale files
        mtimes = [os.path.getmtime(s) for s in shards]
        oldest = datetime.datetime.fromtimestamp(min(mtimes)).isoformat(timespec="seconds")
        newest = datetime.datetime.fromtimestamp(max(mtimes)).isoformat(timespec="seconds")
        logging.info(f"Aggregating {len(shards)} shards (oldest: {oldest}, newest: {newest})")

        # Each shard records the reward model it was computed with. Warn (do not fail)
        # when a shard was produced by a different model than the one requested here,
        # which indicates leftovers from an earlier run in the same shard directory.
        for shard in shards:
            shard_meta = pq.read_schema(shard).metadata or {}
            recorded = shard_meta.get(b"reward_model_path")
            if recorded is not None and recorded.decode(errors="replace") != self.reward_model_path:
                logging.warning(
                    f"Shard {shard} was computed with reward model "
                    f"{recorded.decode(errors='replace')!r}, expected {self.reward_model_path!r}"
                )

        df = pd.concat([pd.read_parquet(s) for s in shards], ignore_index=True)
        df = df.sort_values("index").reset_index(drop=True)

        table = pa.Table.from_pandas(df, preserve_index=False)
        table = table.replace_schema_metadata({b"reward_model_path": self.reward_model_path.encode()})

        # The dataset object is only instantiated to resolve its local root directory.
        temp_ds = LeRobotDataset(self.repo_id, download_videos=False)
        out_path = Path(temp_ds.root) / "sarm_progress.parquet"
        out_path.parent.mkdir(parents=True, exist_ok=True)
        pq.write_table(table, out_path)
        logging.info(f"Saved {len(df)} rows to {out_path}")

        # Summary statistics per progress column, ignoring missing values.
        for col in ["progress_sparse", "progress_dense"]:
            if col in df.columns:
                v = df[col].dropna()
                logging.info(
                    f"{col}: mean={v.mean():.4f} std={v.std():.4f} min={v.min():.4f} max={v.max():.4f}"
                )

        if self.push_to_hub:
            from huggingface_hub import HfApi

            api = HfApi()
            hub_path = "sarm_progress.parquet"
            logging.info(f"Uploading to {self.repo_id}/{hub_path}")
            api.upload_file(
                path_or_fileobj=str(out_path),
                path_in_repo=hub_path,
                repo_id=self.repo_id,
                repo_type="dataset",
            )
            logging.info(f"Uploaded: https://huggingface.co/datasets/{self.repo_id}/blob/main/{hub_path}")
def make_compute_executor(
    repo_id,
    reward_model_path,
    stride,
    head_mode,
    device,
    shard_dir,
    logs_dir,
    job_name,
    slurm,
    workers,
    partition,
    cpus_per_task,
    mem_per_cpu,
):
    """Build the executor that runs the ``ComputeProgressShards`` step.

    With ``slurm`` truthy a ``SlurmPipelineExecutor`` with ``workers`` tasks and a
    24 hour time limit is returned; otherwise a ``LocalPipelineExecutor`` that
    processes the same ``workers`` tasks with a single local worker.
    Logs go to ``logs_dir / job_name``.
    """
    compute_step = ComputeProgressShards(
        repo_id, reward_model_path, stride, head_mode, device, str(shard_dir)
    )
    common = {
        "pipeline": [compute_step],
        "logging_dir": str(logs_dir / job_name),
    }

    if not slurm:
        # Local run: same number of tasks, executed one after another.
        return LocalPipelineExecutor(**common, tasks=workers, workers=1)

    return SlurmPipelineExecutor(
        **common,
        job_name=job_name,
        tasks=workers,
        workers=workers,
        time="24:00:00",
        partition=partition,
        cpus_per_task=cpus_per_task,
        sbatch_args={"mem-per-cpu": mem_per_cpu},
    )
def make_aggregate_executor(
    repo_id,
    reward_model_path,
    shard_dir,
    logs_dir,
    job_name,
    slurm,
    partition,
    cpus_per_task,
    mem_per_cpu,
    push_to_hub,
):
    """Build the single-task executor that runs the ``AggregateProgress`` step.

    With ``slurm`` truthy a ``SlurmPipelineExecutor`` with a 2 hour time limit is
    returned; otherwise a ``LocalPipelineExecutor``. Logs go to ``logs_dir / job_name``.
    """
    aggregate_step = AggregateProgress(repo_id, reward_model_path, str(shard_dir), push_to_hub)
    common = {
        "pipeline": [aggregate_step],
        "logging_dir": str(logs_dir / job_name),
    }

    if not slurm:
        return LocalPipelineExecutor(**common, tasks=1, workers=1)

    return SlurmPipelineExecutor(
        **common,
        job_name=job_name,
        tasks=1,
        workers=1,
        time="02:00:00",
        partition=partition,
        cpus_per_task=cpus_per_task,
        sbatch_args={"mem-per-cpu": mem_per_cpu},
    )
def _add_shared_args(p):
p.add_argument(
"--repo-id",
type=str,
required=True,
help="Hugging Face repository identifier, e.g. 'user/dataset'.",
)
p.add_argument(
"--shard-dir",
type=Path,
default=Path("rabc_shards"),
help="Directory to read/write per-rank parquet shards.",
)
p.add_argument(
"--logs-dir",
type=Path,
default=Path("logs"),
help="Directory for datatrove logs.",
)
p.add_argument(
"--job-name",
type=str,
default=None,
help="SLURM job name (defaults to rabc_<subcommand>).",
)
p.add_argument(
"--slurm",
type=int,
default=1,
help="1 = submit via SLURM; 0 = run locally (useful for debugging).",
)
p.add_argument(
"--partition",
type=str,
default=None,
help="SLURM partition to submit to.",
)
p.add_argument(
"--cpus-per-task",
type=int,
default=4,
help="Number of CPUs per SLURM task.",
)
p.add_argument(
"--mem-per-cpu",
type=str,
default="4G",
help="Memory per CPU, e.g. '4G' or '1950M'.",
)
def main():
    """Parse the command line and run the ``compute`` or ``aggregate`` executor.

    Invalid combinations are reported through ``parser.error`` before any executor
    is built: a missing ``--partition`` when submitting via SLURM, and a
    ``--stride`` below 1 for the ``compute`` subcommand.
    """
    parser = argparse.ArgumentParser(
        description="SLURM-distributed SARM RA-BC annotation pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # compute subcommand
    cp = sub.add_parser(
        "compute",
        help="Distribute progress computation across SLURM workers.",
    )
    _add_shared_args(cp)
    cp.add_argument(
        "--reward-model-path",
        type=str,
        required=True,
        help="Path or HF repo id of the SARM reward model.",
    )
    cp.add_argument(
        "--stride",
        type=int,
        default=1,
        help="Compute every Nth frame; intermediate frames are interpolated (must be >= 1).",
    )
    cp.add_argument(
        "--head-mode",
        type=str,
        default="sparse",
        choices=["sparse", "dense", "both"],
        help="Which reward head(s) to compute.",
    )
    cp.add_argument(
        "--device",
        type=str,
        default="cpu",
        help="Device for reward model inference, e.g. 'cpu' or 'cuda'.",
    )
    cp.add_argument(
        "--workers",
        type=int,
        default=50,
        help="Number of parallel SLURM tasks (one shard per worker).",
    )

    # aggregate subcommand
    ap = sub.add_parser(
        "aggregate",
        help="Merge per-rank shards into a single sarm_progress.parquet.",
    )
    _add_shared_args(ap)
    ap.add_argument(
        "--reward-model-path",
        type=str,
        required=True,
        help="Path or HF repo id of the SARM reward model (stored in parquet metadata).",
    )
    ap.add_argument(
        "--push-to-hub",
        action="store_true",
        help="Upload sarm_progress.parquet to the Hugging Face Hub after aggregation.",
    )

    args = parser.parse_args()
    use_slurm = args.slurm == 1

    # --partition defaults to None; forwarding None to the SLURM executor would only
    # fail later at submission time, so report it here with a usage message.
    if use_slurm and args.partition is None:
        parser.error("--partition is required when submitting via SLURM (--slurm 1)")
    if args.command == "compute" and args.stride < 1:
        parser.error(f"--stride must be >= 1, got {args.stride}")

    # Work on a copy so the parsed namespace itself is left untouched.
    kwargs = dict(vars(args))
    command = kwargs.pop("command")
    kwargs["slurm"] = use_slurm
    kwargs["job_name"] = args.job_name or f"rabc_{command}"

    # The remaining keys match the keyword parameters of the selected factory.
    make_executor = make_compute_executor if command == "compute" else make_aggregate_executor
    executor = make_executor(**kwargs)
    executor.run()
if __name__ == "__main__":
main()
+10 -10
View File
@@ -27,8 +27,8 @@ measuring consistency and ground truth alignment.
Usage:
# Basic usage with smolvla policy
uv run python examples/rtc/eval_dataset.py \
--policy.path=<USER>/smolvla_check_rtc_last3 \
--dataset.repo_id=<USER>/check_rtc \
--policy.path=helper2424/smolvla_check_rtc_last3 \
--dataset.repo_id=helper2424/check_rtc \
--rtc.execution_horizon=8 \
--device=mps \
--rtc.max_guidance_weight=10.0 \
@@ -58,16 +58,16 @@ Usage:
--device=cuda
uv run python examples/rtc/eval_dataset.py \
--policy.path=<USER>/reuben_pi0 \
--dataset.repo_id=<USER>/so101_cube_in_cup \
--policy.path=lipsop/reuben_pi0 \
--dataset.repo_id=ReubenLim/so101_cube_in_cup \
--rtc.execution_horizon=8 \
--device=cuda
# With torch.compile for faster inference (PyTorch 2.0+)
# Note: CUDA graphs disabled by default due to in-place ops in denoising loop
uv run python examples/rtc/eval_dataset.py \
--policy.path=<USER>/smolvla_check_rtc_last3 \
--dataset.repo_id=<USER>/check_rtc \
--policy.path=helper2424/smolvla_check_rtc_last3 \
--dataset.repo_id=helper2424/check_rtc \
--rtc.execution_horizon=8 \
--device=mps \
--use_torch_compile=true \
@@ -75,8 +75,8 @@ Usage:
# With torch.compile on CUDA (CUDA graphs disabled by default)
uv run python examples/rtc/eval_dataset.py \
--policy.path=<USER>/smolvla_check_rtc_last3 \
--dataset.repo_id=<USER>/check_rtc \
--policy.path=helper2424/smolvla_check_rtc_last3 \
--dataset.repo_id=helper2424/check_rtc \
--rtc.execution_horizon=8 \
--device=cuda \
--use_torch_compile=true \
@@ -84,8 +84,8 @@ Usage:
# Enable CUDA graphs (advanced - may cause tensor aliasing errors)
uv run python examples/rtc/eval_dataset.py \
--policy.path=<USER>/smolvla_check_rtc_last3 \
--dataset.repo_id=<USER>/check_rtc \
--policy.path=helper2424/smolvla_check_rtc_last3 \
--dataset.repo_id=helper2424/check_rtc \
--use_torch_compile=true \
--torch_compile_backend=inductor \
--torch_compile_mode=max-autotune \
+3 -5
View File
@@ -28,7 +28,7 @@ For simulation environments, see eval_with_simulation.py
Usage:
# Run RTC with Real robot with RTC
uv run examples/rtc/eval_with_real_robot.py \
--policy.path=<USER>/smolvla_check_rtc_last3 \
--policy.path=helper2424/smolvla_check_rtc_last3 \
--policy.device=mps \
--rtc.enabled=true \
--rtc.execution_horizon=20 \
@@ -41,7 +41,7 @@ Usage:
# Run RTC with Real robot without RTC
uv run examples/rtc/eval_with_real_robot.py \
--policy.path=<USER>/smolvla_check_rtc_last3 \
--policy.path=helper2424/smolvla_check_rtc_last3 \
--policy.device=mps \
--rtc.enabled=false \
--robot.type=so100_follower \
@@ -53,7 +53,7 @@ Usage:
# Run RTC with Real robot with pi0.5 policy
uv run examples/rtc/eval_with_real_robot.py \
--policy.path=<USER>/pi05_check_rtc \
--policy.path=helper2424/pi05_check_rtc \
--policy.device=mps \
--rtc.enabled=true \
--rtc.execution_horizon=20 \
@@ -78,7 +78,6 @@ from torch import Tensor
from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig # noqa: F401
from lerobot.cameras.realsense.configuration_realsense import RealSenseCameraConfig # noqa: F401
from lerobot.cameras.zmq.configuration_zmq import ZMQCameraConfig # noqa: F401
from lerobot.configs import parser
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import RTCAttentionSchedule
@@ -98,7 +97,6 @@ from lerobot.robots import ( # noqa: F401
bi_so_follower,
koch_follower,
so_follower,
unitree_g1,
)
from lerobot.robots.utils import make_robot_from_config
from lerobot.utils.constants import OBS_IMAGES
@@ -14,20 +14,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import time
from collections import deque
import numpy as np
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from lerobot.robots.unitree_g1.g1_utils import (
REMOTE_AXES,
REMOTE_BUTTONS,
G1_29_JointIndex,
get_gravity_orientation,
)
from lerobot.robots.unitree_g1.config_unitree_g1 import UnitreeG1Config
from lerobot.robots.unitree_g1.g1_utils import G1_29_JointIndex
from lerobot.robots.unitree_g1.unitree_g1 import UnitreeG1
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -36,13 +36,18 @@ GROOT_DEFAULT_ANGLES[[0, 6]] = -0.1 # Hip pitch
GROOT_DEFAULT_ANGLES[[3, 9]] = 0.3 # Knee
GROOT_DEFAULT_ANGLES[[4, 10]] = -0.2 # Ankle pitch
MISSING_JOINTS = []
G1_MODEL = "g1_23" # Or "g1_29"
if G1_MODEL == "g1_23":
MISSING_JOINTS = [12, 14, 20, 21, 27, 28] # Waist yaw/pitch, wrist pitch/yaw
# Control parameters
ACTION_SCALE = 0.25
CONTROL_DT = 0.02 # 50Hz
ANG_VEL_SCALE: float = 0.25
DOF_POS_SCALE: float = 1.0
DOF_VEL_SCALE: float = 0.05
CMD_SCALE: list[float] = [2.0, 2.0, 0.25]
CMD_SCALE: list = [2.0, 2.0, 0.25]
DEFAULT_GROOT_REPO_ID = "nepyope/GR00T-WholeBodyControl_g1"
@@ -80,11 +85,11 @@ def load_groot_policies(
class GrootLocomotionController:
"""GR00T lower-body locomotion controller for the Unitree G1."""
control_dt = CONTROL_DT # Expose for unitree_g1.py
def __init__(self):
# Load policies
self.policy_balance, self.policy_walk = load_groot_policies()
def __init__(self, policy_balance, policy_walk, robot, config):
self.policy_balance = policy_balance
self.policy_walk = policy_walk
self.robot = robot
self.config = config
self.cmd = np.array([0.0, 0.0, 0.0], dtype=np.float32) # vx, vy, theta_dot
@@ -104,60 +109,45 @@ class GrootLocomotionController:
logger.info("GrootLocomotionController initialized")
def reset(self) -> None:
"""Reset internal state for a new episode."""
self.cmd[:] = 0.0
self.groot_qj_all[:] = 0.0
self.groot_dqj_all[:] = 0.0
self.groot_action[:] = 0.0
self.groot_obs_single[:] = 0.0
self.groot_obs_stacked[:] = 0.0
self.groot_height_cmd = 0.74
self.groot_orientation_cmd[:] = 0.0
self.groot_obs_history.clear()
for _ in range(6):
self.groot_obs_history.append(np.zeros(86, dtype=np.float32))
def run_step(self):
# Get current observation
obs = self.robot.get_observation()
def run_step(self, action: dict, lowstate) -> dict:
"""Run one step of the locomotion controller.
if not obs:
return
Args:
action: Action dict containing remote.lx/ly/rx/ry and buttons
lowstate: Robot lowstate containing motor positions/velocities and IMU
Returns:
Action dict for lower body joints (0-14)
"""
if lowstate is None:
return {}
buttons = [int(action.get(k, 0)) for k in REMOTE_BUTTONS]
if buttons[0]: # R1 - raise waist
# Get command from remote controller
if obs["remote.buttons"][0]: # R1 - raise waist
self.groot_height_cmd += 0.001
self.groot_height_cmd = np.clip(self.groot_height_cmd, 0.50, 1.00)
if buttons[4]: # R2 - lower waist
if obs["remote.buttons"][4]: # R2 - lower waist
self.groot_height_cmd -= 0.001
self.groot_height_cmd = np.clip(self.groot_height_cmd, 0.50, 1.00)
lx, ly, rx, _ry = (action.get(k, 0.0) for k in REMOTE_AXES)
self.cmd[0] = ly # Forward/backward
self.cmd[1] = -lx # Left/right (negated)
self.cmd[2] = -rx # Rotation rate (negated)
self.cmd[0] = obs["remote.ly"] # Forward/backward
self.cmd[1] = obs["remote.lx"] * -1 # Left/right
self.cmd[2] = obs["remote.rx"] * -1 # Rotation rate
# Get joint positions and velocities from lowstate
# Get joint positions and velocities from flat dict
for motor in G1_29_JointIndex:
name = motor.name
idx = motor.value
self.groot_qj_all[idx] = lowstate.motor_state[idx].q
self.groot_dqj_all[idx] = lowstate.motor_state[idx].dq
self.groot_qj_all[idx] = obs[f"{name}.q"]
self.groot_dqj_all[idx] = obs[f"{name}.dq"]
# Adapt observation for g1_23dof
for idx in MISSING_JOINTS:
self.groot_qj_all[idx] = 0.0
self.groot_dqj_all[idx] = 0.0
# Scale joint positions and velocities
qj_obs = self.groot_qj_all.copy()
dqj_obs = self.groot_dqj_all.copy()
# Express IMU data in gravity frame of reference
quat = lowstate.imu_state.quaternion
ang_vel = np.array(lowstate.imu_state.gyroscope, dtype=np.float32)
gravity_orientation = get_gravity_orientation(quat)
quat = [obs["imu.quat.w"], obs["imu.quat.x"], obs["imu.quat.y"], obs["imu.quat.z"]]
ang_vel = np.array([obs["imu.gyro.x"], obs["imu.gyro.y"], obs["imu.gyro.z"]], dtype=np.float32)
gravity_orientation = self.robot.get_gravity_orientation(quat)
# Scale joint positions and velocities before policy inference
qj_obs = (qj_obs - GROOT_DEFAULT_ANGLES) * DOF_POS_SCALE
@@ -196,10 +186,73 @@ class GrootLocomotionController:
# Transform action back to target joint positions
target_dof_pos_15 = GROOT_DEFAULT_ANGLES[:15] + self.groot_action * ACTION_SCALE
# Build action dict
# Build action dict (only first 15 joints for GR00T)
action_dict = {}
for i in range(15):
motor_name = G1_29_JointIndex(i).name
action_dict[f"{motor_name}.q"] = float(target_dof_pos_15[i])
return action_dict
# Zero out missing joints for g1_23dof
for joint_idx in MISSING_JOINTS:
motor_name = G1_29_JointIndex(joint_idx).name
action_dict[f"{motor_name}.q"] = 0.0
# Send action to robot
self.robot.send_action(action_dict)
def run(repo_id: str = DEFAULT_GROOT_REPO_ID) -> None:
    """Main function to run the GR00T locomotion controller.

    Loads the balance and walk policies, connects to the robot and calls the
    controller's ``run_step`` in a loop paced at ``CONTROL_DT`` until the robot's
    shutdown event is set or Ctrl+C is pressed. The robot is disconnected on exit
    if it is connected.

    Args:
        repo_id: Hugging Face Hub repository ID for GR00T policies.
    """
    # Load policies
    policy_balance, policy_walk = load_groot_policies(repo_id=repo_id)

    # Initialize robot
    config = UnitreeG1Config()
    robot = UnitreeG1(config)

    try:
        # Connecting and building the controller happen inside the try block so a
        # failure in either step still reaches the disconnect in `finally`.
        robot.connect()

        # Initialize gr00T locomotion controller
        groot_controller = GrootLocomotionController(
            policy_balance=policy_balance,
            policy_walk=policy_walk,
            robot=robot,
            config=config,
        )

        robot.reset(CONTROL_DT, GROOT_DEFAULT_ANGLES)

        logger.info("Use joystick: LY=fwd/back, LX=left/right, RX=rotate, R1=raise waist, R2=lower waist")
        logger.info("Press Ctrl+C to stop")

        # Run step
        while not robot._shutdown_event.is_set():
            # perf_counter is monotonic, so system clock adjustments cannot distort the pacing.
            start_time = time.perf_counter()
            groot_controller.run_step()
            elapsed = time.perf_counter() - start_time
            # Sleep for whatever is left of the control period (never a negative duration).
            time.sleep(max(0.0, CONTROL_DT - elapsed))
    except KeyboardInterrupt:
        logger.info("Stopping locomotion...")
    finally:
        if robot.is_connected:
            robot.disconnect()
    logger.info("Done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GR00T Locomotion Controller for Unitree G1")
parser.add_argument(
"--repo-id",
type=str,
default=DEFAULT_GROOT_REPO_ID,
help=f"Hugging Face Hub repo ID for GR00T policies (default: {DEFAULT_GROOT_REPO_ID})",
)
args = parser.parse_args()
run(repo_id=args.repo_id)
@@ -14,21 +14,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import logging
import time
import numpy as np
import onnx
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from lerobot.robots.unitree_g1.g1_utils import (
REMOTE_AXES,
G1_29_JointArmIndex,
G1_29_JointIndex,
get_gravity_orientation,
)
from lerobot.robots.unitree_g1.config_unitree_g1 import UnitreeG1Config
from lerobot.robots.unitree_g1.g1_utils import G1_29_JointIndex
from lerobot.robots.unitree_g1.unitree_g1 import UnitreeG1
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
DEFAULT_ANGLES = np.zeros(29, dtype=np.float32)
@@ -40,13 +40,18 @@ DEFAULT_ANGLES[16] = 0.2 # Left shoulder roll
DEFAULT_ANGLES[23] = -0.2 # Right shoulder roll
DEFAULT_ANGLES[[18, 25]] = 0.6 # Elbow
MISSING_JOINTS = []
G1_MODEL = "g1_23" # Or "g1_29"
if G1_MODEL == "g1_23":
MISSING_JOINTS = [12, 14, 20, 21, 27, 28] # Waist yaw/pitch, wrist pitch/yaw
# Control parameters
ACTION_SCALE = 0.25
CONTROL_DT = 0.005 # 200Hz
CONTROL_DT = 0.02 # 50Hz
ANG_VEL_SCALE = 0.25
DOF_POS_SCALE = 1.0
DOF_VEL_SCALE = 0.05
GAIT_PERIOD = 0.5
GAIT_PERIOD = 1.0
DEFAULT_HOLOSOMA_REPO_ID = "nepyope/holosoma_locomotion"
@@ -82,7 +87,7 @@ def load_policy(
logger.info(f"Policy loaded: {policy.get_inputs()[0].shape}{policy.get_outputs()[0].shape}")
# Extract KP/KD from ONNX metadata
model = onnx.load(policy_path, load_external_data=False)
model = onnx.load(policy_path)
metadata = {prop.key: prop.value for prop in model.metadata_props}
if "kp" not in metadata or "kd" not in metadata:
@@ -96,13 +101,15 @@ def load_policy(
class HolosomaLocomotionController:
"""Holosoma lower-body locomotion controller for Unitree G1."""
"""Holosoma whole-body locomotion controller for Unitree G1."""
control_dt = CONTROL_DT # Expose for unitree_g1.py
def __init__(self, policy, robot, kp: np.ndarray, kd: np.ndarray):
self.policy = policy
self.robot = robot
def __init__(self):
# Load policy and gains
self.policy, self.kp, self.kd = load_policy()
# Override robot's PD gains with policy gains
self.robot.kp = kp
self.robot.kd = kd
self.cmd = np.zeros(3, dtype=np.float32)
@@ -117,55 +124,35 @@ class HolosomaLocomotionController:
self.phase_dt = 2 * np.pi / ((1.0 / CONTROL_DT) * GAIT_PERIOD)
self.is_standing = True
logger.info("HolosomaLocomotionController initialized")
def run_step(self):
# Get current observation
obs = self.robot.get_observation()
def reset(self) -> None:
"""Reset internal state for a new episode."""
self.cmd[:] = 0.0
self.qj[:] = 0.0
self.dqj[:] = 0.0
self.obs[:] = 0.0
self.last_action[:] = 0.0
self.phase = np.array([[0.0, np.pi]], dtype=np.float32)
self.is_standing = True
if not obs:
return
def run_step(self, action: dict, lowstate) -> dict:
"""Run one step of the locomotion controller.
Args:
action: Action dict containing remote.lx/ly/rx/ry
lowstate: Robot lowstate containing motor positions/velocities and IMU
Returns:
Action dict for lower body joints (0-14)
"""
if lowstate is None:
return {}
lx, ly, rx, _ry = (action.get(k, 0.0) for k in REMOTE_AXES)
ly = ly if abs(ly) > 0.1 else 0.0
lx = lx if abs(lx) > 0.1 else 0.0
rx = rx if abs(rx) > 0.1 else 0.0
ly = np.clip(ly, -0.3, 0.3)
lx = np.clip(lx, -0.3, 0.3)
# Get command from remote controller
ly = obs["remote.ly"] if abs(obs["remote.ly"]) > 0.1 else 0.0
lx = obs["remote.lx"] if abs(obs["remote.lx"]) > 0.1 else 0.0
rx = obs["remote.rx"] if abs(obs["remote.rx"]) > 0.1 else 0.0
self.cmd[:] = [ly, -lx, -rx]
# Get joint positions and velocities from lowstate
# Get joint positions and velocities
for motor in G1_29_JointIndex:
name = motor.name
idx = motor.value
self.qj[idx] = lowstate.motor_state[idx].q
self.dqj[idx] = lowstate.motor_state[idx].dq
self.qj[idx] = obs[f"{name}.q"]
self.dqj[idx] = obs[f"{name}.dq"]
# Hide arm positions from policy (show DEFAULT_ANGLES instead)
# This prevents policy from reacting to teleop arm movements
for arm_joint in G1_29_JointArmIndex:
self.qj[arm_joint.value] = DEFAULT_ANGLES[arm_joint.value]
self.dqj[arm_joint.value] = 0.0
# Adapt observation for g1_23dof
for idx in MISSING_JOINTS:
self.qj[idx] = 0.0
self.dqj[idx] = 0.0
# Express IMU data in gravity frame of reference
quat = lowstate.imu_state.quaternion
ang_vel = np.array(lowstate.imu_state.gyroscope, dtype=np.float32)
gravity = get_gravity_orientation(quat)
quat = [obs["imu.quat.w"], obs["imu.quat.x"], obs["imu.quat.y"], obs["imu.quat.z"]]
ang_vel = np.array([obs["imu.gyro.x"], obs["imu.gyro.y"], obs["imu.gyro.z"]], dtype=np.float32)
gravity = self.robot.get_gravity_orientation(quat)
# Scale joint positions and velocities before policy inference
qj_obs = (self.qj - DEFAULT_ANGLES) * DOF_POS_SCALE
@@ -199,16 +186,79 @@ class HolosomaLocomotionController:
# Run policy inference
ort_in = {self.policy.get_inputs()[0].name: self.obs.reshape(1, -1).astype(np.float32)}
raw_action = self.policy.run(None, ort_in)[0].squeeze()
policy_action = np.clip(raw_action, -100.0, 100.0)
self.last_action = policy_action.copy()
action = np.clip(raw_action, -100.0, 100.0)
self.last_action = action.copy()
# Transform action back to target joint positions
target = DEFAULT_ANGLES + policy_action * ACTION_SCALE
target = DEFAULT_ANGLES + action * ACTION_SCALE
# Build action dict (first 15 joints only)
# Build action dict
action_dict = {}
for i in range(15):
motor_name = G1_29_JointIndex(i).name
action_dict[f"{motor_name}.q"] = float(target[i])
for motor in G1_29_JointIndex:
action_dict[f"{motor.name}.q"] = float(target[motor.value])
return action_dict
# Zero out missing joints for g1_23dof
for joint_idx in MISSING_JOINTS:
motor_name = G1_29_JointIndex(joint_idx).name
action_dict[f"{motor_name}.q"] = 0.0
# Send action to robot
self.robot.send_action(action_dict)
def run(repo_id: str = DEFAULT_HOLOSOMA_REPO_ID, policy_type: str = "fastsac") -> None:
    """Main function to run the Holosoma locomotion controller.

    Loads the policy and its gains, connects to the robot and calls the
    controller's ``run_step`` in a loop paced at ``CONTROL_DT`` until the robot's
    shutdown event is set or Ctrl+C is pressed. The robot is disconnected on exit
    if it is connected.

    Args:
        repo_id: Hugging Face Hub repository ID for Holosoma policies.
        policy_type: Policy type to use ('fastsac' or 'ppo').
    """
    # Load policy and gains
    policy, kp, kd = load_policy(repo_id=repo_id, policy_type=policy_type)

    # Initialize robot
    config = UnitreeG1Config()
    robot = UnitreeG1(config)

    try:
        # Connecting and building the controller happen inside the try block so a
        # failure in either step still reaches the disconnect in `finally`.
        robot.connect()
        holosoma_controller = HolosomaLocomotionController(policy, robot, kp, kd)

        robot.reset(CONTROL_DT, DEFAULT_ANGLES)

        logger.info("Use joystick: LY=fwd/back, LX=left/right, RX=rotate")
        logger.info("Press Ctrl+C to stop")

        # Run step
        while not robot._shutdown_event.is_set():
            # perf_counter is monotonic, so system clock adjustments cannot distort the pacing.
            start_time = time.perf_counter()
            holosoma_controller.run_step()
            elapsed = time.perf_counter() - start_time
            # Sleep for whatever is left of the control period (never a negative duration).
            time.sleep(max(0.0, CONTROL_DT - elapsed))
    except KeyboardInterrupt:
        logger.info("Stopping locomotion...")
    finally:
        if robot.is_connected:
            robot.disconnect()
    logger.info("Done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Holosoma Locomotion Controller for Unitree G1")
parser.add_argument(
"--repo-id",
type=str,
default=DEFAULT_HOLOSOMA_REPO_ID,
help=f"Hugging Face Hub repo ID for Holosoma policies (default: {DEFAULT_HOLOSOMA_REPO_ID})",
)
parser.add_argument(
"--policy",
type=str,
choices=["fastsac", "ppo"],
default="fastsac",
help="Policy type to use: 'fastsac' (default) or 'ppo'",
)
args = parser.parse_args()
run(repo_id=args.repo_id, policy_type=args.policy)
+122 -69
View File
@@ -25,11 +25,11 @@ discord = "https://discord.gg/s3KuuzsPFb"
[project]
name = "lerobot"
version = "0.5.1"
version = "0.4.4"
description = "🤗 LeRobot: State-of-the-art Machine Learning for Real-World Robotics in Pytorch"
dynamic = ["readme"]
license = { text = "Apache-2.0" }
requires-python = ">=3.12"
requires-python = ">=3.10"
authors = [
{ name = "Rémi Cadène", email = "re.cadene@gmail.com" },
{ name = "Simon Alibert", email = "alibert.sim@gmail.com" },
@@ -50,8 +50,7 @@ classifiers = [
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.10",
"Topic :: Software Development :: Build Tools",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
@@ -60,30 +59,28 @@ keywords = ["lerobot", "huggingface", "robotics", "machine learning", "artifici
dependencies = [
# Hugging Face dependencies
"datasets>=4.0.0,<5.0.0",
"datasets>=4.0.0,<4.2.0",
"diffusers>=0.27.2,<0.36.0",
"huggingface-hub>=1.0.0,<2.0.0",
"huggingface-hub[hf-transfer,cli]>=0.34.2,<0.36.0",
"accelerate>=1.10.0,<2.0.0",
# Core dependencies
"numpy>=2.0.0,<2.3.0", # NOTE: Explicitly listing numpy helps the resolver converge faster. Upper bound imposed by opencv-python-headless.
"setuptools>=71.0.0,<81.0.0",
"cmake>=3.29.0.1,<4.2.0",
"packaging>=24.2,<26.0",
"torch>=2.2.1,<2.11.0",
"torchcodec>=0.2.1,<0.11.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')",
"torchvision>=0.21.0,<0.26.0",
"einops>=0.8.0,<0.9.0",
"opencv-python-headless>=4.9.0,<4.13.0",
"av>=15.0.0,<16.0.0",
"jsonlines>=4.0.0,<5.0.0",
"pynput>=1.7.8,<1.9.0",
"packaging>=24.2,<26.0",
"pynput>=1.7.7,<1.9.0",
"pyserial>=3.5,<4.0",
"wandb>=0.24.0,<0.25.0",
"draccus==0.10.0", # TODO: Relax version constraint
"torch>=2.2.1,<2.8.0", # TODO: Bump dependency
"torchcodec>=0.2.1,<0.6.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bump dependency
"torchvision>=0.21.0,<0.23.0", # TODO: Bump dependency
"draccus==0.10.0", # TODO: Remove ==
"gymnasium>=1.1.1,<2.0.0",
"rerun-sdk>=0.24.0,<0.27.0",
@@ -98,20 +95,14 @@ dependencies = [
# Common
pygame-dep = ["pygame>=2.5.1,<2.7.0"]
placo-dep = ["placo>=0.9.6,<0.9.17"]
transformers-dep = ["transformers>=5.3.0,<6.0.0"]
placo-dep = ["placo>=0.9.6,<0.10.0"]
transformers-dep = ["transformers>=4.57.1,<5.0.0"]
grpcio-dep = ["grpcio==1.73.1", "protobuf>=6.31.1,<6.32.0"]
can-dep = ["python-can>=4.2.0,<5.0.0"]
peft-dep = ["peft>=0.18.0,<1.0.0"]
scipy-dep = ["scipy>=1.14.0,<2.0.0"]
qwen-vl-utils-dep = ["qwen-vl-utils>=0.0.11,<0.1.0"]
matplotlib-dep = ["matplotlib>=3.10.3,<4.0.0", "contourpy>=1.3.0,<2.0.0"] # NOTE: Explicitly listing contourpy helps the resolver converge faster.
# Motors
feetech = ["feetech-servo-sdk>=1.0.0,<2.0.0"]
dynamixel = ["dynamixel-sdk>=3.7.31,<3.9.0"]
damiao = ["lerobot[can-dep]"]
robstride = ["lerobot[can-dep]"]
damiao = ["python-can>=4.2.0,<5.0.0"]
# Robots
openarms = ["lerobot[damiao]"]
@@ -119,36 +110,34 @@ gamepad = ["lerobot[pygame-dep]", "hidapi>=0.14.0,<0.15.0"]
hopejr = ["lerobot[feetech]", "lerobot[pygame-dep]"]
lekiwi = ["lerobot[feetech]", "pyzmq>=26.2.1,<28.0.0"]
unitree_g1 = [
"unitree-sdk2==1.0.1",
"pyzmq>=26.2.1,<28.0.0",
"onnxruntime>=1.16.0,<2.0.0",
"pin>=3.0.0,<4.0.0",
"meshcat>=0.3.0,<0.4.0",
"lerobot[matplotlib-dep]",
"lerobot[pygame-dep]",
"matplotlib>=3.9.0,<4.0.0",
"casadi>=3.6.0,<4.0.0",
]
reachy2 = ["reachy2_sdk>=1.0.15,<1.1.0"]
kinematics = ["lerobot[placo-dep]"]
intelrealsense = [
"pyrealsense2>=2.55.1.6486,<2.57.0 ; sys_platform != 'darwin'",
"pyrealsense2-macosx>=2.54,<2.57.0 ; sys_platform == 'darwin'",
"pyrealsense2-macosx>=2.54,<2.55.0 ; sys_platform == 'darwin'",
]
phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0", "fastapi<1.0", "lerobot[scipy-dep]"]
phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0", "fastapi<1.0"]
# Policies
wallx = [
"lerobot[transformers-dep]",
"lerobot[peft]",
"lerobot[scipy-dep]",
"torchdiffeq>=0.2.4,<0.3.0",
"lerobot[qwen-vl-utils-dep]",
"transformers==4.49.0",
"peft==0.17.1",
"scipy==1.15.3",
"torchdiffeq==0.2.5",
"qwen_vl_utils==0.0.11"
]
pi = ["lerobot[transformers-dep]", "lerobot[scipy-dep]"]
pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi", "scipy>=1.10.1,<1.15"]
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"]
groot = [
"lerobot[transformers-dep]",
"lerobot[peft]",
"peft>=0.13.0,<1.0.0",
"dm-tree>=0.1.8,<1.0.0",
"timm>=1.0.0,<1.1.0",
"safetensors>=0.4.3,<1.0.0",
@@ -157,13 +146,13 @@ groot = [
"ninja>=1.11.1,<2.0.0",
"flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
]
sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
sarm = ["lerobot[transformers-dep]", "faker>=33.0.0,<35.0.0", "matplotlib>=3.10.3,<4.0.0", "qwen-vl-utils>=0.0.14,<0.1.0"]
xvla = ["lerobot[transformers-dep]"]
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
# Features
async = ["lerobot[grpcio-dep]", "lerobot[matplotlib-dep]"]
peft = ["lerobot[transformers-dep]", "lerobot[peft-dep]"]
async = ["lerobot[grpcio-dep]", "matplotlib>=3.10.3,<4.0.0"]
peft = ["lerobot[transformers-dep]", "peft>=0.18.0,<1.0.0"]
# Development
dev = ["pre-commit>=3.7.0,<5.0.0", "debugpy>=1.8.1,<1.9.0", "lerobot[grpcio-dep]", "grpcio-tools==1.73.1", "mypy>=1.19.1"]
@@ -171,27 +160,13 @@ test = ["pytest>=8.1.0,<9.0.0", "pytest-timeout>=2.4.0,<3.0.0", "pytest-cov>=5.0
video_benchmark = ["scikit-image>=0.23.2,<0.26.0", "pandas>=2.2.2,<2.4.0"]
# Simulation
# NOTE: Explicitly listing scipy helps flatten the dependency tree.
aloha = ["gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
aloha = ["gym-aloha>=0.1.2,<0.2.0"]
pusht = ["gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
libero = ["lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
libero_plus = [
"lerobot[transformers-dep]",
"libero @ git+https://github.com/sylvestf/LIBERO-plus.git@main ; sys_platform == 'linux'",
"lerobot[scipy-dep]",
]
robomme = [
"robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main ; sys_platform == 'linux'",
]
metaworld = ["metaworld==3.0.0", "lerobot[scipy-dep]"]
libero = ["lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0"]
metaworld = ["metaworld==3.0.0"]
# All
all = [
# NOTE(resolver hint): scipy is pulled in transitively via lerobot[scipy-dep] through
# multiple extras (aloha, metaworld, pi, wallx, phone). Listing it explicitly
# helps pip's resolver converge by constraining scipy early, before it encounters
# the loose scipy requirements from transitive deps like dm-control and metaworld.
"scipy>=1.14.0,<2.0.0",
"lerobot[dynamixel]",
"lerobot[gamepad]",
"lerobot[hopejr]",
@@ -199,8 +174,8 @@ all = [
"lerobot[reachy2]",
"lerobot[kinematics]",
"lerobot[intelrealsense]",
"lerobot[wallx]",
"lerobot[pi]",
# "lerobot[wallx]",
# "lerobot[pi]", TODO(Pepijn): Update pi to transformers v5
"lerobot[smolvla]",
# "lerobot[groot]", TODO(Steven): Gr00t requires specific installation instructions for flash-attn
"lerobot[xvla]",
@@ -212,11 +187,10 @@ all = [
"lerobot[aloha]",
"lerobot[pusht]",
"lerobot[phone]",
"lerobot[libero]; sys_platform == 'linux'",
"lerobot[libero]",
"lerobot[metaworld]",
"lerobot[sarm]",
"lerobot[peft]",
# "lerobot[unitree_g1]", TODO: Unitree requires specific installation instructions for unitree_sdk2
]
[project.scripts]
@@ -238,14 +212,11 @@ lerobot-edit-dataset="lerobot.scripts.lerobot_edit_dataset:main"
lerobot-setup-can="lerobot.scripts.lerobot_setup_can:main"
# ---------------- Tool Configurations ----------------
[tool.setuptools.package-data]
lerobot = ["envs/*.json"]
[tool.setuptools.packages.find]
where = ["src"]
[tool.ruff]
target-version = "py312"
target-version = "py310"
line-length = 110
exclude = ["tests/artifacts/**/*.safetensors", "*_pb2.py", "*_pb2_grpc.py"]
@@ -337,7 +308,7 @@ default.extend-ignore-identifiers-re = [
# Uncomment [tool.mypy] first, then uncomment individual module overrides as they get proper type annotations
[tool.mypy]
python_version = "3.12"
python_version = "3.10"
ignore_missing_imports = true
follow_imports = "skip"
# warn_return_any = true
@@ -389,9 +360,9 @@ ignore_errors = false
module = "lerobot.cameras.*"
ignore_errors = false
[[tool.mypy.overrides]]
module = "lerobot.motors.*"
ignore_errors = false
# [[tool.mypy.overrides]]
# module = "lerobot.motors.*"
# ignore_errors = false
# [[tool.mypy.overrides]]
# module = "lerobot.robots.*"
@@ -421,3 +392,85 @@ ignore_errors = false
# [[tool.mypy.overrides]]
# module = "lerobot.scripts.*"
# ignore_errors = false
[tool.uv]
# wallx requires transformers==4.49.0 which conflicts with other extras that need >=4.53.0
conflicts = [
[
{ extra = "wallx" },
{ extra = "transformers-dep" },
],
[
{ extra = "wallx" },
{ extra = "pi" },
],
[
{ extra = "wallx" },
{ extra = "smolvla" },
],
[
{ extra = "wallx" },
{ extra = "groot" },
],
[
{ extra = "wallx" },
{ extra = "xvla" },
],
[
{ extra = "wallx" },
{ extra = "sarm" },
],
[
{ extra = "wallx" },
{ extra = "hilserl" },
],
[
{ extra = "wallx" },
{ extra = "libero" },
],
[
{ extra = "wallx" },
{ extra = "peft" },
],
[
{ extra = "wallx" },
{ extra = "all" },
],
# pi uses custom branch which conflicts with transformers-dep
[
{ extra = "pi" },
{ extra = "transformers-dep" },
],
[
{ extra = "pi" },
{ extra = "smolvla" },
],
[
{ extra = "pi" },
{ extra = "groot" },
],
[
{ extra = "pi" },
{ extra = "xvla" },
],
[
{ extra = "pi" },
{ extra = "sarm" },
],
[
{ extra = "pi" },
{ extra = "hilserl" },
],
[
{ extra = "pi" },
{ extra = "libero" },
],
[
{ extra = "pi" },
{ extra = "peft" },
],
[
{ extra = "pi" },
{ extra = "all" },
],
]
+276 -175
View File
@@ -1,73 +1,76 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --output-file=requirements-macos.txt requirements.in
#
-e .[all]
# via -[all]
absl-py==2.4.0
absl-py==2.3.1
# via
# dm-control
# dm-env
# dm-tree
# labmaze
# mujoco
accelerate==1.13.0
# tensorboard
accelerate==1.11.0
# via
# lerobot
# peft
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.13.3
aiohttp==3.13.1
# via fsspec
aiosignal==1.4.0
# via aiohttp
annotated-doc==0.0.4
# via
# fastapi
# typer
annotated-types==0.7.0
# via pydantic
anyio==4.12.1
antlr4-python3-runtime==4.9.3
# via
# hydra-core
# omegaconf
anyio==4.11.0
# via
# httpx
# starlette
# watchfiles
asttokens==3.0.1
asttokens==3.0.0
# via stack-data
async-timeout==5.0.1
# via aiohttp
attrs==25.4.0
# via
# aiohttp
# dm-tree
# jsonlines
# jsonschema
# referencing
# rerun-sdk
av==15.1.0
# via lerobot
bddl==1.0.1
# via libero
certifi==2025.10.5
# via
# lerobot
# qwen-vl-utils
certifi==2026.2.25
# via
# httpcore
# httpx
# requests
# sentry-sdk
cffi==2.0.0
# via pymunk
cfgv==3.5.0
cfgv==3.4.0
# via pre-commit
charset-normalizer==3.4.5
charset-normalizer==3.4.4
# via requests
click==8.3.1
click==8.3.0
# via
# typer
# uvicorn
# wandb
cloudpickle==3.1.2
# via gymnasium
cmake==4.1.3
cloudpickle==3.1.1
# via
# gymnasium
# libero
cmake==4.1.0
# via lerobot
cmeel==0.59.0
cmeel==0.57.3
# via
# cmeel-assimp
# cmeel-boost
@@ -105,17 +108,15 @@ cmeel-zlib==1.3.1
# via cmeel-assimp
coal-library==3.0.1
# via pin
contourpy==1.3.3
# via
# lerobot
# matplotlib
coverage[toml]==7.13.4
contourpy==1.3.2
# via matplotlib
coverage[toml]==7.11.0
# via pytest-cov
cycler==0.12.1
# via matplotlib
datasets==4.6.1
datasets==4.1.1
# via lerobot
debugpy==1.8.20
debugpy==1.8.17
# via lerobot
decorator==5.2.1
# via ipython
@@ -129,7 +130,7 @@ dill==0.4.0
# multiprocess
distlib==0.4.0
# via virtualenv
dm-control==1.0.37
dm-control==1.0.34
# via gym-aloha
dm-env==1.6
# via dm-control
@@ -137,55 +138,69 @@ dm-tree==0.1.9
# via
# dm-control
# dm-env
# lerobot
docopt==0.6.2
# via num2words
draccus==0.10.0
# via lerobot
dynamixel-sdk==3.8.4
# via lerobot
easydict==1.13
# via libero
egl-probe @ git+https://github.com/huggingface/egl_probe.git
# via
# libero
# robomimic
eigenpy==3.10.3
# via coal-library
einops==0.8.2
# via lerobot
eiquadprog==1.2.9
# via placo
etils[epath,epy]==1.14.0
# via mujoco
executing==2.2.1
# via stack-data
faker==34.0.2
# via lerobot
farama-notifications==0.0.4
# via gymnasium
fastapi==0.135.1
einops==0.8.1
# via
# lerobot
# teleop
# libero
eiquadprog==1.2.9
# via placo
etils[epath,epy]==1.13.0
# via mujoco
exceptiongroup==1.3.0
# via
# anyio
# ipython
# pytest
executing==2.2.1
# via stack-data
farama-notifications==0.0.4
# via gymnasium
fastapi==0.119.1
# via teleop
fastjsonschema==2.21.2
# via nbformat
feetech-servo-sdk==1.0.0
# via lerobot
filelock==3.25.0
filelock==3.20.0
# via
# datasets
# diffusers
# huggingface-hub
# python-discovery
# torch
# transformers
# virtualenv
fonttools==4.61.1
fonttools==4.60.1
# via matplotlib
frozenlist==1.8.0
# via
# aiohttp
# aiosignal
fsspec[http]==2026.2.0
fsspec[http]==2025.9.0
# via
# datasets
# etils
# huggingface-hub
# torch
future==1.0.0
# via libero
gitdb==4.0.12
# via gitpython
gitpython==3.1.46
gitpython==3.1.45
# via wandb
glfw==2.10.0
# via
@@ -197,6 +212,7 @@ grpcio==1.73.1
# lerobot
# reachy2-sdk
# reachy2-sdk-api
# tensorboard
grpcio-tools==1.73.1
# via
# lerobot
@@ -207,67 +223,71 @@ gym-hil==0.1.13
# via lerobot
gym-pusht==0.1.6
# via lerobot
gymnasium==1.2.3
gymnasium==1.2.1
# via
# gym-aloha
# gym-hil
# gym-pusht
# lerobot
# libero
# metaworld
h11==0.16.0
# via
# httpcore
# uvicorn
# via uvicorn
h5py==3.15.1
# via robomimic
hebi-py==2.11.0
# via lerobot
hf-xet==1.3.2
hf-transfer==0.1.9
# via huggingface-hub
hf-xet==1.1.10
# via huggingface-hub
hidapi==0.14.0.post4
# via
# gym-hil
# lerobot
httpcore==1.0.9
# via httpx
httptools==0.7.1
# via uvicorn
httpx==0.28.1
# via
# datasets
# huggingface-hub
huggingface-hub==1.6.0
huggingface-hub[cli,hf-transfer]==0.35.3
# via
# accelerate
# datasets
# diffusers
# lerobot
# peft
# timm
# tokenizers
# transformers
identify==2.6.17
hydra-core==1.3.2
# via libero
identify==2.6.15
# via pre-commit
idna==3.11
# via
# anyio
# httpx
# requests
# yarl
imageio[ffmpeg]==2.37.2
imageio[ffmpeg]==2.37.0
# via
# gym-aloha
# gym-hil
# lerobot
# metaworld
# robomimic
# scikit-image
imageio-ffmpeg==0.6.0
# via imageio
importlib-metadata==8.7.1
# via
# imageio
# robomimic
importlib-metadata==8.7.0
# via diffusers
importlib-resources==6.5.2
# via etils
iniconfig==2.3.0
# via pytest
ipython==9.11.0
inquirerpy==0.3.4
# via huggingface-hub
ipython==8.37.0
# via meshcat
ipython-pygments-lexers==1.1.1
# via ipython
ischedule==1.2.7
# via placo
jedi==0.19.2
@@ -276,24 +296,44 @@ jinja2==3.1.6
# via torch
jsonlines==4.0.0
# via lerobot
jsonschema==4.25.1
# via nbformat
jsonschema-specifications==2025.9.1
# via jsonschema
jupyter-core==5.9.1
# via nbformat
jupytext==1.18.1
# via bddl
kiwisolver==1.4.9
# via matplotlib
labmaze==1.0.6
# via dm-control
lazy-loader==0.5
lazy-loader==0.4
# via scikit-image
librt==0.8.1
# via mypy
libero @ git+https://github.com/huggingface/lerobot-libero.git@main
# via lerobot
llvmlite==0.45.1
# via numba
lxml==6.0.2
# via dm-control
markdown==3.9
# via tensorboard
markdown-it-py==4.0.0
# via rich
# via
# jupytext
# mdit-py-plugins
markupsafe==3.0.3
# via jinja2
matplotlib==3.10.8
# via lerobot
# via
# jinja2
# werkzeug
matplotlib==3.10.7
# via
# lerobot
# libero
matplotlib-inline==0.2.1
# via ipython
mdit-py-plugins==0.5.0
# via jupytext
mdurl==0.1.2
# via markdown-it-py
mergedeep==1.3.4
@@ -306,35 +346,41 @@ mock-serial==0.0.1
# via lerobot
mpmath==1.3.0
# via sympy
mujoco==3.5.0
mujoco==3.3.7
# via
# dm-control
# gym-aloha
# gym-hil
# libero
# metaworld
multidict==6.7.1
# robosuite
multidict==6.7.0
# via
# aiohttp
# yarl
multiprocess==0.70.18
multiprocess==0.70.16
# via datasets
mypy==1.19.1
# via lerobot
mypy-extensions==1.1.0
# via typing-inspect
nbformat==5.10.4
# via jupytext
networkx==3.4.2
# via
# mypy
# typing-inspect
networkx==3.6.1
# via
# bddl
# scikit-image
# torch
nodeenv==1.10.0
ninja==1.13.0
# via lerobot
nodeenv==1.9.1
# via pre-commit
num2words==0.5.14
# via lerobot
numba==0.62.1
# via robosuite
numpy==2.2.6
# via
# accelerate
# bddl
# cmeel-boost
# contourpy
# datasets
@@ -343,14 +389,16 @@ numpy==2.2.6
# dm-env
# dm-tree
# gymnasium
# h5py
# hebi-py
# imageio
# labmaze
# lerobot
# libero
# matplotlib
# meshcat
# metaworld
# mujoco
# numba
# opencv-python
# opencv-python-headless
# pandas
@@ -358,18 +406,26 @@ numpy==2.2.6
# pyquaternion
# reachy2-sdk
# rerun-sdk
# robomimic
# robosuite
# scikit-image
# scipy
# shapely
# teleop
# tensorboard
# tensorboardx
# tifffile
# torchvision
# transformers
# transforms3d
opencv-python==4.13.0.92
omegaconf==2.3.0
# via hydra-core
opencv-python==4.12.0.88
# via
# gym-pusht
# libero
# reachy2-sdk
# robosuite
opencv-python-headless==4.12.0.88
# via lerobot
orderly-set==5.5.0
@@ -379,87 +435,97 @@ packaging==25.0
# accelerate
# datasets
# huggingface-hub
# hydra-core
# jupytext
# lazy-loader
# lerobot
# matplotlib
# peft
# pytest
# qwen-vl-utils
# reachy2-sdk
# scikit-image
# tensorboard
# tensorboardx
# transformers
# wandb
pandas==2.3.3
# via
# datasets
# lerobot
parso==0.8.6
parso==0.8.5
# via jedi
pathspec==1.0.4
# via mypy
peft==0.18.1
peft==0.17.1
# via lerobot
pexpect==4.9.0
# via ipython
pillow==12.1.1
pfzy==0.3.4
# via inquirerpy
pillow==12.0.0
# via
# diffusers
# imageio
# lerobot
# matplotlib
# meshcat
# qwen-vl-utils
# rerun-sdk
# robosuite
# scikit-image
# tensorboard
# torchvision
pin==3.4.0
# via placo
placo==0.9.16
placo==0.9.14
# via lerobot
platformdirs==4.9.4
platformdirs==4.5.0
# via
# python-discovery
# jupyter-core
# virtualenv
# wandb
pluggy==1.6.0
# via
# pytest
# pytest-cov
pre-commit==4.5.1
pre-commit==4.3.0
# via lerobot
prompt-toolkit==3.0.52
# via ipython
# via
# inquirerpy
# ipython
propcache==0.4.1
# via
# aiohttp
# yarl
protobuf==6.31.1
protobuf==6.31.0
# via
# dm-control
# grpcio-tools
# lerobot
# reachy2-sdk
# reachy2-sdk-api
# tensorboard
# tensorboardx
# wandb
psutil==7.2.2
psutil==7.1.1
# via
# accelerate
# imageio
# peft
# robomimic
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pyarrow==23.0.1
pyarrow==21.0.0
# via
# datasets
# rerun-sdk
pycparser==3.0
pycparser==2.23
# via cffi
pydantic==2.12.5
pydantic==2.12.3
# via
# fastapi
# wandb
pydantic-core==2.41.5
pydantic-core==2.41.4
# via pydantic
pygame==2.6.1
# via
@@ -469,35 +535,33 @@ pygame==2.6.1
pygments==2.19.2
# via
# ipython
# ipython-pygments-lexers
# pytest
# rich
pymunk==6.11.1
# via
# gym-pusht
# lerobot
pyngrok==7.5.1
pyngrok==7.4.1
# via meshcat
pynput==1.8.1
# via
# gym-hil
# lerobot
pyobjc-core==12.1
pyobjc-core==12.0
# via
# pyobjc-framework-applicationservices
# pyobjc-framework-cocoa
# pyobjc-framework-coretext
# pyobjc-framework-quartz
pyobjc-framework-applicationservices==12.1
pyobjc-framework-applicationservices==12.0
# via pynput
pyobjc-framework-cocoa==12.1
pyobjc-framework-cocoa==12.0
# via
# pyobjc-framework-applicationservices
# pyobjc-framework-coretext
# pyobjc-framework-quartz
pyobjc-framework-coretext==12.1
pyobjc-framework-coretext==12.0
# via pyobjc-framework-applicationservices
pyobjc-framework-quartz==12.1
pyobjc-framework-quartz==12.0
# via
# pynput
# pyobjc-framework-applicationservices
@@ -506,13 +570,13 @@ pyopengl==3.1.10
# via
# dm-control
# mujoco
pyparsing==3.3.2
pyparsing==3.2.5
# via
# dm-control
# matplotlib
pyquaternion==0.9.9
# via reachy2-sdk
pyrealsense2-macosx==2.56.5
pyrealsense2-macosx==2.54.2
# via lerobot
pyserial==3.5
# via
@@ -521,6 +585,7 @@ pyserial==3.5
# lerobot
pytest==8.4.2
# via
# bddl
# lerobot
# pytest-cov
# pytest-timeout
@@ -531,14 +596,11 @@ pytest-timeout==2.4.0
# via lerobot
python-dateutil==2.9.0.post0
# via
# faker
# matplotlib
# pandas
python-discovery==1.1.1
# via virtualenv
python-dotenv==1.2.2
python-dotenv==1.1.1
# via uvicorn
pytz==2026.1.post1
pytz==2025.2
# via pandas
pyyaml==6.0.3
# via
@@ -547,10 +609,13 @@ pyyaml==6.0.3
# draccus
# hebi-py
# huggingface-hub
# jupytext
# omegaconf
# peft
# pre-commit
# pyngrok
# pyyaml-include
# timm
# transformers
# uvicorn
# wandb
@@ -560,13 +625,15 @@ pyzmq==27.1.0
# via
# lerobot
# meshcat
qwen-vl-utils==0.0.14
# via lerobot
reachy2-sdk==1.0.15
reachy2-sdk==1.0.14
# via lerobot
reachy2-sdk-api==1.0.21
# via reachy2-sdk
regex==2026.2.28
referencing==0.37.0
# via
# jsonschema
# jsonschema-specifications
regex==2025.10.23
# via
# diffusers
# transformers
@@ -575,150 +642,184 @@ requests==2.32.5
# datasets
# diffusers
# dm-control
# qwen-vl-utils
# huggingface-hub
# teleop
# transformers
# wandb
rerun-sdk==0.26.2
rerun-sdk==0.26.1
# via lerobot
rhoban-cmeel-jsoncpp==1.9.4.9
# via placo
rich==14.3.3
# via typer
safetensors==0.7.0
robomimic==0.2.0
# via libero
robosuite==1.4.0
# via libero
rpds-py==0.28.0
# via
# jsonschema
# referencing
safetensors==0.6.2
# via
# accelerate
# diffusers
# lerobot
# peft
# timm
# transformers
scikit-image==0.25.2
# via
# gym-pusht
# lerobot
scipy==1.17.1
scipy==1.15.3
# via
# dm-control
# lerobot
# metaworld
# robosuite
# scikit-image
# torchdiffeq
sentry-sdk==2.54.0
sentry-sdk==2.42.1
# via wandb
shapely==2.1.2
# via gym-pusht
shellingham==1.5.4
# via typer
six==1.17.0
# via
# pynput
# python-dateutil
smmap==5.0.3
smmap==5.0.2
# via gitdb
sniffio==1.3.1
# via anyio
stack-data==0.6.3
# via ipython
starlette==0.52.1
starlette==0.48.0
# via fastapi
sympy==1.14.0
# via torch
teleop==0.1.4
teleop==0.1.2
# via lerobot
termcolor==3.3.0
# via lerobot
tifffile==2026.3.3
tensorboard==2.20.0
# via robomimic
tensorboard-data-server==0.7.2
# via tensorboard
tensorboardx==2.6.4
# via robomimic
termcolor==3.1.0
# via
# lerobot
# robomimic
thop==0.1.1.post2209072238
# via libero
tifffile==2025.5.10
# via scikit-image
tokenizers==0.22.2
timm==1.0.20
# via lerobot
tokenizers==0.22.1
# via transformers
toml==0.10.2
# via draccus
torch==2.10.0
tomli==2.3.0
# via
# cmeel
# coverage
# jupytext
# pytest
torch==2.7.1
# via
# accelerate
# lerobot
# peft
# torchdiffeq
# robomimic
# thop
# timm
# torchvision
torchcodec==0.10.0
torchcodec==0.5
# via lerobot
torchdiffeq==0.2.5
# via lerobot
torchvision==0.25.0
# via lerobot
tornado==6.5.4
torchvision==0.22.1
# via
# lerobot
# robomimic
# timm
tornado==6.5.2
# via meshcat
tqdm==4.67.3
tqdm==4.67.1
# via
# datasets
# dm-control
# huggingface-hub
# peft
# robomimic
# transformers
traitlets==5.14.3
# via
# ipython
# jupyter-core
# matplotlib-inline
transformers==5.3.0
# nbformat
transformers==4.57.1
# via
# lerobot
# libero
# peft
transforms3d==0.4.2
# via teleop
typer==0.24.1
# via
# huggingface-hub
# transformers
typing-extensions==4.15.0
# via
# aiosignal
# anyio
# etils
# faker
# exceptiongroup
# fastapi
# gymnasium
# huggingface-hub
# mypy
# ipython
# multidict
# pydantic
# pydantic-core
# referencing
# rerun-sdk
# starlette
# torch
# typing-inspect
# typing-inspection
# uvicorn
# virtualenv
# wandb
typing-inspect==0.9.0
# via draccus
typing-inspection==0.4.2
# via
# fastapi
# pydantic
tzdata==2025.3
# via pydantic
tzdata==2025.2
# via pandas
u-msgpack-python==2.8.0
# via meshcat
urllib3==2.6.3
urllib3==2.5.0
# via
# requests
# sentry-sdk
uvicorn[standard]==0.41.0
uvicorn[standard]==0.38.0
# via teleop
uvloop==0.22.1
# via uvicorn
virtualenv==21.1.0
virtualenv==20.35.3
# via pre-commit
wandb==0.24.2
# via lerobot
wandb==0.21.4
# via
# lerobot
# libero
watchfiles==1.1.1
# via uvicorn
wcwidth==0.6.0
wcwidth==0.2.14
# via prompt-toolkit
websocket-client==1.9.0
# via teleop
websockets==16.0
websockets==15.0.1
# via uvicorn
wrapt==2.1.2
werkzeug==3.1.3
# via tensorboard
wrapt==2.0.0
# via dm-tree
xxhash==3.6.0
# via datasets
yarl==1.23.0
yarl==1.22.0
# via aiohttp
zipp==3.23.0
# via
+187 -208
View File
@@ -1,12 +1,12 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --output-file=requirements-ubuntu.txt requirements.in
#
-e .[all]
# via -[all]
absl-py==2.4.0
absl-py==2.3.1
# via
# dm-control
# dm-env
@@ -14,33 +14,30 @@ absl-py==2.4.0
# labmaze
# mujoco
# tensorboard
accelerate==1.13.0
accelerate==1.11.0
# via
# lerobot
# peft
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.13.3
aiohttp==3.13.1
# via fsspec
aiosignal==1.4.0
# via aiohttp
annotated-doc==0.0.4
# via
# fastapi
# typer
annotated-types==0.7.0
# via pydantic
antlr4-python3-runtime==4.9.3
# via
# hydra-core
# omegaconf
anyio==4.12.1
anyio==4.11.0
# via
# httpx
# starlette
# watchfiles
asttokens==3.0.1
asttokens==3.0.0
# via stack-data
async-timeout==5.0.1
# via aiohttp
attrs==25.4.0
# via
# aiohttp
@@ -50,35 +47,30 @@ attrs==25.4.0
# referencing
# rerun-sdk
av==15.1.0
# via
# lerobot
# qwen-vl-utils
# via lerobot
bddl==1.0.1
# via hf-libero
certifi==2026.2.25
# via libero
certifi==2025.10.5
# via
# httpcore
# httpx
# requests
# sentry-sdk
cffi==2.0.0
# via pymunk
cfgv==3.5.0
cfgv==3.4.0
# via pre-commit
charset-normalizer==3.4.5
charset-normalizer==3.4.4
# via requests
click==8.3.1
click==8.3.0
# via
# typer
# uvicorn
# wandb
cloudpickle==3.1.2
cloudpickle==3.1.1
# via
# gymnasium
# hf-libero
cmake==4.1.3
# libero
cmake==4.1.0
# via lerobot
cmeel==0.59.0
cmeel==0.57.3
# via
# cmeel-assimp
# cmeel-boost
@@ -116,24 +108,20 @@ cmeel-zlib==1.3.1
# via cmeel-assimp
coal-library==3.0.1
# via pin
contourpy==1.3.3
# via
# lerobot
# matplotlib
coverage[toml]==7.13.4
contourpy==1.3.2
# via matplotlib
coverage[toml]==7.11.0
# via pytest-cov
cuda-bindings==12.9.4
# via torch
cuda-pathfinder==1.4.1
# via cuda-bindings
cycler==0.12.1
# via matplotlib
datasets==4.6.1
datasets==4.1.1
# via lerobot
debugpy==1.8.20
debugpy==1.8.17
# via lerobot
decorator==5.2.1
# via ipython
decord==0.6.0
# via lerobot
deepdiff==8.6.1
# via lerobot
diffusers==0.35.2
@@ -144,7 +132,7 @@ dill==0.4.0
# multiprocess
distlib==0.4.0
# via virtualenv
dm-control==1.0.37
dm-control==1.0.34
# via gym-aloha
dm-env==1.6
# via dm-control
@@ -152,6 +140,7 @@ dm-tree==0.1.9
# via
# dm-control
# dm-env
# lerobot
docopt==0.6.2
# via num2words
draccus==0.10.0
@@ -159,60 +148,66 @@ draccus==0.10.0
dynamixel-sdk==3.8.4
# via lerobot
easydict==1.13
# via hf-libero
egl-probe==1.0.2
# via robomimic
# via libero
egl-probe @ git+https://github.com/huggingface/egl_probe.git
# via
# libero
# robomimic
eigenpy==3.10.3
# via coal-library
einops==0.8.2
einops==0.8.1
# via
# hf-libero
# flash-attn
# lerobot
# libero
eiquadprog==1.2.9
# via placo
etils[epath,epy]==1.14.0
etils[epath,epy]==1.13.0
# via mujoco
evdev==1.9.3
evdev==1.9.2
# via pynput
exceptiongroup==1.3.0
# via
# anyio
# ipython
# pytest
executing==2.2.1
# via stack-data
faker==34.0.2
# via lerobot
farama-notifications==0.0.4
# via gymnasium
fastapi==0.135.1
# via
# lerobot
# teleop
fastapi==0.119.1
# via teleop
fastjsonschema==2.21.2
# via nbformat
feetech-servo-sdk==1.0.0
# via lerobot
filelock==3.25.0
filelock==3.20.0
# via
# datasets
# diffusers
# huggingface-hub
# python-discovery
# torch
# transformers
# virtualenv
fonttools==4.61.1
flash-attn==2.8.3
# via lerobot
fonttools==4.60.1
# via matplotlib
frozenlist==1.8.0
# via
# aiohttp
# aiosignal
fsspec[http]==2026.2.0
fsspec[http]==2025.9.0
# via
# datasets
# etils
# huggingface-hub
# torch
future==1.0.0
# via hf-libero
# via libero
gitdb==4.0.12
# via gitpython
gitpython==3.1.46
gitpython==3.1.45
# via wandb
glfw==2.10.0
# via
@@ -235,60 +230,50 @@ gym-hil==0.1.13
# via lerobot
gym-pusht==0.1.6
# via lerobot
gymnasium==1.2.3
gymnasium==1.2.1
# via
# gym-aloha
# gym-hil
# gym-pusht
# hf-libero
# lerobot
# libero
# metaworld
h11==0.16.0
# via
# httpcore
# uvicorn
h5py==3.16.0
# via uvicorn
h5py==3.15.1
# via robomimic
hebi-py==2.11.0
# via lerobot
hf-egl-probe==1.0.2
# via hf-libero
hf-libero==0.1.3
# via lerobot
hf-xet==1.3.2
hf-transfer==0.1.9
# via huggingface-hub
hf-xet==1.1.10
# via huggingface-hub
hidapi==0.14.0.post4
# via
# gym-hil
# lerobot
httpcore==1.0.9
# via httpx
httptools==0.7.1
# via uvicorn
httpx==0.28.1
# via
# datasets
# huggingface-hub
huggingface-hub==1.6.0
huggingface-hub[cli,hf-transfer]==0.35.3
# via
# accelerate
# datasets
# diffusers
# lerobot
# peft
# timm
# tokenizers
# transformers
hydra-core==1.3.2
# via hf-libero
identify==2.6.17
# via libero
identify==2.6.15
# via pre-commit
idna==3.11
# via
# anyio
# httpx
# requests
# yarl
imageio[ffmpeg]==2.37.2
imageio[ffmpeg]==2.37.0
# via
# gym-aloha
# gym-hil
@@ -300,14 +285,16 @@ imageio-ffmpeg==0.6.0
# via
# imageio
# robomimic
importlib-metadata==8.7.1
importlib-metadata==8.7.0
# via diffusers
importlib-resources==6.5.2
# via etils
iniconfig==2.3.0
# via pytest
ipython==9.11.0
inquirerpy==0.3.4
# via huggingface-hub
ipython==8.37.0
# via meshcat
ipython-pygments-lexers==1.1.1
# via ipython
ischedule==1.2.7
# via placo
jedi==0.19.2
@@ -316,41 +303,40 @@ jinja2==3.1.6
# via torch
jsonlines==4.0.0
# via lerobot
jsonschema==4.26.0
jsonschema==4.25.1
# via nbformat
jsonschema-specifications==2025.9.1
# via jsonschema
jupyter-core==5.9.1
# via nbformat
jupytext==1.19.1
jupytext==1.18.1
# via bddl
kiwisolver==1.4.9
# via matplotlib
labmaze==1.0.6
# via dm-control
lazy-loader==0.5
lazy-loader==0.4
# via scikit-image
librt==0.8.1
# via mypy
llvmlite==0.46.0
libero @ git+https://github.com/huggingface/lerobot-libero.git@main
# via lerobot
llvmlite==0.45.1
# via numba
lxml==6.0.2
# via dm-control
markdown==3.10.2
markdown==3.9
# via tensorboard
markdown-it-py==4.0.0
# via
# jupytext
# mdit-py-plugins
# rich
markupsafe==3.0.3
# via
# jinja2
# werkzeug
matplotlib==3.10.8
matplotlib==3.10.7
# via
# hf-libero
# lerobot
# libero
matplotlib-inline==0.2.1
# via ipython
mdit-py-plugins==0.5.0
@@ -367,38 +353,36 @@ mock-serial==0.0.1
# via lerobot
mpmath==1.3.0
# via sympy
mujoco==3.5.0
mujoco==3.3.7
# via
# dm-control
# gym-aloha
# gym-hil
# hf-libero
# libero
# metaworld
# robosuite
multidict==6.7.1
multidict==6.7.0
# via
# aiohttp
# yarl
multiprocess==0.70.18
multiprocess==0.70.16
# via datasets
mypy==1.19.1
# via lerobot
mypy-extensions==1.1.0
# via
# mypy
# typing-inspect
# via typing-inspect
nbformat==5.10.4
# via jupytext
networkx==3.6.1
networkx==3.4.2
# via
# bddl
# scikit-image
# torch
nodeenv==1.10.0
ninja==1.13.0
# via lerobot
nodeenv==1.9.1
# via pre-commit
num2words==0.5.14
# via lerobot
numba==0.64.0
numba==0.62.1
# via robosuite
numpy==2.2.6
# via
@@ -407,6 +391,7 @@ numpy==2.2.6
# cmeel-boost
# contourpy
# datasets
# decord
# diffusers
# dm-control
# dm-env
@@ -414,10 +399,9 @@ numpy==2.2.6
# gymnasium
# h5py
# hebi-py
# hf-libero
# imageio
# labmaze
# lerobot
# libero
# matplotlib
# meshcat
# metaworld
@@ -442,51 +426,49 @@ numpy==2.2.6
# torchvision
# transformers
# transforms3d
nvidia-cublas-cu12==12.8.4.1
nvidia-cublas-cu12==12.6.4.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-cupti-cu12==12.6.80
# via torch
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-nvrtc-cu12==12.6.77
# via torch
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cuda-runtime-cu12==12.6.77
# via torch
nvidia-cudnn-cu12==9.10.2.21
nvidia-cudnn-cu12==9.5.1.17
# via torch
nvidia-cufft-cu12==11.3.3.83
nvidia-cufft-cu12==11.3.0.4
# via torch
nvidia-cufile-cu12==1.13.1.3
nvidia-cufile-cu12==1.11.1.6
# via torch
nvidia-curand-cu12==10.3.9.90
nvidia-curand-cu12==10.3.7.77
# via torch
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusolver-cu12==11.7.1.2
# via torch
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparse-cu12==12.5.4.2
# via
# nvidia-cusolver-cu12
# torch
nvidia-cusparselt-cu12==0.7.1
nvidia-cusparselt-cu12==0.6.3
# via torch
nvidia-nccl-cu12==2.27.5
nvidia-nccl-cu12==2.26.2
# via torch
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvjitlink-cu12==12.6.85
# via
# nvidia-cufft-cu12
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvshmem-cu12==3.4.5
# via torch
nvidia-nvtx-cu12==12.8.90
nvidia-nvtx-cu12==12.6.77
# via torch
omegaconf==2.3.0
# via hydra-core
opencv-python==4.13.0.92
opencv-python==4.12.0.88
# via
# gym-pusht
# hf-libero
# libero
# reachy2-sdk
# robosuite
opencv-python-headless==4.12.0.88
@@ -505,7 +487,6 @@ packaging==25.0
# matplotlib
# peft
# pytest
# qwen-vl-utils
# reachy2-sdk
# scikit-image
# tensorboard
@@ -516,21 +497,21 @@ pandas==2.3.3
# via
# datasets
# lerobot
parso==0.8.6
parso==0.8.5
# via jedi
pathspec==1.0.4
# via mypy
peft==0.18.1
peft==0.17.1
# via lerobot
pexpect==4.9.0
# via ipython
pillow==12.1.1
pfzy==0.3.4
# via inquirerpy
pillow==12.0.0
# via
# diffusers
# imageio
# lerobot
# matplotlib
# meshcat
# qwen-vl-utils
# rerun-sdk
# robosuite
# scikit-image
@@ -538,27 +519,28 @@ pillow==12.1.1
# torchvision
pin==3.4.0
# via placo
placo==0.9.16
placo==0.9.14
# via lerobot
platformdirs==4.9.4
platformdirs==4.5.0
# via
# jupyter-core
# python-discovery
# virtualenv
# wandb
pluggy==1.6.0
# via
# pytest
# pytest-cov
pre-commit==4.5.1
pre-commit==4.3.0
# via lerobot
prompt-toolkit==3.0.52
# via ipython
# via
# inquirerpy
# ipython
propcache==0.4.1
# via
# aiohttp
# yarl
protobuf==6.31.1
protobuf==6.31.0
# via
# dm-control
# grpcio-tools
@@ -568,7 +550,7 @@ protobuf==6.31.1
# tensorboard
# tensorboardx
# wandb
psutil==7.2.2
psutil==7.1.1
# via
# accelerate
# imageio
@@ -578,17 +560,17 @@ ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pyarrow==23.0.1
pyarrow==21.0.0
# via
# datasets
# rerun-sdk
pycparser==3.0
pycparser==2.23
# via cffi
pydantic==2.12.5
pydantic==2.12.3
# via
# fastapi
# wandb
pydantic-core==2.41.5
pydantic-core==2.41.4
# via pydantic
pygame==2.6.1
# via
@@ -598,14 +580,12 @@ pygame==2.6.1
pygments==2.19.2
# via
# ipython
# ipython-pygments-lexers
# pytest
# rich
pymunk==6.11.1
# via
# gym-pusht
# lerobot
pyngrok==7.5.1
pyngrok==7.4.1
# via meshcat
pynput==1.8.1
# via
@@ -615,7 +595,7 @@ pyopengl==3.1.10
# via
# dm-control
# mujoco
pyparsing==3.3.2
pyparsing==3.2.5
# via
# dm-control
# matplotlib
@@ -641,16 +621,13 @@ pytest-timeout==2.4.0
# via lerobot
python-dateutil==2.9.0.post0
# via
# faker
# matplotlib
# pandas
python-discovery==1.1.1
# via virtualenv
python-dotenv==1.2.2
python-dotenv==1.1.1
# via uvicorn
python-xlib==0.33
# via pynput
pytz==2026.1.post1
pytz==2025.2
# via pandas
pyyaml==6.0.3
# via
@@ -665,6 +642,7 @@ pyyaml==6.0.3
# pre-commit
# pyngrok
# pyyaml-include
# timm
# transformers
# uvicorn
# wandb
@@ -674,9 +652,7 @@ pyzmq==27.1.0
# via
# lerobot
# meshcat
qwen-vl-utils==0.0.14
# via lerobot
reachy2-sdk==1.0.15
reachy2-sdk==1.0.14
# via lerobot
reachy2-sdk-api==1.0.21
# via reachy2-sdk
@@ -684,7 +660,7 @@ referencing==0.37.0
# via
# jsonschema
# jsonschema-specifications
regex==2026.2.28
regex==2025.10.23
# via
# diffusers
# transformers
@@ -693,62 +669,60 @@ requests==2.32.5
# datasets
# diffusers
# dm-control
# qwen-vl-utils
# huggingface-hub
# teleop
# transformers
# wandb
rerun-sdk==0.26.2
rerun-sdk==0.26.1
# via lerobot
rhoban-cmeel-jsoncpp==1.9.4.9
# via placo
rich==14.3.3
# via typer
robomimic==0.2.0
# via hf-libero
# via libero
robosuite==1.4.0
# via hf-libero
rpds-py==0.30.0
# via libero
rpds-py==0.28.0
# via
# jsonschema
# referencing
safetensors==0.7.0
safetensors==0.6.2
# via
# accelerate
# diffusers
# lerobot
# peft
# timm
# transformers
scikit-image==0.25.2
# via
# gym-pusht
# lerobot
scipy==1.17.1
scipy==1.15.3
# via
# dm-control
# lerobot
# metaworld
# robosuite
# scikit-image
# torchdiffeq
sentry-sdk==2.54.0
sentry-sdk==2.42.1
# via wandb
shapely==2.1.2
# via gym-pusht
shellingham==1.5.4
# via typer
six==1.17.0
# via
# pynput
# python-dateutil
# python-xlib
smmap==5.0.3
smmap==5.0.2
# via gitdb
sniffio==1.3.1
# via anyio
stack-data==0.6.3
# via ipython
starlette==0.52.1
starlette==0.48.0
# via fastapi
sympy==1.14.0
# via torch
teleop==0.1.4
teleop==0.1.2
# via lerobot
tensorboard==2.20.0
# via robomimic
@@ -756,38 +730,46 @@ tensorboard-data-server==0.7.2
# via tensorboard
tensorboardx==2.6.4
# via robomimic
termcolor==3.3.0
termcolor==3.1.0
# via
# lerobot
# robomimic
thop==0.1.1.post2209072238
# via hf-libero
tifffile==2026.3.3
# via libero
tifffile==2025.5.10
# via scikit-image
tokenizers==0.22.2
timm==1.0.20
# via lerobot
tokenizers==0.22.1
# via transformers
toml==0.10.2
# via draccus
torch==2.10.0
tomli==2.3.0
# via
# cmeel
# coverage
# jupytext
# pytest
torch==2.7.1
# via
# accelerate
# flash-attn
# lerobot
# peft
# robomimic
# thop
# torchdiffeq
# timm
# torchvision
torchcodec==0.10.0
torchcodec==0.5
# via lerobot
torchdiffeq==0.2.5
# via lerobot
torchvision==0.25.0
torchvision==0.22.1
# via
# lerobot
# robomimic
tornado==6.5.4
# timm
tornado==6.5.2
# via meshcat
tqdm==4.67.3
tqdm==4.67.1
# via
# datasets
# dm-control
@@ -801,29 +783,26 @@ traitlets==5.14.3
# jupyter-core
# matplotlib-inline
# nbformat
transformers==5.3.0
transformers==4.57.1
# via
# hf-libero
# lerobot
# libero
# peft
transforms3d==0.4.2
# via teleop
triton==3.6.0
triton==3.3.1
# via torch
typer==0.24.1
# via
# huggingface-hub
# transformers
typing-extensions==4.15.0
# via
# aiosignal
# anyio
# etils
# faker
# exceptiongroup
# fastapi
# gymnasium
# huggingface-hub
# mypy
# ipython
# multidict
# pydantic
# pydantic-core
# referencing
@@ -832,46 +811,46 @@ typing-extensions==4.15.0
# torch
# typing-inspect
# typing-inspection
# uvicorn
# virtualenv
# wandb
typing-inspect==0.9.0
# via draccus
typing-inspection==0.4.2
# via
# fastapi
# pydantic
tzdata==2025.3
# via pydantic
tzdata==2025.2
# via pandas
u-msgpack-python==2.8.0
# via meshcat
urllib3==2.6.3
urllib3==2.5.0
# via
# requests
# sentry-sdk
uvicorn[standard]==0.41.0
uvicorn[standard]==0.38.0
# via teleop
uvloop==0.22.1
# via uvicorn
virtualenv==21.1.0
virtualenv==20.35.3
# via pre-commit
wandb==0.24.2
wandb==0.21.4
# via
# hf-libero
# lerobot
# libero
watchfiles==1.1.1
# via uvicorn
wcwidth==0.6.0
wcwidth==0.2.14
# via prompt-toolkit
websocket-client==1.9.0
# via teleop
websockets==16.0
websockets==15.0.1
# via uvicorn
werkzeug==3.1.6
werkzeug==3.1.3
# via tensorboard
wrapt==2.1.2
wrapt==2.0.0
# via dm-tree
xxhash==3.6.0
# via datasets
yarl==1.23.0
yarl==1.22.0
# via aiohttp
zipp==3.23.0
# via
+4 -4
View File
@@ -1,9 +1,9 @@
# requirements.in
# requirements-macos.txt was generated on macOS and is platform-specific (macOS 26.3.1 25D2128 arm64).
# Darwin MacBook-Pro.local 25.3.0 Darwin Kernel Version 25.3.0: Wed Jan 28 20:54:55 PST 2026; root:xnu-12377.91.3~2/RELEASE_ARM64_T8132 arm64
# requirements-macos.txt was generated on macOS and is platform-specific (macOS 26.0.1 25A362 arm64).
# Darwin MacBook-Pro.local 25.0.0 Darwin Kernel Version 25.0.0: Wed Sep 17 21:42:08 PDT 2025; root:xnu-12377.1.9~141/RELEASE_ARM64_T8132 arm64
# requirements-ubuntu.txt was generated on Linux and is platform-specific (Ubuntu 24.04.4 LTS x86_64).
# Linux lerobot-linux 6.17.0-14-generic #14~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jan 15 15:52:10 UTC 2 x86_64 x86_64 x86_64 GNU/Linux
# requirements-ubuntu.txt was generated on Linux and is platform-specific (Ubuntu 24.04.3 LTS x86_64).
# Linux mlerobot-linux 6.14.0-33-generic #33~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 19 17:02:30 UTC 2 x86_64 x86_64 x86_64 GNU/Linux
-e .[all]
+3 -5
View File
@@ -63,9 +63,9 @@ from lerobot.transport import (
services_pb2_grpc, # type: ignore
)
from lerobot.transport.utils import grpc_channel_options, send_bytes_in_chunks
from lerobot.utils.import_utils import register_third_party_plugins
from .configs import RobotClientConfig
from .constants import SUPPORTED_ROBOTS
from .helpers import (
Action,
FPSTracker,
@@ -485,9 +485,8 @@ class RobotClient:
def async_client(cfg: RobotClientConfig):
logging.info(pformat(asdict(cfg)))
# TODO: Assert if checking robot support is still needed with the plugin system
# if cfg.robot.type not in SUPPORTED_ROBOTS:
# raise ValueError(f"Robot {cfg.robot.type} not yet supported!")
if cfg.robot.type not in SUPPORTED_ROBOTS:
raise ValueError(f"Robot {cfg.robot.type} not yet supported!")
client = RobotClient(cfg)
@@ -513,5 +512,4 @@ def async_client(cfg: RobotClientConfig):
if __name__ == "__main__":
register_third_party_plugins()
async_client() # run the client
+1 -1
View File
@@ -13,5 +13,5 @@
# limitations under the License.
from .camera import Camera
from .configs import CameraConfig, ColorMode, Cv2Backends, Cv2Rotation
from .configs import CameraConfig, ColorMode, Cv2Rotation
from .utils import make_cameras_from_configs
+1 -1
View File
@@ -150,7 +150,7 @@ class Camera(abc.ABC):
"""
pass
def read_latest(self, max_age_ms: int = 500) -> NDArray[Any]:
def read_latest(self, max_age_ms: int = 1000) -> NDArray[Any]:
"""Return the most recent frame captured immediately (Peeking).
This method is non-blocking and returns whatever is currently in the
-23
View File
@@ -25,10 +25,6 @@ class ColorMode(str, Enum):
RGB = "rgb"
BGR = "bgr"
@classmethod
def _missing_(cls, value: object) -> None:
raise ValueError(f"`color_mode` is expected to be in {list(cls)}, but {value} is provided.")
class Cv2Rotation(int, Enum):
NO_ROTATION = 0
@@ -36,25 +32,6 @@ class Cv2Rotation(int, Enum):
ROTATE_180 = 180
ROTATE_270 = -90
@classmethod
def _missing_(cls, value: object) -> None:
raise ValueError(f"`rotation` is expected to be in {list(cls)}, but {value} is provided.")
# Subset from https://docs.opencv.org/3.4/d4/d15/group__videoio__flags__base.html
class Cv2Backends(int, Enum):
    """Integer identifiers for a subset of the OpenCV video I/O backends.

    Each member's value is the numeric backend flag listed on the OpenCV page
    referenced above. Looking up a value that is not one of the members raises
    ``ValueError`` (see ``_missing_``).
    """

    ANY = 0
    V4L2 = 200
    DSHOW = 700
    PVAPI = 800
    ANDROID = 1000
    AVFOUNDATION = 1200
    MSMF = 1400

    @classmethod
    def _missing_(cls, value: object) -> None:
        """Reject any value that does not match a member.

        Raises:
            ValueError: Always; the message lists every member and the rejected value.
        """
        message = f"`backend` is expected to be in {list(cls)}, but {value} is provided."
        raise ValueError(message)
@dataclass(kw_only=True)
class CameraConfig(draccus.ChoiceRegistry, abc.ABC): # type: ignore # TODO: add type stubs for draccus
+15 -10
View File
@@ -32,11 +32,10 @@ if platform.system() == "Windows" and "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"
os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0"
import cv2 # type: ignore # TODO: add type stubs for OpenCV
from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
from lerobot.utils.errors import DeviceNotConnectedError
from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
from ..camera import Camera
from ..utils import get_cv2_rotation
from ..utils import get_cv2_backend, get_cv2_rotation
from .configuration_opencv import ColorMode, OpenCVCameraConfig
# NOTE(Steven): The maximum opencv device index depends on your operating system. For instance,
@@ -118,7 +117,7 @@ class OpenCVCamera(Camera):
self.new_frame_event: Event = Event()
self.rotation: int | None = get_cv2_rotation(config.rotation)
self.backend: int = config.backend
self.backend: int = get_cv2_backend()
if self.height and self.width:
self.capture_width, self.capture_height = self.width, self.height
@@ -133,7 +132,6 @@ class OpenCVCamera(Camera):
"""Checks if the camera is currently connected and opened."""
return isinstance(self.videocapture, cv2.VideoCapture) and self.videocapture.isOpened()
@check_if_already_connected
def connect(self, warmup: bool = True) -> None:
"""
Connects to the OpenCV camera specified in the configuration.
@@ -150,6 +148,8 @@ class OpenCVCamera(Camera):
ConnectionError: If the specified camera index/path is not found or fails to open.
RuntimeError: If the camera opens but fails to apply requested settings.
"""
if self.is_connected:
raise DeviceAlreadyConnectedError(f"{self} is already connected.")
# Use 1 thread for OpenCV operations to avoid potential conflicts or
# blocking in multi-threaded applications, especially during data collection.
@@ -178,7 +178,6 @@ class OpenCVCamera(Camera):
logger.info(f"{self} connected.")
@check_if_not_connected
def _configure_capture_settings(self) -> None:
"""
Applies the specified FOURCC, FPS, width, and height settings to the connected camera.
@@ -198,6 +197,8 @@ class OpenCVCamera(Camera):
to the requested value.
DeviceNotConnectedError: If the camera is not connected.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"Cannot configure settings for {self} as it is not connected.")
# Set FOURCC first (if specified) as it can affect available FPS/resolution options
if self.config.fourcc is not None:
@@ -347,7 +348,6 @@ class OpenCVCamera(Camera):
return frame
@check_if_not_connected
def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]:
"""
Reads a single frame synchronously from the camera.
@@ -374,6 +374,9 @@ class OpenCVCamera(Camera):
f"{self} read() color_mode parameter is deprecated and will be removed in future versions."
)
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -487,7 +490,6 @@ class OpenCVCamera(Camera):
self.latest_timestamp = None
self.new_frame_event.clear()
@check_if_not_connected
def async_read(self, timeout_ms: float = 200) -> NDArray[Any]:
"""
Reads the latest available frame asynchronously.
@@ -510,6 +512,8 @@ class OpenCVCamera(Camera):
TimeoutError: If no frame becomes available within the specified timeout.
RuntimeError: If an unexpected error occurs.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -529,8 +533,7 @@ class OpenCVCamera(Camera):
return frame
@check_if_not_connected
def read_latest(self, max_age_ms: int = 500) -> NDArray[Any]:
def read_latest(self, max_age_ms: int = 1000) -> NDArray[Any]:
"""Return the most recent frame captured immediately (Peeking).
This method is non-blocking and returns whatever is currently in the
@@ -545,6 +548,8 @@ class OpenCVCamera(Camera):
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If the camera is connected but has not captured any frames yet.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -15,9 +15,9 @@
from dataclasses import dataclass
from pathlib import Path
from ..configs import CameraConfig, ColorMode, Cv2Backends, Cv2Rotation
from ..configs import CameraConfig, ColorMode, Cv2Rotation
__all__ = ["OpenCVCameraConfig", "ColorMode", "Cv2Rotation", "Cv2Backends"]
__all__ = ["OpenCVCameraConfig", "ColorMode", "Cv2Rotation"]
@CameraConfig.register_subclass("opencv")
@@ -50,7 +50,6 @@ class OpenCVCameraConfig(CameraConfig):
rotation: Image rotation setting (0°, 90°, 180°, or 270°). Defaults to no rotation.
warmup_s: Time reading frames before returning from connect (in seconds)
fourcc: FOURCC code for video format (e.g., "MJPG", "YUYV", "I420"). Defaults to None (auto-detect).
backend: OpenCV backend identifier (https://docs.opencv.org/3.4/d4/d15/group__videoio__flags__base.html). Defaults to ANY.
Note:
- Only 3-channel color output (RGB/BGR) is currently supported.
@@ -63,12 +62,22 @@ class OpenCVCameraConfig(CameraConfig):
rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION
warmup_s: int = 1
fourcc: str | None = None
backend: Cv2Backends = Cv2Backends.ANY
def __post_init__(self) -> None:
self.color_mode = ColorMode(self.color_mode)
self.rotation = Cv2Rotation(self.rotation)
self.backend = Cv2Backends(self.backend)
if self.color_mode not in (ColorMode.RGB, ColorMode.BGR):
raise ValueError(
f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided."
)
if self.rotation not in (
Cv2Rotation.NO_ROTATION,
Cv2Rotation.ROTATE_90,
Cv2Rotation.ROTATE_180,
Cv2Rotation.ROTATE_270,
):
raise ValueError(
f"`rotation` is expected to be in {(Cv2Rotation.NO_ROTATION, Cv2Rotation.ROTATE_90, Cv2Rotation.ROTATE_180, Cv2Rotation.ROTATE_270)}, but {self.rotation} is provided."
)
if self.fourcc is not None and (not isinstance(self.fourcc, str) or len(self.fourcc) != 4):
raise ValueError(
@@ -74,4 +74,7 @@ class Reachy2CameraConfig(CameraConfig):
f"`image_type` is expected to be 'left' or 'right' for teleop camera, and 'rgb' or 'depth' for depth camera, but {self.image_type} is provided."
)
self.color_mode = ColorMode(self.color_mode)
if self.color_mode not in ["rgb", "bgr"]:
raise ValueError(
f"`color_mode` is expected to be 'rgb' or 'bgr', but {self.color_mode} is provided."
)
@@ -32,7 +32,6 @@ if platform.system() == "Windows" and "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"
import cv2 # type: ignore # TODO: add type stubs for OpenCV
import numpy as np # type: ignore # TODO: add type stubs for numpy
from lerobot.utils.decorators import check_if_not_connected
from lerobot.utils.import_utils import _reachy2_sdk_available
if TYPE_CHECKING or _reachy2_sdk_available:
@@ -124,7 +123,6 @@ class Reachy2Camera(Camera):
"""
raise NotImplementedError("Camera detection is not implemented for Reachy2 cameras.")
@check_if_not_connected
def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]:
"""
Reads a single frame synchronously from the camera.
@@ -138,6 +136,9 @@ class Reachy2Camera(Camera):
"""
start_time = time.perf_counter()
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.cam_manager is None:
raise DeviceNotConnectedError(f"{self} is not connected.")
@@ -183,7 +184,6 @@ class Reachy2Camera(Camera):
return frame
@check_if_not_connected
def async_read(self, timeout_ms: float = 200) -> NDArray[Any]:
"""
Same as read()
@@ -197,11 +197,12 @@ class Reachy2Camera(Camera):
TimeoutError: If no frame becomes available within the specified timeout.
RuntimeError: If an unexpected error occurs.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
return self.read()
@check_if_not_connected
def read_latest(self, max_age_ms: int = 500) -> NDArray[Any]:
def read_latest(self, max_age_ms: int = 1000) -> NDArray[Any]:
"""Return the most recent frame captured immediately (Peeking).
This method is non-blocking and returns whatever is currently in the
@@ -218,6 +219,8 @@ class Reachy2Camera(Camera):
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If the camera is connected but has not captured any frames yet.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.latest_frame is None or self.latest_timestamp is None:
raise RuntimeError(f"{self} has not captured any frames yet.")
@@ -230,7 +233,6 @@ class Reachy2Camera(Camera):
return self.latest_frame
@check_if_not_connected
def disconnect(self) -> None:
"""
Stops the background read thread (if running).
@@ -238,6 +240,8 @@ class Reachy2Camera(Camera):
Raises:
DeviceNotConnectedError: If the camera is already disconnected.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} not connected.")
if self.cam_manager is not None:
self.cam_manager.disconnect()
@@ -30,8 +30,7 @@ try:
except Exception as e:
logging.info(f"Could not import realsense: {e}")
from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
from lerobot.utils.errors import DeviceNotConnectedError
from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
from ..camera import Camera
from ..configs import ColorMode
@@ -153,7 +152,6 @@ class RealSenseCamera(Camera):
"""Checks if the camera pipeline is started and streams are active."""
return self.rs_pipeline is not None and self.rs_profile is not None
@check_if_already_connected
def connect(self, warmup: bool = True) -> None:
"""
Connects to the RealSense camera specified in the configuration.
@@ -171,6 +169,8 @@ class RealSenseCamera(Camera):
ConnectionError: If the camera is found but fails to start the pipeline or no RealSense devices are detected at all.
RuntimeError: If the pipeline starts but fails to apply requested settings.
"""
if self.is_connected:
raise DeviceAlreadyConnectedError(f"{self} is already connected.")
self.rs_pipeline = rs.pipeline()
rs_config = rs.config()
@@ -290,7 +290,6 @@ class RealSenseCamera(Camera):
if self.use_depth:
rs_config.enable_stream(rs.stream.depth)
@check_if_not_connected
def _configure_capture_settings(self) -> None:
"""Sets fps, width, and height from device stream if not already configured.
@@ -300,6 +299,8 @@ class RealSenseCamera(Camera):
Raises:
DeviceNotConnectedError: If device is not connected.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"Cannot validate settings for {self} as it is not connected.")
if self.rs_profile is None:
raise RuntimeError(f"{self}: rs_profile must be initialized before use.")
@@ -319,7 +320,6 @@ class RealSenseCamera(Camera):
self.width, self.height = actual_width, actual_height
self.capture_width, self.capture_height = actual_width, actual_height
@check_if_not_connected
def read_depth(self, timeout_ms: int = 200) -> NDArray[Any]:
"""
Reads a single frame (depth) synchronously from the camera.
@@ -345,6 +345,9 @@ class RealSenseCamera(Camera):
f"Failed to capture depth frame '.read_depth()'. Depth stream is not enabled for {self}."
)
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -371,7 +374,6 @@ class RealSenseCamera(Camera):
return frame
@check_if_not_connected
def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 0) -> NDArray[Any]:
"""
Reads a single frame (color) synchronously from the camera.
@@ -401,6 +403,9 @@ class RealSenseCamera(Camera):
f"{self} read() timeout_ms parameter is deprecated and will be removed in future versions."
)
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -529,7 +534,6 @@ class RealSenseCamera(Camera):
self.new_frame_event.clear()
# NOTE(Steven): Missing implementation for depth for now
@check_if_not_connected
def async_read(self, timeout_ms: float = 200) -> NDArray[Any]:
"""
Reads the latest available frame data (color) asynchronously.
@@ -552,6 +556,8 @@ class RealSenseCamera(Camera):
TimeoutError: If no frame data becomes available within the specified timeout.
RuntimeError: If the background thread died unexpectedly or another error occurs.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -572,8 +578,7 @@ class RealSenseCamera(Camera):
return frame
# NOTE(Steven): Missing implementation for depth for now
@check_if_not_connected
def read_latest(self, max_age_ms: int = 500) -> NDArray[Any]:
def read_latest(self, max_age_ms: int = 1000) -> NDArray[Any]:
"""Return the most recent (color) frame captured immediately (Peeking).
This method is non-blocking and returns whatever is currently in the
@@ -588,6 +593,8 @@ class RealSenseCamera(Camera):
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If the camera is connected but has not captured any frames yet.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -60,8 +60,20 @@ class RealSenseCameraConfig(CameraConfig):
warmup_s: int = 1
def __post_init__(self) -> None:
self.color_mode = ColorMode(self.color_mode)
self.rotation = Cv2Rotation(self.rotation)
if self.color_mode not in (ColorMode.RGB, ColorMode.BGR):
raise ValueError(
f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided."
)
if self.rotation not in (
Cv2Rotation.NO_ROTATION,
Cv2Rotation.ROTATE_90,
Cv2Rotation.ROTATE_180,
Cv2Rotation.ROTATE_270,
):
raise ValueError(
f"`rotation` is expected to be in {(Cv2Rotation.NO_ROTATION, Cv2Rotation.ROTATE_90, Cv2Rotation.ROTATE_180, Cv2Rotation.ROTATE_270)}, but {self.rotation} is provided."
)
values = (self.fps, self.width, self.height)
if any(v is not None for v in values) and any(v is None for v in values):
+12
View File
@@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import platform
from typing import cast
from lerobot.utils.import_utils import make_device_from_device_class
@@ -67,3 +68,14 @@ def get_cv2_rotation(rotation: Cv2Rotation) -> int | None:
return int(cv2.ROTATE_90_COUNTERCLOCKWISE)
else:
return None
def get_cv2_backend() -> int:
    """Choose the OpenCV capture backend flag for the current operating system.

    Returns:
        ``cv2.CAP_MSMF`` when ``platform.system()`` reports ``"Windows"``,
        otherwise ``cv2.CAP_ANY``; the flag is converted to a plain ``int``.
    """
    # cv2 is imported at call time rather than at module import.
    import cv2

    # Only Windows gets a dedicated backend (MSMF). A Darwin branch returning
    # cv2.CAP_AVFOUNDATION existed here but was left disabled, so macOS, Linux
    # and every other system fall through to CAP_ANY.
    on_windows = platform.system() == "Windows"
    return int(cv2.CAP_MSMF if on_windows else cv2.CAP_ANY)
+11 -7
View File
@@ -34,8 +34,7 @@ import cv2
import numpy as np
from numpy.typing import NDArray
from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
from lerobot.utils.errors import DeviceNotConnectedError
from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
from ..camera import Camera
from ..configs import ColorMode
@@ -105,7 +104,6 @@ class ZMQCamera(Camera):
"""Checks if the ZMQ socket is initialized and connected."""
return self._connected and self.context is not None and self.socket is not None
@check_if_already_connected
def connect(self, warmup: bool = True) -> None:
"""Connect to ZMQ camera server.
@@ -113,6 +111,8 @@ class ZMQCamera(Camera):
warmup (bool): If True, waits for the camera to provide at least one
valid frame before returning. Defaults to True.
"""
if self.is_connected:
raise DeviceAlreadyConnectedError(f"{self} is already connected.")
logger.info(f"Connecting to {self}...")
@@ -181,7 +181,7 @@ class ZMQCamera(Camera):
try:
message = self.socket.recv_string()
except Exception as e:
# zmq is lazy-imported in connect(), so check by name to avoid a top-level import
# Check for ZMQ timeout (EAGAIN/Again) without requiring global zmq import
if type(e).__name__ == "Again":
raise TimeoutError(f"{self} timeout after {self.timeout_ms}ms") from e
raise
@@ -211,7 +211,6 @@ class ZMQCamera(Camera):
return frame
@check_if_not_connected
def read(self, color_mode: ColorMode | None = None) -> NDArray[Any]:
"""
Reads a single frame synchronously from the camera.
@@ -229,6 +228,9 @@ class ZMQCamera(Camera):
f"{self} read() color_mode parameter is deprecated and will be removed in future versions."
)
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -299,7 +301,6 @@ class ZMQCamera(Camera):
self.latest_timestamp = None
self.new_frame_event.clear()
@check_if_not_connected
def async_read(self, timeout_ms: float = 200) -> NDArray[Any]:
"""
Reads the latest available frame asynchronously.
@@ -316,6 +317,8 @@ class ZMQCamera(Camera):
TimeoutError: If no frame data becomes available within the specified timeout.
RuntimeError: If the background thread is not running.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
@@ -332,7 +335,6 @@ class ZMQCamera(Camera):
return frame
@check_if_not_connected
def read_latest(self, max_age_ms: int = 1000) -> NDArray[Any]:
"""Return the most recent frame captured immediately (Peeking).
@@ -348,6 +350,8 @@ class ZMQCamera(Camera):
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If the camera is connected but has not captured any frames yet.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
raise RuntimeError(f"{self} read thread is not running.")
+4 -1
View File
@@ -32,7 +32,10 @@ class ZMQCameraConfig(CameraConfig):
warmup_s: int = 1
def __post_init__(self) -> None:
self.color_mode = ColorMode(self.color_mode)
if self.color_mode not in (ColorMode.RGB, ColorMode.BGR):
raise ValueError(
f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided."
)
if self.timeout_ms <= 0:
raise ValueError(f"`timeout_ms` must be positive, but {self.timeout_ms} is provided.")
+4 -72
View File
@@ -23,7 +23,6 @@ import base64
import contextlib
import json
import logging
import threading
import time
from collections import deque
@@ -43,57 +42,10 @@ def encode_image(image: np.ndarray, quality: int = 80) -> str:
return base64.b64encode(buffer).decode("utf-8")
class CameraCaptureThread:
    """Read frames from one camera on a daemon thread and keep the newest one, already encoded.

    The worker loop calls ``camera.read()``, encodes the frame with ``encode_image``
    and stores the result together with its capture time. Consumers poll
    ``get_latest()`` instead of touching the camera themselves.
    """

    def __init__(self, camera: OpenCVCamera, name: str):
        """Store the camera and its label; no thread is started here.

        Args:
            camera: Camera whose ``read()`` is called by the worker loop.
            name: Label used in the capture-error log message.
        """
        self.camera = camera
        self.name = name
        # Newest frame as returned by encode_image(); None until a capture succeeds.
        self.latest_encoded: str | None = None
        # time.time() taken right after the newest frame was read.
        self.latest_timestamp: float = 0.0
        # Guards the two fields above: written by the worker, read by get_latest().
        self.frame_lock = threading.Lock()
        self.running = False
        self.thread: threading.Thread | None = None

    def start(self):
        """Raise the running flag and launch the daemon worker thread."""
        self.running = True
        worker = threading.Thread(target=self._capture_loop, daemon=True)
        self.thread = worker
        worker.start()

    def stop(self):
        """Lower the running flag and wait up to one second for the worker to exit."""
        self.running = False
        if not self.thread:
            return
        self.thread.join(timeout=1.0)

    def _capture_loop(self):
        """Worker body: capture, encode and publish frames until ``running`` is cleared."""
        while self.running:
            try:
                # read() is what paces the loop (it blocks at the camera's native rate).
                raw_frame = self.camera.read()
                captured_at = time.time()

                # Encoding is the slow step, so it is done here, before taking the lock.
                payload = encode_image(raw_frame)

                with self.frame_lock:
                    self.latest_encoded, self.latest_timestamp = payload, captured_at
            except Exception as e:
                # Best effort: log the failure, pause briefly, then try again.
                logger.warning(f"Camera {self.name} capture error: {e}")
                time.sleep(0.01)

    def get_latest(self) -> tuple[str | None, float]:
        """Return ``(latest_encoded, latest_timestamp)`` read together under the lock."""
        with self.frame_lock:
            snapshot = (self.latest_encoded, self.latest_timestamp)
        return snapshot
class ImageServer:
def __init__(self, config: dict, port: int = 5555):
# fps controls the publish loop rate (how often frames are sent over ZMQ), not the camera capture rate
self.fps = config.get("fps", 30)
self.cameras: dict[str, OpenCVCamera] = {}
self.capture_threads: dict[str, CameraCaptureThread] = {}
for name, cfg in config.get("cameras", {}).items():
shape = cfg.get("shape", [480, 640])
@@ -109,10 +61,6 @@ class ImageServer:
self.cameras[name] = camera
logger.info(f"Camera {name}: {shape[1]}x{shape[0]}")
# Create capture thread for this camera
capture_thread = CameraCaptureThread(camera, name)
self.capture_threads[name] = capture_thread
# ZMQ PUB socket
self.context = zmq.Context()
self.socket = self.context.socket(zmq.PUB)
@@ -125,18 +73,6 @@ class ImageServer:
def run(self):
frame_count = 0
frame_times = deque(maxlen=60)
last_published_ts: dict[str, float] = {}
# Start all capture threads
for capture_thread in self.capture_threads.values():
capture_thread.start()
# Wait for first frames to be captured and encoded
logger.info("Waiting for cameras to start capturing...")
for name, capture_thread in self.capture_threads.items():
while capture_thread.get_latest()[0] is None:
time.sleep(0.01)
logger.info(f"Camera {name} ready (capture + encode in background)")
try:
while True:
@@ -144,12 +80,10 @@ class ImageServer:
# Build message
message = {"timestamps": {}, "images": {}}
for name, capture_thread in self.capture_threads.items():
encoded, timestamp = capture_thread.get_latest()
if encoded is not None and timestamp > last_published_ts.get(name, 0.0):
message["timestamps"][name] = timestamp
message["images"][name] = encoded
last_published_ts[name] = timestamp
for name, cam in self.cameras.items():
frame = cam.read() # Returns RGB
message["timestamps"][name] = time.time()
message["images"][name] = encode_image(frame)
# Send as JSON string (suppress if buffer full)
with contextlib.suppress(zmq.Again):
@@ -168,8 +102,6 @@ class ImageServer:
except KeyboardInterrupt:
pass
finally:
for capture_thread in self.capture_threads.values():
capture_thread.stop()
for cam in self.cameras.values():
cam.disconnect()
self.socket.close()
+6 -27
View File
@@ -16,13 +16,18 @@
from dataclasses import dataclass, field
from lerobot.datasets.transforms import DatasetTransformStepConfig, ImageTransformsConfig
from lerobot.datasets.transforms import ImageTransformsConfig
from lerobot.datasets.video_utils import get_safe_default_codec
@dataclass
class DatasetConfig:
# You may provide a list of datasets here. `train.py` creates them all and concatenates them. Note: only data
# keys common between the datasets are kept. Each dataset gets an additional transform that inserts the
# "dataset_index" into the returned item. The index mapping is made according to the order in which the
# datasets are provided.
repo_id: str
# Root directory where the dataset will be stored (e.g. 'dataset/path').
root: str | None = None
episodes: list[int] | None = None
image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
@@ -32,32 +37,6 @@ class DatasetConfig:
streaming: bool = False
@dataclass
class SubDatasetConfig:
    """Settings for one dataset entry inside a ``MultiDatasetConfig``."""

    # Dataset identifier; the only field without a default.
    repo_id: str
    root: str | None = None
    episodes: list[int] | None = None
    revision: str | None = None
    # Default is whatever get_safe_default_codec() returns when the instance is created.
    video_backend: str = field(default_factory=get_safe_default_codec)
    weight: float = 1.0
    # Renames dataset-local feature keys to the unified policy keys.
    # A key that is not listed here is passed through unchanged.
    feature_map: dict[str, str] = field(default_factory=dict)
    # Transforms for this dataset only, applied after the feature renaming
    # and before the cross-dataset padding.
    transforms: list[DatasetTransformStepConfig] | None = None
@dataclass
class MultiDatasetConfig:
    """Settings for training jointly on several datasets."""

    # One SubDatasetConfig per dataset; starts out empty.
    datasets: list[SubDatasetConfig] = field(default_factory=list)
    # A fresh ImageTransformsConfig is built for each instance.
    image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
    use_imagenet_stats: bool = True
@dataclass
class WandBConfig:
enable: bool = False
+6 -9
View File
@@ -24,7 +24,7 @@ from huggingface_hub.errors import HfHubHTTPError
from lerobot import envs
from lerobot.configs import parser
from lerobot.configs.default import DatasetConfig, EvalConfig, MultiDatasetConfig, PeftConfig, WandBConfig
from lerobot.configs.default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig
from lerobot.configs.policies import PreTrainedConfig
from lerobot.optim import OptimizerConfig
from lerobot.optim.schedulers import LRSchedulerConfig
@@ -35,7 +35,7 @@ TRAIN_CONFIG_NAME = "train_config.json"
@dataclass
class TrainPipelineConfig(HubMixin):
dataset: DatasetConfig | MultiDatasetConfig
dataset: DatasetConfig
env: envs.EnvConfig | None = None
policy: PreTrainedConfig | None = None
# Set `dir` to where you would like to save all of the run outputs. If you run another training session
@@ -50,9 +50,6 @@ class TrainPipelineConfig(HubMixin):
# `seed` is used for training (eg: model initialization, dataset shuffling)
# AND for the evaluation environments.
seed: int | None = 1000
# Set to True to use deterministic cuDNN algorithms for reproducibility.
# This disables cudnn.benchmark and may reduce training speed by ~10-20%.
cudnn_deterministic: bool = False
# Number of workers for the dataloader.
num_workers: int = 4
batch_size: int = 8
@@ -129,9 +126,8 @@ class TrainPipelineConfig(HubMixin):
train_dir = f"{now:%Y-%m-%d}/{now:%H-%M-%S}_{self.job_name}"
self.output_dir = Path("outputs/train") / train_dir
if isinstance(self.dataset, MultiDatasetConfig):
if len(self.dataset.datasets) < 1:
raise ValueError("MultiDatasetConfig.datasets must contain at least one sub-dataset.")
if isinstance(self.dataset.repo_id, list):
raise NotImplementedError("LeRobotMultiDataset is not currently implemented.")
if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
@@ -144,7 +140,8 @@ class TrainPipelineConfig(HubMixin):
"'policy.repo_id' argument missing. Please specify it to push the model to the hub."
)
if self.use_rabc and not self.rabc_progress_path and isinstance(self.dataset, DatasetConfig):
if self.use_rabc and not self.rabc_progress_path:
# Auto-detect from dataset path
repo_id = self.dataset.repo_id
if self.dataset.root:
self.rabc_progress_path = str(Path(self.dataset.root) / "sarm_progress.parquet")
@@ -0,0 +1,50 @@
#!/bin/bash
# Example script: high-level annotation (user prompts and robot utterances) with a Qwen VLM.
# The generated dialogue is used for hierarchical policy training.

# Abort on the first failing command, on unset variables and on pipeline errors.
set -euo pipefail

# Configuration
# NOTE: REPO_ID, MODEL and TEMPERATURE are only used by the commented-out commands below.
REPO_ID="lerobot/libero_10"
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
# or: MODEL="Qwen/Qwen2-VL-7B-Instruct"
DATA_DIR="/fsx/jade_choghari/outputs/libero-10-annotate"
OUTPUT_DIR="/fsx/jade_choghari/outputs/libero-10-annotate-high"
BATCH_SIZE=16
TEMPERATURE=0.9
SAMPLE_INTERVAL=5.0 # seconds between generated dialogue samples (all episodes processed)

# Run subtask annotation
# python /admin/home/jade_choghari/lerobot/src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
# --repo-id "$REPO_ID" \
# --video-key observation.images.image \
# --output-dir "$OUTPUT_DIR" \
# --skip-existing \
# --output-repo-id "jadechoghari/libero10-annotate" \
# --batch-size "$BATCH_SIZE"

# run synthetic data generation (all episodes processed)
# python examples/dataset/annotate_pgen.py \
# --repo-id "$REPO_ID" \
# --model "$MODEL" \
# --output-dir "$OUTPUT_DIR" \
# --temperature "$TEMPERATURE" \
# --batch-size "$BATCH_SIZE" \
# --sample-interval "$SAMPLE_INTERVAL" \
# --image-key observation.images.base \
# --num-image-views-per-sample 1

# for faster testing, increase SAMPLE_INTERVAL (fewer samples per episode)

# to push to hub after generation:
# add --push-to-hub flag

# efficient batch processing: BATCH_SIZE episodes at once
python src/lerobot/data_processing/annotations/high_level_annotate.py \
--data-dir "$DATA_DIR" \
--output-dir "$OUTPUT_DIR" \
--video-mode \
--video-key observation.images.image \
--video-batch-size "$BATCH_SIZE" \
--sample-interval "$SAMPLE_INTERVAL"
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,52 @@
import torch
from huggingface_hub import HfApi
import lerobot
from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from lerobot.policies.factory import make_pre_post_processors
from lerobot.configs.policies import PreTrainedConfig
# Scratch script: load a subtask-annotated dataset, run one batch through the
# pi05 pre-processor and check that every episode exposes a `subtask_index`.
# Alternative local dataset: /fsx/jade_choghari/data/libero_10_subtasks_kw_converted
dataset = LeRobotDataset(repo_id="lerobot/libero_10_image_subtask")
dataloader = torch.utils.data.DataLoader(
    dataset,
    num_workers=0,
    batch_size=2,
    shuffle=True,
)
cfg = PreTrainedConfig.from_pretrained(
    pretrained_name_or_path="/fsx/jade_choghari/models/pi05-base",
)
cfg.dtype = "bfloat16"
pre_processor, post_processor = make_pre_post_processors(
    policy_cfg=cfg,
    pretrained_path="/fsx/jade_choghari/models/pi05-base",
)
batch = next(iter(dataloader))
print(batch.keys())
# Show what the pre-processor adds/renames instead of stopping in a debugger,
# so the script also runs non-interactively.
processed_batch = pre_processor(batch)
print(processed_batch.keys())
# print(batch['task_index_high_level'].shape)
# print(batch['task_index_high_level'])
# print(batch['user_prompt'][0])
# print(batch['robot_utterance'][0])
# print(batch['task'][0])
valid_episode_list = []
for episode_idx in range(len(dataset.meta.episodes)):
    # `dataset[i]` indexes frames, not episodes: look up the first frame of the episode.
    first_frame_idx = int(dataset.meta.episodes[episode_idx]["dataset_from_index"])
    # Raises KeyError when the episode's frames carry no subtask annotation.
    _ = dataset[first_frame_idx]["subtask_index"]
    valid_episode_list.append(episode_idx)
print(len(valid_episode_list))
# read this parquet: /fsx/jade_choghari/outputs/pgen_annotations1/meta/tasks.parquet
# import pandas as pd
# tasks_df = pd.read_parquet('/fsx/jade_choghari/outputs/pgen_annotations1/meta/tasks.parquet')
# # print all
# print(tasks_df.columns)
# breakpoint()
@@ -0,0 +1,74 @@
#!/bin/bash
# Example script: subtask annotation of a LeRobot dataset with a Qwen VLM.
# Commented-out variants below show image-window annotation and synthetic
# dialogue generation for hierarchical policy training.

# Abort on the first failing command, on unset variables and on pipeline errors.
set -euo pipefail

# Configuration
# NOTE: TEMPERATURE and SAMPLE_INTERVAL are only used by the commented-out commands below.
REPO_ID="jadechoghari/piper-demo-20260205_103303"
# MODEL="Qwen/Qwen3-VL-30B-A3B-Thinking"
MODEL="Qwen/Qwen3.5-27B"
# or: MODEL="Qwen/Qwen2-VL-7B-Instruct"
OUTPUT_DIR="/fsx/jade_choghari/outputs/collect-data-pgen_new"
BATCH_SIZE=2
TEMPERATURE=0.9
SAMPLE_INTERVAL=5.0 # seconds between generated dialogue samples (all episodes processed)

# Run subtask annotation.
# Closed-vocabulary labels are passed with --subtask-labels "label1" "label2" ...
# Delete the --subtask-labels line below to let the model name subtasks freely.
python /home/lerobot/src/lerobot/data_processing/annotations/subtask_annotate.py \
--repo-id "$REPO_ID" \
--video-key observation.images.top \
--output-dir "$OUTPUT_DIR" \
--output-repo-id "jadechoghari/piper-demo-annotated1" \
--push-to-hub \
--no-timer-overlay \
--model "$MODEL" \
--subtask-labels "pick_up_yellow_nut_bar" "pick_up_cake" "pick_up_biscuit_pack" "pick_up_soda_can" \
--batch-size "$BATCH_SIZE"

# Run subtask annotation (image-window: frames as images for better accuracy)
# python /admin/home/jade_choghari/lerobot/src/lerobot/data_processing/annotations/subtask_annotate_image.py \
# --repo-id "$REPO_ID" \
# --camera-key observation.images.wrist \
# --output-dir "$OUTPUT_DIR" \
# --output-repo-id "jadechoghari/piper-demo-annotated1-image" \
# --push-to-hub \
# --model "$MODEL" \
# --window-size 184 \
# --max-frames-per-window 16 \
# --subtask-labels "pick_up_yellow_nut_bar" "pick_up_cake" "pick_up_biscuit_pack" "pick_up_soda_can" \
# --batch-size "$BATCH_SIZE"

# run synthetic data generation (all episodes processed)
# python examples/dataset/annotate_pgen.py \
# --repo-id "$REPO_ID" \
# --model "$MODEL" \
# --output-dir "$OUTPUT_DIR" \
# --temperature "$TEMPERATURE" \
# --batch-size "$BATCH_SIZE" \
# --sample-interval "$SAMPLE_INTERVAL" \
# --image-key observation.images.base \
# --num-image-views-per-sample 1

# for faster testing, increase SAMPLE_INTERVAL (fewer samples per episode)

# to push to hub after generation:
# add --push-to-hub flag

# efficient batch processing: BATCH_SIZE episodes at once
# python examples/dataset/annotate_pgen.py \
# --repo-id "$REPO_ID" \
# --model "$MODEL" \
# --output-dir "$OUTPUT_DIR" \
# --video-mode \
# --video-key observation.images.up \
# --video-batch-size "$BATCH_SIZE" \
# --sample-interval 1.0
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,561 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image-window subtask annotation for LeRobot datasets using Qwen VLMs.
This script assigns a subtask to each window of consecutive frames by sending
those frames as images to the VLM (instead of a video) for better accuracy.
Supports Qwen2-VL and Qwen3-VL (same models as subtask_annotate.py).
Pipeline:
1. Load a LeRobot dataset (local or Hub).
2. For each episode, slide a window over frame indices.
3. For each window, load the corresponding images (from image_key or decoded video_key).
4. Send the window of images to the Qwen VLM (Qwen2-VL or Qwen3-VL) with the skill prompt; get one subtask name.
5. Assign that subtask to all frames in the window.
6. Write subtasks.parquet and add subtask_index via add_features (same as subtask_annotate).
Usage:
python -m lerobot.data_processing.annotations.subtask_annotate_image \\
--data-dir /path/to/dataset --camera-key observation.images.base \\
--window-size 8 --stride 8 --output-dir ./output
"""
from __future__ import annotations
import argparse
import random
import textwrap
from pathlib import Path
import numpy as np
import PIL.Image
import torch
from rich.console import Console
from lerobot.datasets.lerobot_dataset import LeRobotDataset
# Reuse data structures and save/load from the video-based annotator
from lerobot.data_processing.annotations.subtask_annotate import (
EpisodeSkills,
Skill,
load_skill_annotations,
save_skill_annotations,
)
def create_window_skill_prompt(
    coarse_goal: str | None = None,
    subtask_labels: list[str] | None = None,
) -> str:
    """Build the system prompt for labeling one window of frames with one atomic skill.

    Args:
        coarse_goal: Optional description of the overall task; when given it is
            quoted at the top of the "# Task" section.
        subtask_labels: Optional closed vocabulary. When non-empty, the prompt
            instructs the model to choose exactly one label from this list.

    Returns:
        The prompt text, with no leading or trailing whitespace.
    """
    goal_context = f'The overall goal is: "{coarse_goal}".\n\n' if coarse_goal else ""
    if subtask_labels:
        labels_list = ", ".join(f'"{label}"' for label in subtask_labels)
        label_instruction = (
            f"You must choose exactly ONE skill from this list: [{labels_list}]. "
            "Do not create new labels. Reply with only that label.\n\n"
        )
    else:
        label_instruction = ""

    # The prompt is assembled line by line instead of dedenting an f-string:
    # `textwrap.dedent` runs after interpolation, and the interpolated text
    # contains newlines followed by unindented text, which would cancel the
    # common margin and leave source indentation inside the prompt.
    prompt_lines = [
        "# Role",
        "You are a Robotics Vision System that labels short clips from robot manipulation demonstrations.",
        "# Task",
        f"{goal_context}{label_instruction}The following images are consecutive frames "
        "from a single short clip of a robot demonstration.",
        "What single atomic manipulation skill is being performed in this clip?",
        "# Requirements",
        '- Reply with ONLY one short skill name (e.g. "pick up object", "move arm left", "release gripper").',
        "- No explanation, no timestamps, no JSON. Just the skill name.",
    ]
    return "\n".join(prompt_lines).strip()
def _run_image_segmenter(
    self,
    images: list[PIL.Image.Image],
    coarse_goal: str | None,
    subtask_labels: list[str] | None = None,
) -> str:
    """Label a single window of frames with the VLM held by ``self``.

    ``self`` is a segmenter object exposing ``processor``, ``model``,
    ``device`` and ``process_vision_info``. The frames are sent as individual
    images followed by a short text question; the first line of the greedy
    decoded answer is returned, or "unknown" when that line is empty.
    """
    system_prompt = create_window_skill_prompt(coarse_goal, subtask_labels)

    # One image entry per frame, then the question that closes the user turn.
    user_content = [{"type": "image", "image": frame} for frame in images]
    user_content.append(
        {
            "type": "text",
            "text": "What single atomic skill is shown in these frames? Reply with only the skill name.",
        }
    )
    conversation = [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": user_content},
    ]

    chat_text = self.processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = self.process_vision_info(conversation)
    model_inputs = self.processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(self.device)

    with torch.no_grad():
        output_ids = self.model.generate(**model_inputs, max_new_tokens=128, do_sample=False)

    # Drop the prompt tokens so only the newly generated continuation is decoded.
    continuations = [
        full_ids[len(prompt_ids) :]
        for prompt_ids, full_ids in zip(model_inputs.input_ids, output_ids)
    ]
    decoded = self.processor.batch_decode(continuations, skip_special_tokens=True)
    answer = decoded[0].strip()

    first_line = answer.split("\n")[0]
    skill_name = first_line.strip().strip('."')
    if not skill_name:
        return "unknown"
    return skill_name
def _run_image_segmenter_batch(
    self,
    batch_images: list[list[PIL.Image.Image]],
    coarse_goal: str | None,
    subtask_labels: list[str] | None = None,
) -> list[str]:
    """Label several windows of frames in a single ``generate`` call.

    ``self`` is a segmenter object exposing ``processor``, ``model``,
    ``device`` and ``process_vision_info``.

    Args:
        batch_images: One list of frames per window.
        coarse_goal: Optional overall task description inserted in the prompt.
        subtask_labels: Optional closed vocabulary the model must choose from.

    Returns:
        One skill name per window, in input order ("unknown" when the first
        line of a response is empty). An empty input yields an empty list.
    """
    if not batch_images:
        return []
    prompt = create_window_skill_prompt(coarse_goal, subtask_labels)
    all_texts = []
    all_image_inputs = []
    all_video_inputs = []
    for images in batch_images:
        content = [{"type": "image", "image": img} for img in images]
        content.append(
            {
                "type": "text",
                "text": "What single atomic skill is shown in these frames? Reply with only the skill name.",
            }
        )
        messages = [
            {"role": "system", "content": [{"type": "text", "text": prompt}]},
            {"role": "user", "content": content},
        ]
        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = self.process_vision_info(messages)
        all_texts.append(text)
        # Vision inputs of all windows are flattened into one list; the
        # processor matches them to the placeholders of each text in order.
        if image_inputs is not None:
            all_image_inputs.extend(image_inputs if isinstance(image_inputs, list) else [image_inputs])
        if video_inputs is not None:
            all_video_inputs.extend(video_inputs if isinstance(video_inputs, list) else [video_inputs])

    # Windows can tokenize to different lengths (e.g. a shorter last window).
    # A decoder-only model continues from the last position of every row, so
    # padding must sit on the left; force it for this call and restore the
    # tokenizer's previous setting afterwards.
    tokenizer = getattr(self.processor, "tokenizer", None)
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if previous_padding_side is not None:
        tokenizer.padding_side = "left"
    try:
        inputs = self.processor(
            text=all_texts,
            images=all_image_inputs if all_image_inputs else None,
            videos=all_video_inputs if all_video_inputs else None,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
    finally:
        if previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    with torch.no_grad():
        generated_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
    # Strip the (padded) prompt from every row before decoding.
    responses = self.processor.batch_decode(
        [out[len(inp) :] for inp, out in zip(inputs.input_ids, generated_ids)],
        skip_special_tokens=True,
    )
    return [(r.split("\n")[0].strip().strip('."') or "unknown") for r in responses]
class Qwen2VLImageSegmenter:
    """Image-window skill labeler backed by a Qwen2-VL checkpoint.

    Loads the model and its processor once at construction and exposes
    single-window and batched labeling entry points.
    """

    def __init__(self, model_name: str, device: str = "cuda", torch_dtype: torch.dtype = torch.bfloat16):
        # Imported here so the module can be imported without these packages.
        from qwen_vl_utils import process_vision_info
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

        self.console = Console()
        self.device = device
        self.process_vision_info = process_vision_info

        self.console.print(f"[cyan]Loading Qwen2-VL for image-window labeling: {model_name}...[/cyan]")
        model_kwargs = {
            "torch_dtype": torch_dtype,
            "device_map": device,
            "trust_remote_code": True,
        }
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, **model_kwargs)
        self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
        self.console.print(f"[green]✓ Model loaded on {device}[/green]")

    def segment_skill_from_images(
        self,
        images: list[PIL.Image.Image],
        coarse_goal: str | None = None,
        subtask_labels: list[str] | None = None,
    ) -> str:
        """Label one window of frames; returns a single skill name."""
        skill_name = _run_image_segmenter(self, images, coarse_goal, subtask_labels)
        return skill_name

    def segment_skill_from_images_batch(
        self,
        batch_images: list[list[PIL.Image.Image]],
        coarse_goal: str | None = None,
        subtask_labels: list[str] | None = None,
    ) -> list[str]:
        """Label several windows in one model call; returns one skill name per window."""
        skill_names = _run_image_segmenter_batch(self, batch_images, coarse_goal, subtask_labels)
        return skill_names
class Qwen3VLImageSegmenter:
    """Image-window skill labeler backed by a Qwen3-VL (MoE) checkpoint.

    Loads the model and its processor once at construction and exposes
    single-window and batched labeling entry points.
    """

    def __init__(self, model_name: str, device: str = "cuda", torch_dtype: torch.dtype = torch.bfloat16):
        # Imported here so the module can be imported without these packages.
        from qwen_vl_utils import process_vision_info
        from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

        self.console = Console()
        self.device = device
        self.process_vision_info = process_vision_info

        self.console.print(f"[cyan]Loading Qwen3-VL for image-window labeling: {model_name}...[/cyan]")
        model_kwargs = {
            "torch_dtype": torch_dtype,
            "device_map": device,
            "trust_remote_code": True,
        }
        self.model = Qwen3VLMoeForConditionalGeneration.from_pretrained(model_name, **model_kwargs)
        self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
        self.console.print(f"[green]✓ Model loaded on {device}[/green]")

    def segment_skill_from_images(
        self,
        images: list[PIL.Image.Image],
        coarse_goal: str | None = None,
        subtask_labels: list[str] | None = None,
    ) -> str:
        """Label one window of frames; returns a single skill name."""
        skill_name = _run_image_segmenter(self, images, coarse_goal, subtask_labels)
        return skill_name

    def segment_skill_from_images_batch(
        self,
        batch_images: list[list[PIL.Image.Image]],
        coarse_goal: str | None = None,
        subtask_labels: list[str] | None = None,
    ) -> list[str]:
        """Label several windows in one model call; returns one skill name per window."""
        skill_names = _run_image_segmenter_batch(self, batch_images, coarse_goal, subtask_labels)
        return skill_names
def get_image_segmenter(
    model_name: str,
    device: str = "cuda",
    torch_dtype: torch.dtype = torch.bfloat16,
):
    """Build the image-window segmenter matching ``model_name``.

    Any name containing "qwen3" (case-insensitive) is loaded with
    ``Qwen3VLImageSegmenter``; every other name falls back to
    ``Qwen2VLImageSegmenter``.
    """
    is_qwen3 = "qwen3" in model_name.lower()
    segmenter_cls = Qwen3VLImageSegmenter if is_qwen3 else Qwen2VLImageSegmenter
    return segmenter_cls(model_name, device, torch_dtype)
def frame_to_pil(frame_value) -> PIL.Image.Image:
    """Convert one dataset frame to a ``PIL.Image``.

    Args:
        frame_value: A PIL image (returned unchanged), a ``str``/``Path`` to an
            image file (opened and converted to RGB), an object with a
            ``.numpy()`` method, or anything ``np.asarray`` accepts.

    Returns:
        The frame as a PIL image. Array inputs are converted to ``uint8``;
        floating-point arrays of any precision are treated as [0, 1] images and
        every other dtype as [0, 255].
    """
    if isinstance(frame_value, PIL.Image.Image):
        return frame_value
    if isinstance(frame_value, (str, Path)):
        return PIL.Image.open(frame_value).convert("RGB")
    arr = frame_value.numpy() if hasattr(frame_value, "numpy") else np.asarray(frame_value)
    # A leading axis of size 1, 3 or 4 is taken to be channels-first (C, H, W).
    if arr.ndim == 3 and arr.shape[0] in (1, 3, 4):
        arr = np.transpose(arr, (1, 2, 0))
    # `issubdtype` also covers float16, which would otherwise fall into the
    # integer branch below and be clipped to an almost black image.
    if np.issubdtype(arr.dtype, np.floating):
        arr = (np.clip(arr, 0, 1) * 255).astype(np.uint8)
    elif arr.dtype != np.uint8:
        arr = np.clip(arr, 0, 255).astype(np.uint8)
    # Expand a single channel to three so the result is an RGB-shaped array.
    if arr.shape[-1] == 1:
        arr = np.repeat(arr, 3, axis=-1)
    return PIL.Image.fromarray(arr)
def _sample_window_indices(window_length: int, max_frames: int) -> list[int]:
    """Pick which frame offsets of a window are sent to the model.

    Every offset ``0..window_length-1`` is returned when ``max_frames`` is not
    positive or the window already fits. Otherwise ``max_frames`` distinct
    offsets are drawn at random and returned in ascending (temporal) order.
    """
    needs_subsampling = max_frames > 0 and window_length > max_frames
    if not needs_subsampling:
        return [*range(window_length)]
    chosen = random.sample(range(window_length), max_frames)
    chosen.sort()
    return chosen
class SkillAnnotatorImage:
    """Annotates episodes by sliding a window over frames and labeling each window with the VLM.

    Each window yields one ``Skill`` whose ``start``/``end`` are timestamps in
    seconds derived from the frames' ``timestamp`` values and the dataset fps.
    """

    def __init__(
        self,
        segmenter: Qwen2VLImageSegmenter | Qwen3VLImageSegmenter,
        window_size: int = 8,
        stride: int | None = None,
        batch_size: int = 1,
        max_frames_per_window: int | None = None,
        console: Console | None = None,
    ):
        """Configure the annotator.

        Args:
            segmenter: Object providing ``segment_skill_from_images`` and
                ``segment_skill_from_images_batch``.
            window_size: Number of consecutive frames per window.
            stride: Frames between window starts; defaults to ``window_size``
                (non-overlapping windows).
            batch_size: Number of windows labeled per model call (minimum 1).
            max_frames_per_window: If set and smaller than the window, only that
                many randomly chosen frames of each window are sent to the model.
            console: Rich console used for progress output; a new one is created
                when omitted.

        Raises:
            ValueError: If ``window_size`` or ``stride`` is not positive (the
                sliding loop would otherwise never advance).
        """
        if window_size <= 0:
            raise ValueError(f"window_size must be a positive integer, got {window_size}")
        if stride is not None and stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")
        self.segmenter = segmenter
        self.window_size = window_size
        self.stride = stride if stride is not None else window_size
        self.batch_size = max(1, batch_size)
        self.max_frames_per_window = max_frames_per_window
        self.console = console or Console()

    def annotate_dataset(
        self,
        dataset: LeRobotDataset,
        camera_key: str,
        episodes: list[int] | None = None,
        skip_existing: bool = False,
        subtask_labels: list[str] | None = None,
    ) -> dict[int, EpisodeSkills]:
        """Annotate episodes using image windows.

        Args:
            dataset: Dataset to annotate.
            camera_key: Observation key to read frames from (image or video key).
            episodes: Episode indices to process; ``None`` or an empty list means
                all episodes.
            skip_existing: Skip episodes that already have a non-empty skill list
                in the annotations stored under ``dataset.root``.
            subtask_labels: Optional closed vocabulary forwarded to the segmenter.

        Returns:
            Mapping from episode index to its ``EpisodeSkills``. Episodes that
            fail or produce no skills are reported on the console and omitted.
        """
        episode_indices = episodes or list(range(dataset.meta.total_episodes))
        coarse_goal = self._get_coarse_goal(dataset)
        annotations: dict[int, EpisodeSkills] = {}
        if skip_existing:
            existing = load_skill_annotations(dataset.root)
            if existing and existing.get("episodes"):
                existing_eps = {
                    int(k) for k in existing["episodes"] if existing["episodes"][k].get("skills")
                }
                episode_indices = [i for i in episode_indices if i not in existing_eps]
        for ep_idx in episode_indices:
            # Best effort per episode: one failing episode must not abort the run.
            try:
                skills = self._annotate_episode(dataset, ep_idx, camera_key, coarse_goal, subtask_labels)
                if skills:
                    annotations[ep_idx] = EpisodeSkills(
                        episode_index=ep_idx,
                        description=coarse_goal,
                        skills=skills,
                    )
                    self.console.print(f"[green]✓ Episode {ep_idx}: {len(skills)} window skills[/green]")
                else:
                    self.console.print(f"[yellow]⚠ Episode {ep_idx}: no skills[/yellow]")
            except Exception as e:
                self.console.print(f"[red]Episode {ep_idx} failed: {e}[/red]")
        return annotations

    def _get_coarse_goal(self, dataset: LeRobotDataset) -> str:
        """Return the first entry of the dataset's task index, or a generic fallback goal."""
        if dataset.meta.tasks is not None and len(dataset.meta.tasks) > 0:
            return str(dataset.meta.tasks.index[0])
        return "Perform the demonstrated manipulation task."

    def _load_window(
        self,
        dataset: LeRobotDataset,
        camera_key: str,
        first_frame: int,
        window_length: int,
        fps: float,
        is_partial: bool,
    ) -> tuple[list[PIL.Image.Image], float, float]:
        """Load the (possibly sub-sampled) frames of one window and compute its time span.

        ``first_frame`` is the dataset index of the window's first frame. Each
        selected frame is read from the dataset exactly once.
        """
        offsets = _sample_window_indices(window_length, self.max_frames_per_window or window_length)
        items = [dataset[first_frame + offset] for offset in offsets]
        images = [frame_to_pil(item[camera_key]) for item in items]
        first_ts = float(items[0]["timestamp"].item())
        last_ts = float(items[-1]["timestamp"].item())
        # With sub-sampling the first/last loaded frames are not necessarily the
        # window boundaries, so extrapolate to the boundaries using the fps.
        t_start = first_ts - offsets[0] / fps
        if is_partial:
            t_end = last_ts + (window_length - offsets[-1]) / fps
        else:
            t_end = t_start + window_length / fps
        return images, t_start, t_end

    def _iter_windows(
        self,
        dataset: LeRobotDataset,
        ep_from: int,
        length: int,
        camera_key: str,
        fps: float,
    ):
        """Yield ``(images, t_start, t_end)`` for every full window, then the trailing partial one."""
        start = 0
        while start + self.window_size <= length:
            yield self._load_window(dataset, camera_key, ep_from + start, self.window_size, fps, False)
            start += self.stride
        # Frames left after the last full window form one shorter window.
        if start < length:
            yield self._load_window(dataset, camera_key, ep_from + start, length - start, fps, True)

    def _label_windows(
        self,
        windows: list[tuple[list[PIL.Image.Image], float, float]],
        coarse_goal: str,
        subtask_labels: list[str] | None,
    ) -> list[Skill]:
        """Run the segmenter on one batch of windows and wrap the names into ``Skill`` objects."""
        batch_images = [images for images, _, _ in windows]
        if len(batch_images) > 1:
            skill_names = self.segmenter.segment_skill_from_images_batch(
                batch_images, coarse_goal, subtask_labels
            )
        else:
            skill_names = [
                self.segmenter.segment_skill_from_images(batch_images[0], coarse_goal, subtask_labels)
            ]
        return [
            Skill(name=name, start=t_start, end=t_end)
            for (_, t_start, t_end), name in zip(windows, skill_names, strict=True)
        ]

    def _annotate_episode(
        self,
        dataset: LeRobotDataset,
        episode_index: int,
        camera_key: str,
        coarse_goal: str,
        subtask_labels: list[str] | None = None,
    ) -> list[Skill]:
        """Label every window of one episode and return the skills in temporal order.

        Windows are loaded lazily and labeled ``batch_size`` at a time, so only
        one batch of images is held in memory at once.
        """
        ep = dataset.meta.episodes[episode_index]
        ep_from = int(ep["dataset_from_index"])
        ep_to = int(ep["dataset_to_index"])
        length = ep_to - ep_from
        fps = dataset.meta.fps
        if length == 0:
            return []

        skills: list[Skill] = []
        pending: list[tuple[list[PIL.Image.Image], float, float]] = []
        for window in self._iter_windows(dataset, ep_from, length, camera_key, fps):
            pending.append(window)
            if len(pending) == self.batch_size:
                skills.extend(self._label_windows(pending, coarse_goal, subtask_labels))
                pending = []
        if pending:
            skills.extend(self._label_windows(pending, coarse_goal, subtask_labels))
        return skills
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the image-window annotation script."""
    parser = argparse.ArgumentParser(
        description="Image-window subtask annotation using Qwen VLM (frames as images for better accuracy)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""\
            Examples:
            python -m lerobot.data_processing.annotations.subtask_annotate_image \\
            --data-dir /path/to/dataset --camera-key observation.images.base \\
            --window-size 8 --output-dir ./output
            python -m lerobot.data_processing.annotations.subtask_annotate_image \\
            --repo-id user/dataset --camera-key observation.images.base \\
            --window-size 6 --stride 3 --model Qwen/Qwen2-VL-7B-Instruct
            # Use Qwen3-VL (MoE)
            python -m lerobot.data_processing.annotations.subtask_annotate_image \\
            --data-dir /path/to/dataset --camera-key observation.images.base \\
            --model Qwen/Qwen3-VL-30B-A3B-Instruct
            """),
    )
    # Exactly one dataset source must be given.
    data_group = parser.add_mutually_exclusive_group(required=True)
    data_group.add_argument("--data-dir", type=str, help="Path to local LeRobot dataset")
    data_group.add_argument("--repo-id", type=str, help="HuggingFace Hub dataset repository ID")
    parser.add_argument(
        "--camera-key",
        type=str,
        required=True,
        help="Image or video observation key (e.g. observation.images.base)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen2-VL-7B-Instruct",
        help="VLM model: Qwen2-VL or Qwen3-VL (default: Qwen/Qwen2-VL-7B-Instruct)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
    )
    parser.add_argument(
        "--window-size",
        type=int,
        default=8,
        help="Number of frames per window (default: 8)",
    )
    parser.add_argument(
        "--stride",
        type=int,
        default=None,
        help="Stride for sliding window (default: window_size = non-overlapping)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        help="Number of windows to process in one VLM call (default: 1; increase for speed)",
    )
    parser.add_argument(
        "--max-frames-per-window",
        type=int,
        default=None,
        metavar="N",
        help="If window has more than N frames, randomly sample N frames (order kept) to avoid OOM (e.g. 16)",
    )
    parser.add_argument("--episodes", type=int, nargs="+", help="Episode indices to annotate (default: all)")
    parser.add_argument(
        "--skip-existing", action="store_true", help="Skip episodes that already have annotations"
    )
    parser.add_argument(
        "--subtask-labels",
        type=str,
        nargs="*",
        default=None,
        help="Closed vocabulary: model must choose only from these labels",
    )
    parser.add_argument("--output-dir", type=str, help="Output directory for dataset with subtask_index")
    parser.add_argument("--output-repo-id", type=str, help="Output repo id (default: <repo_id>_with_subtasks)")
    parser.add_argument("--push-to-hub", action="store_true")
    return parser


def main():
    """Command-line entry point: load a dataset, label its frame windows, save the result.

    Steps: parse arguments, load the dataset (local directory or Hub repo),
    check the camera key, run ``SkillAnnotatorImage`` over the selected
    episodes, write the annotated dataset with ``save_skill_annotations`` and
    optionally push it to the Hub. Returns early (without raising) when the
    camera key is unknown or no episode produced annotations.
    """
    args = _build_arg_parser().parse_args()
    console = Console()

    # Load dataset
    console.print("[cyan]Loading dataset...[/cyan]")
    if args.data_dir:
        dataset = LeRobotDataset(repo_id="local/dataset", root=args.data_dir, download_videos=False)
    else:
        dataset = LeRobotDataset(repo_id=args.repo_id, download_videos=True)
    camera_keys = dataset.meta.camera_keys
    if args.camera_key not in camera_keys:
        console.print(f"[red]Error: camera key '{args.camera_key}' not in {camera_keys}[/red]")
        return
    console.print(f"[green]✓ Loaded dataset, {dataset.meta.total_episodes} episodes[/green]")

    # Same Qwen VLM as subtask_annotate (Qwen2-VL or Qwen3-VL), image windows instead of video
    segmenter = get_image_segmenter(args.model, args.device, torch.bfloat16)
    annotator = SkillAnnotatorImage(
        segmenter=segmenter,
        window_size=args.window_size,
        stride=args.stride,
        batch_size=args.batch_size,
        max_frames_per_window=args.max_frames_per_window,
        console=console,
    )
    annotations = annotator.annotate_dataset(
        dataset=dataset,
        camera_key=args.camera_key,
        episodes=args.episodes,
        skip_existing=args.skip_existing,
        subtask_labels=args.subtask_labels,
    )
    if not annotations:
        console.print("[yellow]No annotations to save.[/yellow]")
        return

    output_dir = Path(args.output_dir) if args.output_dir else None
    output_repo_id = args.output_repo_id
    new_dataset = save_skill_annotations(dataset, annotations, output_dir, output_repo_id)
    total_skills = sum(len(a.skills) for a in annotations.values())
    console.print(
        f"[bold green]✓ Done.[/bold green] Episodes: {len(annotations)}, total window skills: {total_skills}"
    )
    console.print(f" Dataset with subtask_index: {new_dataset.root}")

    if args.push_to_hub and args.data_dir:
        # Pushing is only performed for Hub-sourced datasets; say so instead of
        # silently dropping the flag.
        console.print("[yellow]--push-to-hub is ignored when --data-dir is used.[/yellow]")
    if args.push_to_hub and not args.data_dir:
        console.print("[cyan]Pushing to Hub...[/cyan]")
        # A failed upload is reported but does not discard the local result.
        try:
            new_dataset.push_to_hub(push_videos=False)
            console.print("[green]✓ Pushed.[/green]")
        except Exception as e:
            console.print(f"[red]Push failed: {e}[/red]")


if __name__ == "__main__":
    main()
+1 -3
View File
@@ -289,9 +289,7 @@ def aggregate_datasets(
logging.info("Find all tasks")
unique_tasks = pd.concat([m.tasks for m in all_metadata]).index.unique()
dst_meta.tasks = pd.DataFrame(
{"task_index": range(len(unique_tasks))}, index=pd.Index(unique_tasks, name="task")
)
dst_meta.tasks = pd.DataFrame({"task_index": range(len(unique_tasks))}, index=unique_tasks)
meta_idx = {"chunk": 0, "file": 0}
data_idx = {"chunk": 0, "file": 0}
-7
View File
@@ -7,13 +7,6 @@
This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).
{% if repo_id is defined and repo_id %}
<a class="flex" href="https://huggingface.co/spaces/lerobot/visualize_dataset?path={{ repo_id }}">
<img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface/badges/resolve/main/visualize-this-dataset-xl.svg"/>
<img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface/badges/resolve/main/visualize-this-dataset-xl-dark.svg"/>
</a>
{% endif %}
## Dataset Description
{{ dataset_description | default("", true) }}
+33 -41
View File
@@ -89,8 +89,8 @@ def delete_episodes(
Args:
dataset: The source LeRobotDataset.
episode_indices: List of episode indices to delete.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
"""
if not episode_indices:
raise ValueError("No episodes to delete")
@@ -152,7 +152,7 @@ def split_dataset(
dataset: The source LeRobotDataset to split.
splits: Either a dict mapping split names to episode indices, or a dict mapping
split names to fractions (must sum to <= 1.0).
output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
output_dir: Base directory for output datasets. If None, uses default location.
Examples:
Split by specific episodes
@@ -243,8 +243,8 @@ def merge_datasets(
Args:
datasets: List of LeRobotDatasets to merge.
output_repo_id: Merged dataset identifier.
output_dir: Root directory where the merged dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/output_repo_id.
output_repo_id: Repository ID for the merged dataset.
output_dir: Directory to save the merged dataset. If None, uses default location.
"""
if not datasets:
raise ValueError("No datasets to merge")
@@ -288,8 +288,8 @@ def modify_features(
dataset: The source LeRobotDataset.
add_features: Optional dict mapping feature names to (feature_values, feature_info) tuples.
remove_features: Optional feature name(s) to remove. Can be a single string or list.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
Returns:
New dataset with features modified.
@@ -390,8 +390,8 @@ def add_features(
Args:
dataset: The source LeRobotDataset.
features: Dictionary mapping feature names to (feature_values, feature_info) tuples.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
Returns:
New dataset with all features added.
@@ -427,8 +427,8 @@ def remove_feature(
Args:
dataset: The source LeRobotDataset.
feature_names: Name(s) of features to remove. Can be a single string or list.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
output_dir: Directory to save the new dataset. If None, uses default location.
repo_id: Repository ID for the new dataset. If None, appends "_modified" to original.
Returns:
New dataset with features removed.
@@ -567,22 +567,20 @@ def _copy_and_reindex_data(
def _keep_episodes_from_video_with_av(
input_path: Path,
output_path: Path,
episodes_to_keep: list[tuple[int, int]],
episodes_to_keep: list[tuple[float, float]],
fps: float,
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
) -> None:
"""Keep only specified episodes from a video file using PyAV.
This function decodes frames from specified frame ranges and re-encodes them with
This function decodes frames from specified time ranges and re-encodes them with
properly reset timestamps to ensure monotonic progression.
Args:
input_path: Source video file path.
output_path: Destination video file path.
episodes_to_keep: List of (start_frame, end_frame) tuples for episodes to keep.
Ranges are half-open intervals: [start_frame, end_frame), where start_frame
is inclusive and end_frame is exclusive.
episodes_to_keep: List of (start_time, end_time) tuples for episodes to keep.
fps: Frame rate of the video.
vcodec: Video codec to use for encoding.
pix_fmt: Pixel format for output video.
@@ -624,10 +622,9 @@ def _keep_episodes_from_video_with_av(
# Create set of (start, end) ranges for fast lookup.
# Convert to a sorted list for efficient checking.
frame_ranges = sorted(episodes_to_keep)
time_ranges = sorted(episodes_to_keep)
# Track frame index for setting PTS and current range being processed.
src_frame_count = 0
frame_count = 0
range_idx = 0
@@ -637,20 +634,21 @@ def _keep_episodes_from_video_with_av(
if frame is None:
continue
# Check if frame is in any of our desired frame ranges.
# Get frame timestamp.
frame_time = float(frame.pts * frame.time_base) if frame.pts is not None else 0.0
# Check if frame is in any of our desired time ranges.
# Skip ranges that have already passed.
while range_idx < len(frame_ranges) and src_frame_count >= frame_ranges[range_idx][1]:
while range_idx < len(time_ranges) and frame_time >= time_ranges[range_idx][1]:
range_idx += 1
# If we've passed all ranges, stop processing.
if range_idx >= len(frame_ranges):
if range_idx >= len(time_ranges):
break
# Check if frame is in current range.
start_frame = frame_ranges[range_idx][0]
if src_frame_count < start_frame:
src_frame_count += 1
start_ts, end_ts = time_ranges[range_idx]
if frame_time < start_ts:
continue
# Frame is in range - create a new frame with reset timestamps.
@@ -663,7 +661,6 @@ def _keep_episodes_from_video_with_av(
for pkt in v_out.encode(new_frame):
out.mux(pkt)
src_frame_count += 1
frame_count += 1
# Flush encoder.
@@ -752,17 +749,15 @@ def _copy_and_reindex_videos(
f"videos/{video_key}/to_timestamp"
]
else:
# Build list of frame ranges to keep, in sorted order.
# Build list of time ranges to keep, in sorted order.
sorted_keep_episodes = sorted(episodes_in_file, key=lambda x: episode_mapping[x])
episodes_to_keep_ranges: list[tuple[int, int]] = []
episodes_to_keep_ranges: list[tuple[float, float]] = []
for old_idx in sorted_keep_episodes:
src_ep = src_dataset.meta.episodes[old_idx]
from_frame = round(src_ep[f"videos/{video_key}/from_timestamp"] * src_dataset.meta.fps)
to_frame = round(src_ep[f"videos/{video_key}/to_timestamp"] * src_dataset.meta.fps)
assert src_ep["length"] == to_frame - from_frame, (
f"Episode length mismatch: {src_ep['length']} vs {to_frame - from_frame}"
)
episodes_to_keep_ranges.append((from_frame, to_frame))
from_ts = src_ep[f"videos/{video_key}/from_timestamp"]
to_ts = src_ep[f"videos/{video_key}/to_timestamp"]
episodes_to_keep_ranges.append((from_ts, to_ts))
# Use PyAV filters to efficiently re-encode only the desired segments.
assert src_dataset.meta.video_path is not None
@@ -1475,9 +1470,7 @@ def modify_tasks(
# Collect all unique tasks and create new task mapping
unique_tasks = sorted(set(episode_to_task.values()))
new_task_df = pd.DataFrame(
{"task_index": list(range(len(unique_tasks)))}, index=pd.Index(unique_tasks, name="task")
)
new_task_df = pd.DataFrame({"task_index": list(range(len(unique_tasks)))}, index=unique_tasks)
task_to_index = {task: idx for idx, task in enumerate(unique_tasks)}
logging.info(f"Modifying tasks in {dataset.repo_id}")
@@ -1531,7 +1524,7 @@ def modify_tasks(
def convert_image_to_video_dataset(
dataset: LeRobotDataset,
output_dir: Path | None = None,
output_dir: Path,
repo_id: str | None = None,
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
@@ -1550,8 +1543,8 @@ def convert_image_to_video_dataset(
Args:
dataset: The source LeRobot dataset with images
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
output_dir: Directory to save the new video dataset
repo_id: Repository ID for the new dataset (default: original_id + "_video")
vcodec: Video codec (default: libsvtav1)
pix_fmt: Pixel format (default: yuv420p)
g: Group of pictures size (default: 2)
@@ -1602,7 +1595,6 @@ def convert_image_to_video_dataset(
# Video info will be updated after episodes are encoded
# Create new metadata for video dataset
output_dir = Path(output_dir) if output_dir is not None else HF_LEROBOT_HOME / repo_id
new_meta = LeRobotDatasetMetadata.create(
repo_id=repo_id,
fps=dataset.meta.fps,
+50 -66
View File
@@ -18,14 +18,13 @@ from pprint import pformat
import torch
from lerobot.configs.default import DatasetConfig, MultiDatasetConfig
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.train import TrainPipelineConfig
from lerobot.datasets.lerobot_dataset import (
LeRobotDataset,
LeRobotDatasetMetadata,
MultiLeRobotDataset,
)
from lerobot.datasets.multi_dataset import NewMultiLeRobotDataset
from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset
from lerobot.datasets.transforms import ImageTransforms
from lerobot.utils.constants import ACTION, OBS_PREFIX, REWARD
@@ -69,81 +68,66 @@ def resolve_delta_timestamps(
return delta_timestamps
def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | NewMultiLeRobotDataset:
"""Create a single or multi-dataset depending on the config type.
def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDataset:
"""Handles the logic of setting up delta timestamps and image transforms before creating a dataset.
Args:
cfg (TrainPipelineConfig): A TrainPipelineConfig config which contains a DatasetConfig and a PreTrainedConfig.
Raises:
NotImplementedError: The MultiLeRobotDataset is currently deactivated.
Returns:
LeRobotDataset | NewMultiLeRobotDataset
LeRobotDataset | MultiLeRobotDataset
"""
if isinstance(cfg.dataset, MultiDatasetConfig):
return _make_multi_dataset(cfg)
return _make_single_dataset(cfg)
def _make_single_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset:
ds_cfg: DatasetConfig = cfg.dataset # type: ignore[assignment]
image_transforms = (
ImageTransforms(ds_cfg.image_transforms) if ds_cfg.image_transforms.enable else None
ImageTransforms(cfg.dataset.image_transforms) if cfg.dataset.image_transforms.enable else None
)
ds_meta = LeRobotDatasetMetadata(ds_cfg.repo_id, root=ds_cfg.root, revision=ds_cfg.revision)
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
if not ds_cfg.streaming:
dataset = LeRobotDataset(
ds_cfg.repo_id,
root=ds_cfg.root,
episodes=ds_cfg.episodes,
delta_timestamps=delta_timestamps,
image_transforms=image_transforms,
revision=ds_cfg.revision,
video_backend=ds_cfg.video_backend,
tolerance_s=cfg.tolerance_s,
if isinstance(cfg.dataset.repo_id, str):
ds_meta = LeRobotDatasetMetadata(
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
)
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
if not cfg.dataset.streaming:
dataset = LeRobotDataset(
cfg.dataset.repo_id,
root=cfg.dataset.root,
episodes=cfg.dataset.episodes,
delta_timestamps=delta_timestamps,
image_transforms=image_transforms,
revision=cfg.dataset.revision,
video_backend=cfg.dataset.video_backend,
tolerance_s=cfg.tolerance_s,
)
else:
dataset = StreamingLeRobotDataset(
cfg.dataset.repo_id,
root=cfg.dataset.root,
episodes=cfg.dataset.episodes,
delta_timestamps=delta_timestamps,
image_transforms=image_transforms,
revision=cfg.dataset.revision,
max_num_shards=cfg.num_workers,
tolerance_s=cfg.tolerance_s,
)
else:
dataset = StreamingLeRobotDataset(
ds_cfg.repo_id,
root=ds_cfg.root,
episodes=ds_cfg.episodes,
delta_timestamps=delta_timestamps,
raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
dataset = MultiLeRobotDataset(
cfg.dataset.repo_id,
# TODO(aliberts): add proper support for multi dataset
# delta_timestamps=delta_timestamps,
image_transforms=image_transforms,
revision=ds_cfg.revision,
max_num_shards=cfg.num_workers,
tolerance_s=cfg.tolerance_s,
video_backend=cfg.dataset.video_backend,
)
logging.info(
"Multiple datasets were provided. Applied the following index mapping to the provided datasets: "
f"{pformat(dataset.repo_id_to_index, indent=2)}"
)
if ds_cfg.use_imagenet_stats:
if cfg.dataset.use_imagenet_stats:
for key in dataset.meta.camera_keys:
for stats_type, stats_val in IMAGENET_STATS.items():
dataset.meta.stats[key][stats_type] = torch.tensor(stats_val, dtype=torch.float32)
return dataset
def _make_multi_dataset(cfg: TrainPipelineConfig) -> NewMultiLeRobotDataset:
multi_cfg: MultiDatasetConfig = cfg.dataset # type: ignore[assignment]
image_transforms = (
ImageTransforms(multi_cfg.image_transforms) if multi_cfg.image_transforms.enable else None
)
dataset = NewMultiLeRobotDataset(
configs=multi_cfg.datasets,
image_transforms=image_transforms,
tolerance_s=cfg.tolerance_s,
)
logging.info(
"MultiLeRobotDataset created with %d sub-datasets:\n%s",
len(multi_cfg.datasets),
pformat(
{i: c.repo_id for i, c in enumerate(multi_cfg.datasets)},
indent=2,
),
)
if multi_cfg.use_imagenet_stats:
for key in dataset.meta.camera_keys:
for stats_type, stats_val in IMAGENET_STATS.items():
dataset.meta.stats[key][stats_type] = torch.tensor(stats_val, dtype=torch.float32)
for stats_type, stats in IMAGENET_STATS.items():
dataset.meta.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32)
return dataset
+51 -127
View File
@@ -59,6 +59,7 @@ from lerobot.datasets.utils import (
load_stats,
load_subtasks,
load_tasks,
load_tasks_high_level,
update_chunk_file_indices,
validate_episode_buffer,
validate_frame,
@@ -68,7 +69,6 @@ from lerobot.datasets.utils import (
write_tasks,
)
from lerobot.datasets.video_utils import (
StreamingVideoEncoder,
VideoFrame,
concatenate_video_files,
decode_video_frames,
@@ -76,11 +76,11 @@ from lerobot.datasets.video_utils import (
get_safe_default_codec,
get_video_duration_in_s,
get_video_info,
resolve_vcodec,
)
from lerobot.utils.constants import HF_LEROBOT_HOME
CODEBASE_VERSION = "v3.0"
VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1"}
class LeRobotDatasetMetadata:
@@ -164,6 +164,7 @@ class LeRobotDatasetMetadata:
self.info = load_info(self.root)
check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
self.tasks = load_tasks(self.root)
self.tasks_high_level = load_tasks_high_level(self.root)
self.subtasks = load_subtasks(self.root)
self.episodes = load_episodes(self.root)
self.stats = load_stats(self.root)
@@ -314,7 +315,7 @@ class LeRobotDatasetMetadata:
if self.tasks is None:
new_tasks = tasks
task_indices = range(len(tasks))
self.tasks = pd.DataFrame({"task_index": task_indices}, index=pd.Index(tasks, name="task"))
self.tasks = pd.DataFrame({"task_index": task_indices}, index=tasks)
else:
new_tasks = [task for task in tasks if task not in self.tasks.index]
new_task_indices = range(len(self.tasks), len(self.tasks) + len(new_tasks))
@@ -521,6 +522,7 @@ class LeRobotDatasetMetadata:
_validate_feature_names(features)
obj.tasks = None
obj.tasks_high_level = None
obj.subtasks = None
obj.episodes = None
obj.stats = None
@@ -546,19 +548,12 @@ class LeRobotDatasetMetadata:
def _encode_video_worker(
video_key: str,
episode_index: int,
root: Path,
fps: int,
vcodec: str = "libsvtav1",
encoder_threads: int | None = None,
video_key: str, episode_index: int, root: Path, fps: int, vcodec: str = "libsvtav1"
) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
img_dir = (root / fpath).parent
encode_video_frames(
img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads
)
encode_video_frames(img_dir, temp_path, fps, vcodec=vcodec, overwrite=True)
shutil.rmtree(img_dir)
return temp_path
@@ -578,9 +573,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None,
batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
):
"""
2 modes are available for instantiating this class, depending on 2 different use cases:
@@ -664,11 +656,11 @@ class LeRobotDataset(torch.utils.data.Dataset):
for the README).
Args:
repo_id (str): This is the repo id that will be used to fetch the dataset.
root (Path | None, optional): Local directory where the dataset will be downloaded and
stored. If set, all dataset files will be stored directly under this path. If not set, the
dataset files will be stored under $HF_LEROBOT_HOME/repo_id (configurable via the
HF_LEROBOT_HOME environment variable).
repo_id (str): This is the repo id that will be used to fetch the dataset. Locally, the dataset
will be stored under root/repo_id.
root (Path | None, optional): Local directory to use for downloading/writing files. You can also
set the LEROBOT_HOME environment variable to point to a different location. Defaults to
'~/.cache/huggingface/lerobot'.
episodes (list[int] | None, optional): If specified, this will only load episodes specified by
their episode_index in this list. Defaults to None.
image_transforms (Callable | None, optional): You can pass standard v2 image transforms from
@@ -694,17 +686,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'.
Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder.
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False.
encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using
streaming encoding. Defaults to 30 (~1s at 30fps).
encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the
codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for
libsvtav1 and 'threads' for h264/hevc.
'libsvtav1'. Defaults to 'libsvtav1'. Use 'h264' for faster encoding on systems where AV1
encoding is CPU-heavy.
"""
super().__init__()
if vcodec not in VALID_VIDEO_CODECS:
raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
self.repo_id = repo_id
self.root = Path(root) if root else HF_LEROBOT_HOME / repo_id
self.image_transforms = image_transforms
@@ -716,8 +703,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.delta_indices = None
self.batch_encoding_size = batch_encoding_size
self.episodes_since_last_encoding = 0
self.vcodec = resolve_vcodec(vcodec)
self._encoder_threads = encoder_threads
self.vcodec = vcodec
# Unused attributes
self.image_writer = None
@@ -725,7 +711,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.writer = None
self.latest_episode = None
self._current_file_start_frame = None # Track the starting frame index of the current parquet file
self._streaming_encoder = None
self.root.mkdir(exist_ok=True, parents=True)
@@ -747,7 +732,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
# Check if cached dataset contains all requested episodes
if not self._check_cached_episodes_sufficient():
raise FileNotFoundError("Cached dataset doesn't contain all requested episodes")
except (FileNotFoundError, NotADirectoryError):
except (AssertionError, FileNotFoundError, NotADirectoryError):
if is_valid_version(self.revision):
self.revision = get_safe_version(self.repo_id, self.revision)
self.download(download_videos)
@@ -767,19 +752,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
check_delta_timestamps(self.delta_timestamps, self.fps, self.tolerance_s)
self.delta_indices = get_delta_indices(self.delta_timestamps, self.fps)
# Initialize streaming encoder for resumed recording
if streaming_encoding and len(self.meta.video_keys) > 0:
self._streaming_encoder = StreamingVideoEncoder(
fps=self.meta.fps,
vcodec=self.vcodec,
pix_fmt="yuv420p",
g=2,
crf=30,
preset=None,
queue_maxsize=encoder_queue_maxsize,
encoder_threads=encoder_threads,
)
def _close_writer(self) -> None:
"""Close and cleanup the parquet writer if it exists."""
writer = getattr(self, "writer", None)
@@ -839,7 +811,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
hub_api.upload_folder(**upload_kwargs)
card = create_lerobot_dataset_card(
tags=tags, dataset_info=self.meta.info, license=license, repo_id=self.repo_id, **card_kwargs
tags=tags, dataset_info=self.meta.info, license=license, **card_kwargs
)
card.push_to_hub(repo_id=self.repo_id, repo_type="dataset", revision=branch)
@@ -1098,7 +1070,17 @@ class LeRobotDataset(torch.utils.data.Dataset):
if len(self.meta.video_keys) > 0:
current_ts = item["timestamp"].item()
query_timestamps = self._get_query_timestamps(current_ts, query_indices)
video_frames = self._query_videos(query_timestamps, ep_idx)
try:
video_frames = self._query_videos(query_timestamps, ep_idx)
except Exception as e:
print("\n" + "=" * 120)
print("[VIDEO DECODE FAILURE]")
print(f"item={item}")
print(f"query_indices={query_indices}")
print(f"query_timestamps={query_timestamps}")
print(f"ep_idx={ep_idx}")
print("=" * 120 + "\n")
raise
item = {**video_frames, **item}
if self.image_transforms is not None:
@@ -1109,6 +1091,14 @@ class LeRobotDataset(torch.utils.data.Dataset):
# Add task as a string
task_idx = item["task_index"].item()
item["task"] = self.meta.tasks.iloc[task_idx].name
# optionally add high level task index
if "task_index_high_level" in self.features:
high_level_task_idx = item["task_index_high_level"].item()
item["robot_utterance"] = self.meta.tasks_high_level.iloc[high_level_task_idx]["robot_utterance"]
item["user_prompt"] = self.meta.tasks_high_level.iloc[high_level_task_idx]["user_prompt"]
# add subtask information if available
if "subtask_index" in self.features and self.meta.subtasks is not None:
@@ -1135,8 +1125,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
"""
self._close_writer()
self.meta._close_writer()
if self._streaming_encoder is not None:
self._streaming_encoder.close()
def create_episode_buffer(self, episode_index: int | None = None) -> dict:
current_ep_idx = self.meta.total_episodes if episode_index is None else episode_index
@@ -1191,13 +1179,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.episode_buffer["timestamp"].append(timestamp)
self.episode_buffer["task"].append(frame.pop("task")) # Remove task from frame after processing
# Start streaming encoder on first frame of episode (once, before iterating keys)
if frame_index == 0 and self._streaming_encoder is not None:
self._streaming_encoder.start_episode(
video_keys=list(self.meta.video_keys),
temp_dir=self.root,
)
# Add frame features to episode_buffer
for key in frame:
if key not in self.features:
@@ -1205,10 +1186,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
f"An element of the frame is not in the features. '{key}' not in '{self.features.keys()}'."
)
if self.features[key]["dtype"] == "video" and self._streaming_encoder is not None:
self._streaming_encoder.feed_frame(key, frame[key])
self.episode_buffer[key].append(None) # Placeholder (video keys are skipped in parquet)
elif self.features[key]["dtype"] in ["image", "video"]:
if self.features[key]["dtype"] in ["image", "video"]:
img_path = self._get_image_file_path(
episode_index=self.episode_buffer["episode_index"], image_key=key, frame_index=frame_index
)
@@ -1269,38 +1247,13 @@ class LeRobotDataset(torch.utils.data.Dataset):
# Wait for image writer to end, so that episode stats over images can be computed
self._wait_image_writer()
has_video_keys = len(self.meta.video_keys) > 0
use_streaming = self._streaming_encoder is not None and has_video_keys
use_batched_encoding = self.batch_encoding_size > 1
if use_streaming:
# Compute stats for non-video features only (video stats come from encoder)
non_video_buffer = {
k: v
for k, v in episode_buffer.items()
if self.features.get(k, {}).get("dtype") not in ("video",)
}
non_video_features = {k: v for k, v in self.features.items() if v["dtype"] != "video"}
ep_stats = compute_episode_stats(non_video_buffer, non_video_features)
else:
ep_stats = compute_episode_stats(episode_buffer, self.features)
ep_stats = compute_episode_stats(episode_buffer, self.features)
ep_metadata = self._save_episode_data(episode_buffer)
has_video_keys = len(self.meta.video_keys) > 0
use_batched_encoding = self.batch_encoding_size > 1
if use_streaming:
# Finish streaming encoding and collect results
streaming_results = self._streaming_encoder.finish_episode()
for video_key in self.meta.video_keys:
temp_path, video_stats = streaming_results[video_key]
if video_stats is not None:
# Format stats same as compute_episode_stats: normalize to [0,1], reshape to (C,1,1)
ep_stats[video_key] = {
k: v if k == "count" else np.squeeze(v.reshape(1, -1, 1, 1) / 255.0, axis=0)
for k, v in video_stats.items()
}
ep_metadata.update(self._save_episode_video(video_key, episode_index, temp_path=temp_path))
elif has_video_keys and not use_batched_encoding:
if has_video_keys and not use_batched_encoding:
num_cameras = len(self.meta.video_keys)
if parallel_encoding and num_cameras > 1:
# TODO(Steven): Ideally we would like to control the number of threads per encoding such that:
@@ -1314,7 +1267,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.root,
self.fps,
self.vcodec,
self._encoder_threads,
): video_key
for video_key in self.meta.video_keys
}
@@ -1583,10 +1535,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
return metadata
def clear_episode_buffer(self, delete_images: bool = True) -> None:
# Cancel streaming encoder if active
if self._streaming_encoder is not None:
self._streaming_encoder.cancel_episode()
# Clean up image files for the current episode buffer
if delete_images:
# Wait for the async image writer to finish
@@ -1634,9 +1582,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
Note: `encode_video_frames` is a blocking call. Making it asynchronous shouldn't speedup encoding,
since video encoding with ffmpeg is already using multithreading.
"""
return _encode_video_worker(
video_key, episode_index, self.root, self.fps, self.vcodec, self._encoder_threads
)
return _encode_video_worker(video_key, episode_index, self.root, self.fps, self.vcodec)
@classmethod
def create(
@@ -1653,13 +1599,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None,
batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
metadata_buffer_size: int = 10,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
) -> "LeRobotDataset":
"""Create a LeRobot Dataset from scratch in order to record data."""
vcodec = resolve_vcodec(vcodec)
if vcodec not in VALID_VIDEO_CODECS:
raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
obj = cls.__new__(cls)
obj.meta = LeRobotDatasetMetadata.create(
repo_id=repo_id,
@@ -1668,7 +1611,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
features=features,
root=root,
use_videos=use_videos,
metadata_buffer_size=metadata_buffer_size,
)
obj.repo_id = obj.meta.repo_id
obj.root = obj.meta.root
@@ -1678,7 +1620,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.batch_encoding_size = batch_encoding_size
obj.episodes_since_last_encoding = 0
obj.vcodec = vcodec
obj._encoder_threads = encoder_threads
if image_writer_processes or image_writer_threads:
obj.start_image_writer(image_writer_processes, image_writer_threads)
@@ -1700,22 +1641,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj._lazy_loading = False
obj._recorded_frames = 0
obj._writer_closed_for_reading = False
# Initialize streaming encoder
if streaming_encoding and len(obj.meta.video_keys) > 0:
obj._streaming_encoder = StreamingVideoEncoder(
fps=fps,
vcodec=vcodec,
pix_fmt="yuv420p",
g=2,
crf=30,
preset=None,
queue_maxsize=encoder_queue_maxsize,
encoder_threads=encoder_threads,
)
else:
obj._streaming_encoder = None
return obj
@@ -1771,12 +1696,11 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
)
for repo_id, ds in zip(self.repo_ids, self._datasets, strict=True):
extra_keys = set(ds.features).difference(intersection_features)
if extra_keys:
logging.warning(
f"keys {extra_keys} of {repo_id} were disabled as they are not contained in all the "
"other datasets."
)
self.disabled_features.update(extra_keys)
logging.warning(
f"keys {extra_keys} of {repo_id} were disabled as they are not contained in all the "
"other datasets."
)
self.disabled_features.update(extra_keys)
self.image_transforms = image_transforms
self.delta_timestamps = delta_timestamps
-364
View File
@@ -1,364 +0,0 @@
"""MultiLeRobotDataset: joint training over heterogeneous LeRobot datasets.
Supports:
- Per-dataset feature mapping (rename keys to a unified namespace)
- Automatic zero-padding for features missing in some datasets
- Per-dataset transform pipelines
- Weighted sampling via dataset weights
- Aggregated stats across all sub-datasets
- A ``meta`` shim compatible with EpisodeAwareSampler and make_policy
"""
from __future__ import annotations
import logging
from collections.abc import Callable
import numpy as np
import torch
import torch.utils.data
from lerobot.configs.default import SubDatasetConfig
from lerobot.datasets.compute_stats import aggregate_stats
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.transforms import DatasetTransformPipeline
class MultiDatasetMeta:
"""Lightweight metadata shim that exposes the same interface as ``LeRobotDatasetMetadata``.
Built by aggregating the metadata of multiple sub-datasets after their
feature keys have been mapped to a unified namespace.
"""
def __init__(
self,
datasets: list[LeRobotDataset],
feature_maps: list[dict[str, str]],
):
self._datasets = datasets
self._feature_maps = feature_maps
self._unified_features = self._build_unified_features()
self._episodes = self._build_episodes()
self._stats = self._build_stats()
# ------------------------------------------------------------------
# Feature union
# ------------------------------------------------------------------
def _build_unified_features(self) -> dict[str, dict]:
"""Build feature dict as the *union* of all mapped feature keys."""
unified: dict[str, dict] = {}
for ds, fmap in zip(self._datasets, self._feature_maps):
for original_key, feat_info in ds.meta.features.items():
mapped_key = fmap.get(original_key, original_key)
if mapped_key not in unified:
unified[mapped_key] = dict(feat_info)
else:
existing_shape = tuple(unified[mapped_key]["shape"])
new_shape = tuple(feat_info["shape"])
if existing_shape != new_shape and unified[mapped_key]["dtype"] == feat_info["dtype"]:
logging.warning(
"Feature '%s' has shape %s in one dataset but %s in another. "
"The larger shape will be used (padding applied automatically).",
mapped_key,
existing_shape,
new_shape,
)
if np.prod(new_shape) > np.prod(existing_shape):
unified[mapped_key] = dict(feat_info)
return unified
# ------------------------------------------------------------------
# Episode metadata (global flat indexing)
# ------------------------------------------------------------------
def _build_episodes(self) -> dict[str, list]:
"""Concatenate episode boundaries across sub-datasets with frame offsets.
Produces the same column structure as ``load_episodes()`` so that
``EpisodeAwareSampler`` and ``WeightedEpisodeAwareSampler`` can consume it.
"""
from_indices: list[int] = []
to_indices: list[int] = []
dataset_source: list[int] = []
frame_offset = 0
for ds_idx, ds in enumerate(self._datasets):
eps = ds.meta.episodes
for ep in eps:
from_indices.append(ep["dataset_from_index"] + frame_offset)
to_indices.append(ep["dataset_to_index"] + frame_offset)
dataset_source.append(ds_idx)
frame_offset += ds.num_frames
return {
"dataset_from_index": from_indices,
"dataset_to_index": to_indices,
"dataset_source": dataset_source,
}
# ------------------------------------------------------------------
# Stats aggregation
# ------------------------------------------------------------------
def _build_stats(self) -> dict[str, dict[str, np.ndarray]]:
"""Aggregate stats across sub-datasets using mapped feature keys."""
mapped_stats_list: list[dict[str, dict]] = []
for ds, fmap in zip(self._datasets, self._feature_maps):
reverse_map = {v: k for k, v in fmap.items()}
mapped: dict[str, dict] = {}
for unified_key in self._unified_features:
original_key = reverse_map.get(unified_key, unified_key)
if original_key in ds.meta.stats:
mapped[unified_key] = ds.meta.stats[original_key]
mapped_stats_list.append(mapped)
return aggregate_stats(mapped_stats_list)
# ------------------------------------------------------------------
# Properties matching LeRobotDatasetMetadata API
# ------------------------------------------------------------------
@property
def features(self) -> dict[str, dict]:
return self._unified_features
@property
def image_keys(self) -> list[str]:
return [k for k, f in self._unified_features.items() if f["dtype"] == "image"]
@property
def video_keys(self) -> list[str]:
return [k for k, f in self._unified_features.items() if f["dtype"] == "video"]
@property
def camera_keys(self) -> list[str]:
return [k for k, f in self._unified_features.items() if f["dtype"] in ("video", "image")]
@property
def names(self) -> dict[str, list | dict]:
return {k: f["names"] for k, f in self._unified_features.items()}
@property
def shapes(self) -> dict[str, tuple]:
return {k: tuple(f["shape"]) for k, f in self._unified_features.items()}
@property
def fps(self) -> int:
fps_values = {ds.meta.fps for ds in self._datasets}
if len(fps_values) > 1:
logging.warning("Sub-datasets have different FPS values: %s. Using the first.", fps_values)
return self._datasets[0].meta.fps
@property
def stats(self) -> dict[str, dict[str, np.ndarray]]:
return self._stats
@stats.setter
def stats(self, value: dict):
self._stats = value
@property
def episodes(self) -> dict[str, list]:
return self._episodes
@property
def total_episodes(self) -> int:
return sum(ds.meta.total_episodes for ds in self._datasets)
@property
def total_frames(self) -> int:
return sum(ds.meta.total_frames for ds in self._datasets)
@property
def total_tasks(self) -> int:
return sum(ds.meta.total_tasks for ds in self._datasets)
@property
def info(self) -> dict:
return {
"fps": self.fps,
"features": self._unified_features,
"total_episodes": self.total_episodes,
"total_frames": self.total_frames,
"total_tasks": self.total_tasks,
"codebase_version": "v3.0",
}
class NewMultiLeRobotDataset(torch.utils.data.Dataset):
"""Dataset that wraps multiple ``LeRobotDataset`` instances with feature mapping and padding.
Each sub-dataset can have different feature names and shapes. A per-dataset
``feature_map`` renames keys into a shared namespace. Features that a given
sub-dataset does not provide are zero-padded so every ``__getitem__`` returns
the full unified feature set.
"""
def __init__(
self,
configs: list[SubDatasetConfig],
image_transforms: Callable | None = None,
delta_timestamps: dict[str, list[float]] | None = None,
tolerance_s: float = 1e-4,
):
super().__init__()
self._configs = configs
self.image_transforms = image_transforms
self._datasets: list[LeRobotDataset] = []
self._feature_maps: list[dict[str, str]] = []
self._transform_pipelines: list[DatasetTransformPipeline | None] = []
self._weights: list[float] = []
for cfg in configs:
ds = LeRobotDataset(
repo_id=cfg.repo_id,
root=cfg.root,
episodes=cfg.episodes,
image_transforms=image_transforms,
delta_timestamps=delta_timestamps,
tolerance_s=tolerance_s,
revision=cfg.revision,
video_backend=cfg.video_backend,
)
self._datasets.append(ds)
self._feature_maps.append(cfg.feature_map or {})
self._transform_pipelines.append(
DatasetTransformPipeline(cfg.transforms) if cfg.transforms else None
)
self._weights.append(cfg.weight)
self._meta = MultiDatasetMeta(self._datasets, self._feature_maps)
# Pre-compute cumulative frame counts for fast index mapping.
self._cumulative_frames: list[int] = []
total = 0
for ds in self._datasets:
total += ds.num_frames
self._cumulative_frames.append(total)
# Build reverse maps (unified_key -> original_key) per dataset for padding.
self._reverse_maps: list[dict[str, str]] = []
for fmap in self._feature_maps:
self._reverse_maps.append({v: k for k, v in fmap.items()})
logging.info(
"MultiLeRobotDataset: %d sub-datasets, %d total frames, %d total episodes, "
"%d unified features",
len(self._datasets),
self.num_frames,
self.num_episodes,
len(self._meta.features),
)
# ------------------------------------------------------------------
# Public interface
# ------------------------------------------------------------------
@property
def meta(self) -> MultiDatasetMeta:
    """Return the ``MultiDatasetMeta`` stored in ``self._meta``."""
    return self._meta
@property
def dataset_weights(self) -> list[float]:
    """Return the per-sub-dataset weights stored in ``self._weights`` (the list itself, not a copy)."""
    return self._weights
@property
def num_frames(self) -> int:
    """Total number of frames: the last cumulative count, or 0 when there is none."""
    if not self._cumulative_frames:
        return 0
    return self._cumulative_frames[-1]
@property
def num_episodes(self) -> int:
    """Sum of ``num_episodes`` over all sub-datasets."""
    episode_total = 0
    for sub_ds in self._datasets:
        episode_total += sub_ds.num_episodes
    return episode_total
@property
def episodes(self) -> list[int] | None:
    """Always ``None``; this class exposes no combined episode selection."""
    # NOTE(review): presumably kept for interface parity with a single dataset - confirm.
    return None
@property
def fps(self) -> int:
    """Return ``fps`` as reported by the unified metadata (``self._meta``)."""
    return self._meta.fps
@property
def features(self) -> dict[str, dict]:
    """Return the unified feature dictionary from ``self._meta.features``."""
    return self._meta.features
@property
def camera_keys(self) -> list[str]:
    """Return ``camera_keys`` as reported by the unified metadata (``self._meta``)."""
    return self._meta.camera_keys
# ------------------------------------------------------------------
# Indexing
# ------------------------------------------------------------------
def _locate(self, idx: int) -> tuple[int, int]:
"""Map a global frame index to (dataset_index, local_index)."""
for ds_idx, cum in enumerate(self._cumulative_frames):
if idx < cum:
local = idx - (self._cumulative_frames[ds_idx - 1] if ds_idx > 0 else 0)
return ds_idx, local
raise IndexError(f"Index {idx} out of range (total {self.num_frames})")
def __len__(self) -> int:
    """Return the total number of frames (same value as ``num_frames``)."""
    return self.num_frames
def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
ds_idx, local_idx = self._locate(idx)
item = self._datasets[ds_idx][local_idx]
# 1. Rename keys according to feature_map.
fmap = self._feature_maps[ds_idx]
if fmap:
renamed: dict[str, torch.Tensor] = {}
for key, value in item.items():
renamed[fmap.get(key, key)] = value
item = renamed
# 2. Apply per-dataset transform pipeline.
pipeline = self._transform_pipelines[ds_idx]
if pipeline is not None:
item = pipeline(item)
# 3. Pad missing features with zeros.
reverse_map = self._reverse_maps[ds_idx]
ds_features = self._datasets[ds_idx].meta.features
for unified_key, feat_info in self._meta.features.items():
if unified_key in item:
continue
original_key = reverse_map.get(unified_key, unified_key)
if original_key in ds_features:
continue
shape = tuple(feat_info["shape"])
dtype = feat_info["dtype"]
if dtype in ("video", "image"):
# Camera tensors are (C, H, W) after transforms.
c, h, w = (shape[2], shape[0], shape[1]) if len(shape) == 3 else (3, shape[0], shape[1])
item[unified_key] = torch.zeros(c, h, w, dtype=torch.float32)
elif dtype in ("float32", "float64"):
item[unified_key] = torch.zeros(shape, dtype=torch.float32)
elif dtype in ("int32", "int64"):
item[unified_key] = torch.zeros(shape, dtype=torch.int64)
elif dtype == "bool":
item[unified_key] = torch.zeros(shape, dtype=torch.bool)
else:
item[unified_key] = torch.zeros(shape, dtype=torch.float32)
item[f"{unified_key}_is_pad"] = torch.tensor(True)
# 4. Tag which dataset this sample came from.
item["dataset_index"] = torch.tensor(ds_idx)
return item
def __repr__(self) -> str:
repo_ids = [c.repo_id for c in self._configs]
return (
f"NewMultiLeRobotDataset(\n"
f" repo_ids={repo_ids},\n"
f" num_frames={self.num_frames},\n"
f" num_episodes={self.num_episodes},\n"
f" unified_features={list(self._meta.features.keys())},\n"
f" weights={self._weights},\n"
f")"
)
-77
View File
@@ -59,80 +59,3 @@ class EpisodeAwareSampler:
def __len__(self) -> int:
    """Return the number of entries in ``self.indices``."""
    return len(self.indices)
class WeightedEpisodeAwareSampler:
"""Sampler that draws frames from multiple datasets according to per-dataset weights.
Each iteration first selects a sub-dataset proportionally to its weight, then
uniformly samples a frame from that sub-dataset's valid index set. Episode
boundary information is respected so that dropped frames are excluded.
Args:
dataset_from_indices: Start index for each episode (global, flat).
dataset_to_indices: End index (exclusive) for each episode (global, flat).
dataset_membership: Which sub-dataset each episode belongs to (integer id).
dataset_weights: Relative sampling weight per sub-dataset.
episode_indices_to_use: If given, only episodes in this set are used.
drop_n_first_frames: Frames to skip at the start of each episode.
drop_n_last_frames: Frames to skip at the end of each episode.
shuffle: Whether to shuffle within each epoch.
num_samples: How many samples per epoch. Defaults to total valid frames.
generator: Optional torch.Generator for reproducibility.
"""
def __init__(
self,
dataset_from_indices: list[int],
dataset_to_indices: list[int],
dataset_membership: list[int],
dataset_weights: list[float],
episode_indices_to_use: list | None = None,
drop_n_first_frames: int = 0,
drop_n_last_frames: int = 0,
shuffle: bool = False,
num_samples: int | None = None,
generator: torch.Generator | None = None,
):
n_datasets = max(dataset_membership) + 1 if dataset_membership else 0
self._per_dataset_indices: list[list[int]] = [[] for _ in range(n_datasets)]
episodes_to_use = set(episode_indices_to_use) if episode_indices_to_use is not None else None
for ep_idx, (start, end, ds_id) in enumerate(
zip(dataset_from_indices, dataset_to_indices, dataset_membership, strict=True)
):
if episodes_to_use is not None and ep_idx not in episodes_to_use:
continue
frame_range = range(start + drop_n_first_frames, end - drop_n_last_frames)
self._per_dataset_indices[ds_id].extend(frame_range)
# Normalise weights (only over datasets that actually have frames).
raw_weights = list(dataset_weights[:n_datasets])
self._weights = torch.zeros(n_datasets)
for i, w in enumerate(raw_weights):
if len(self._per_dataset_indices[i]) > 0:
self._weights[i] = w
total_w = self._weights.sum()
if total_w > 0:
self._weights /= total_w
self._total_frames = sum(len(idx) for idx in self._per_dataset_indices)
self._num_samples = num_samples if num_samples is not None else self._total_frames
self.shuffle = shuffle
self._generator = generator
def __iter__(self) -> Iterator[int]:
if not self.shuffle:
for ds_indices in self._per_dataset_indices:
yield from ds_indices
return
for _ in range(self._num_samples):
ds_id = int(torch.multinomial(self._weights, 1, generator=self._generator).item())
indices = self._per_dataset_indices[ds_id]
local_idx = int(torch.randint(len(indices), (1,), generator=self._generator).item())
yield indices[local_idx]
def __len__(self) -> int:
return self._num_samples
-113
View File
@@ -14,13 +14,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import logging
from collections.abc import Callable, Sequence
from dataclasses import dataclass, field
from typing import Any
import torch
import torch.nn.functional as F_nn
from torchvision.transforms import v2
from torchvision.transforms.v2 import (
Transform,
@@ -260,114 +258,3 @@ class ImageTransforms(Transform):
def forward(self, *inputs: Any) -> Any:
    """Call ``self.tf`` on ``inputs`` and return its result."""
    return self.tf(*inputs)
# Per-dataset transform pipeline (used by MultiLeRobotDataset)
@dataclass
class DatasetTransformStepConfig:
    """Config for a single per-dataset transform step."""

    # Registered name of the step; DatasetTransformPipeline._build looks it up in the registry.
    type: str
    # Keyword arguments expanded into the step class constructor.
    kwargs: dict[str, Any] = field(default_factory=dict)
# Maps a registered transform name to the class implementing it.
_DATASET_TRANSFORM_REGISTRY: dict[str, type["DatasetTransformStep"]] = {}


def register_dataset_transform(name: str):
    """Return a class decorator that stores the class in the registry under ``name``.

    The decorated class is returned unchanged. Registering the same name twice
    replaces the earlier entry.
    """

    def _register(step_cls: type["DatasetTransformStep"]) -> type["DatasetTransformStep"]:
        _DATASET_TRANSFORM_REGISTRY[name] = step_cls
        return step_cls

    return _register
class DatasetTransformStep:
    """Base class for a single per-dataset transform applied to a sample dict."""

    def __call__(self, sample: dict) -> dict:
        """Transform ``sample`` and return the resulting dict; subclasses must override this."""
        raise NotImplementedError
@register_dataset_transform("pad_action")
class PadAction(DatasetTransformStep):
    """Zero-pad the ``action`` tensor to *target_dim* along the last axis."""

    def __init__(self, target_dim: int):
        self.target_dim = target_dim

    def __call__(self, sample: dict) -> dict:
        """Pad ``sample["action"]`` in place when it is narrower than ``target_dim``.

        Samples without an ``action`` entry, or whose action is already at
        least ``target_dim`` wide, are returned untouched.
        """
        action = sample.get("action")
        if action is not None:
            missing = self.target_dim - action.shape[-1]
            if missing > 0:
                # (0, missing): nothing added on the left, ``missing`` zeros on the right.
                sample["action"] = F_nn.pad(action, (0, missing))
        return sample
@register_dataset_transform("pad_state")
class PadState(DatasetTransformStep):
    """Zero-pad ``observation.state`` to *target_dim* along the last axis."""

    def __init__(self, target_dim: int):
        self.target_dim = target_dim

    def __call__(self, sample: dict) -> dict:
        """Pad ``sample["observation.state"]`` in place when it is narrower than ``target_dim``.

        Samples without that entry, or whose state is already at least
        ``target_dim`` wide, are returned untouched.
        """
        state = sample.get("observation.state")
        if state is not None:
            missing = self.target_dim - state.shape[-1]
            if missing > 0:
                # (0, missing): nothing added on the left, ``missing`` zeros on the right.
                sample["observation.state"] = F_nn.pad(state, (0, missing))
        return sample
@register_dataset_transform("resize_images")
class ResizeImages(DatasetTransformStep):
    """Resize all image/video camera tensors to (height, width)."""

    def __init__(self, height: int, width: int):
        self.size = (height, width)

    def __call__(self, sample: dict) -> dict:
        """Resize every ``observation.images.*`` tensor with at least 3 dims, in place.

        Other keys, non-tensor values and tensors with fewer than 3 dims are
        left as they are.
        """
        # Snapshot the items so entries can be reassigned while looping.
        for key, img in list(sample.items()):
            is_camera = key.startswith("observation.images.")
            if is_camera and isinstance(img, torch.Tensor) and img.ndim >= 3:
                sample[key] = F.resize(img, self.size, antialias=True)
        return sample
class DatasetTransformPipeline:
    """Sequential pipeline of DatasetTransformStep instances."""

    def __init__(self, configs: list[DatasetTransformStepConfig] | None = None):
        """Instantiate one step per config, in order; no configs gives an empty pipeline."""
        self.steps: list[DatasetTransformStep] = [self._build(step_cfg) for step_cfg in configs or []]

    @staticmethod
    def _build(cfg: DatasetTransformStepConfig) -> DatasetTransformStep:
        """Instantiate the step registered under ``cfg.type`` with ``cfg.kwargs``.

        Raises:
            ValueError: If no transform is registered under ``cfg.type``.
        """
        step_cls = _DATASET_TRANSFORM_REGISTRY.get(cfg.type)
        if step_cls is not None:
            return step_cls(**cfg.kwargs)
        raise ValueError(
            f"Unknown dataset transform '{cfg.type}'. "
            f"Available: {list(_DATASET_TRANSFORM_REGISTRY)}"
        )

    def __call__(self, sample: dict) -> dict:
        """Feed ``sample`` through every step in order and return the final result."""
        current = sample
        for transform_step in self.steps:
            current = transform_step(current)
        return current

    def __repr__(self) -> str:
        return f"DatasetTransformPipeline(steps={self.steps})"
+33 -6
View File
@@ -21,7 +21,7 @@ from collections import deque
from collections.abc import Iterable, Iterator
from pathlib import Path
from pprint import pformat
from typing import Any
from typing import Any, Generic, TypeVar
import datasets
import numpy as np
@@ -62,6 +62,8 @@ CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
DEFAULT_TASKS_PATH = "meta/tasks.parquet"
DEFAULT_SUBTASKS_PATH = "meta/subtasks.parquet"
DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_TASKS_HIGH_LEVEL_PATH = "meta/tasks_high_level.parquet"
DEFAULT_SUBTASKS_PATH = "meta/subtasks.parquet"
DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.png"
@@ -78,6 +80,8 @@ DEFAULT_FEATURES = {
"task_index": {"dtype": "int64", "shape": (1,), "names": None},
}
T = TypeVar("T")
def get_parquet_file_size_in_mb(parquet_path: str | Path) -> float:
metadata = pq.read_metadata(parquet_path)
@@ -120,9 +124,19 @@ def load_nested_dataset(
raise FileNotFoundError(f"Provided directory does not contain any parquet file: {pq_dir}")
with SuppressProgressBars():
# We use .from_parquet() memory-mapped loading for efficiency
filters = pa_ds.field("episode_index").isin(episodes) if episodes is not None else None
return Dataset.from_parquet([str(path) for path in paths], filters=filters, features=features)
# When no filtering needed, Dataset uses memory-mapped loading for efficiency
# PyArrow loads the entire dataset into memory
if episodes is None:
return Dataset.from_parquet([str(path) for path in paths], features=features)
arrow_dataset = pa_ds.dataset(paths, format="parquet")
filter_expr = pa_ds.field("episode_index").isin(episodes)
table = arrow_dataset.to_table(filter=filter_expr)
if features is not None:
table = table.cast(features.arrow_schema)
return Dataset(table)
def get_parquet_num_frames(parquet_path: str | Path) -> int:
@@ -339,9 +353,22 @@ def write_tasks(tasks: pandas.DataFrame, local_dir: Path) -> None:
def load_tasks(local_dir: Path) -> pandas.DataFrame:
    """Read ``local_dir / DEFAULT_TASKS_PATH`` into a DataFrame and name its index ``"task"``."""
    tasks = pd.read_parquet(local_dir / DEFAULT_TASKS_PATH)
    tasks.index.name = "task"
    return tasks
def load_tasks_high_level(local_dir: Path) -> pandas.DataFrame | None:
    """Return the high-level tasks table under ``local_dir``, or ``None`` if the parquet file is absent."""
    parquet_file = local_dir / DEFAULT_TASKS_HIGH_LEVEL_PATH
    return pd.read_parquet(parquet_file) if parquet_file.exists() else None
def load_subtasks(local_dir: Path) -> pandas.DataFrame | None:
    """Return the subtasks table under ``local_dir``, or ``None`` if the parquet file is absent."""
    parquet_file = local_dir / DEFAULT_SUBTASKS_PATH
    return pd.read_parquet(parquet_file) if parquet_file.exists() else None
def load_subtasks(local_dir: Path) -> pandas.DataFrame | None:
"""Load subtasks from subtasks.parquet if it exists."""
@@ -1232,7 +1259,7 @@ class LookAheadError(Exception):
pass
class Backtrackable[T]:
class Backtrackable(Generic[T]):
"""
Wrap any iterator/iterable so you can step back up to `history` items
and look ahead up to `lookahead` items.
@@ -36,11 +36,8 @@ Convert a local dataset (works in place):
```bash
python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \
--repo-id=lerobot/pusht \
--root=/path/to/local/dataset/directory \
--root=/path/to/local/dataset/directory
--push-to-hub=false
N.B. Path semantics (v2): --root is the exact dataset folder containing
meta/, data/, videos/. When omitted, defaults to $HF_LEROBOT_HOME/{repo_id}.
```
"""
@@ -108,7 +105,7 @@ episodes.jsonl
{"episode_index": 1, "tasks": ["Put the blue block in the green bowl"], "length": 266}
NEW
meta/episodes/chunk-000/file_000.parquet
meta/episodes/chunk-000/episodes_000.parquet
episode_index | video_chunk_index | video_file_index | data_chunk_index | data_file_index | tasks | length
-------------------------
OLD
@@ -116,16 +113,15 @@ tasks.jsonl
{"task_index": 1, "task": "Put the blue block in the green bowl"}
NEW
meta/tasks.parquet
meta/tasks/chunk-000/file_000.parquet
task_index | task
-------------------------
OLD
episodes_stats.jsonl
{"episode_index": 1, "stats": {"feature_name": {"min": ..., "max": ..., "mean": ..., "std": ..., "count": ...}}}
NEW
meta/episodes/chunk-000/file_000.parquet
episode_index | feature_name/min | feature_name/max | feature_name/mean | feature_name/std | feature_name/count
meta/episodes_stats/chunk-000/file_000.parquet
episode_index | mean | std | min | max
-------------------------
UPDATE
meta/info.json
@@ -174,7 +170,7 @@ def convert_tasks(root, new_root):
tasks, _ = legacy_load_tasks(root)
task_indices = tasks.keys()
task_strings = tasks.values()
df_tasks = pd.DataFrame({"task_index": task_indices}, index=pd.Index(task_strings, name="task"))
df_tasks = pd.DataFrame({"task_index": task_indices}, index=task_strings)
write_tasks(df_tasks, new_root)
@@ -205,6 +201,7 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int):
image_keys = get_image_keys(root)
ep_idx = 0
chunk_idx = 0
file_idx = 0
size_in_mb = 0
@@ -214,23 +211,9 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int):
logging.info(f"Converting data files from {len(ep_paths)} episodes")
for ep_idx, ep_path in enumerate(tqdm.tqdm(ep_paths, desc="convert data files")):
for ep_path in tqdm.tqdm(ep_paths, desc="convert data files"):
ep_size_in_mb = get_parquet_file_size_in_mb(ep_path)
ep_num_frames = get_parquet_num_frames(ep_path)
# Check if we need to start a new file BEFORE creating metadata
if size_in_mb + ep_size_in_mb >= data_file_size_in_mb and len(paths_to_cat) > 0:
# Write the accumulated data files
concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)
# Move to next file
chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE)
# Reset for the next file
size_in_mb = 0
paths_to_cat = []
# Now create metadata with correct chunk/file indices
ep_metadata = {
"episode_index": ep_idx,
"data/chunk_index": chunk_idx,
@@ -241,7 +224,20 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int):
size_in_mb += ep_size_in_mb
num_frames += ep_num_frames
episodes_metadata.append(ep_metadata)
paths_to_cat.append(ep_path)
ep_idx += 1
if size_in_mb < data_file_size_in_mb:
paths_to_cat.append(ep_path)
continue
if paths_to_cat:
concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)
# Reset for the next file
size_in_mb = ep_size_in_mb
paths_to_cat = [ep_path]
chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE)
# Write remaining data if any
if paths_to_cat:
@@ -473,7 +469,7 @@ def convert_dataset(
# Set root based on whether local dataset path is provided
use_local_dataset = False
root = HF_LEROBOT_HOME / repo_id if root is None else Path(root)
root = HF_LEROBOT_HOME / repo_id if root is None else Path(root) / repo_id
if root.exists():
validate_local_dataset_version(root)
use_local_dataset = True
@@ -533,7 +529,7 @@ if __name__ == "__main__":
type=str,
required=True,
help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset "
"(e.g. `lerobot/pusht`, `<USER>/aloha_sim_insertion_human`).",
"(e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
)
parser.add_argument(
"--branch",
@@ -557,7 +553,7 @@ if __name__ == "__main__":
"--root",
type=str,
default=None,
help="Local directory to use for downloading/writing the dataset. Defaults to $HF_LEROBOT_HOME/repo_id.",
help="Local directory to use for downloading/writing the dataset.",
)
parser.add_argument(
"--push-to-hub",
+46 -480
View File
@@ -13,106 +13,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import glob
import importlib
import logging
import queue
import shutil
import tempfile
import threading
import warnings
from dataclasses import dataclass, field
from fractions import Fraction
from pathlib import Path
from threading import Lock
from typing import Any, ClassVar
import av
import fsspec
import numpy as np
import pyarrow as pa
import torch
import torchvision
from datasets.features.features import register_feature
from PIL import Image
# List of hardware encoders to probe for auto-selection. Availability depends on the platform and FFmpeg build.
# Determines the order of preference for auto-selection when vcodec="auto" is used.
HW_ENCODERS = [
    "h264_videotoolbox",  # macOS
    "hevc_videotoolbox",  # macOS
    "h264_nvenc",  # NVIDIA GPU
    "hevc_nvenc",  # NVIDIA GPU
    "h264_vaapi",  # Linux Intel/AMD
    "h264_qsv",  # Intel Quick Sync
]
# Every codec name accepted by resolve_vcodec: the software encoders, the
# "auto" placeholder, and all hardware encoders listed above.
VALID_VIDEO_CODECS = {"h264", "hevc", "libsvtav1", "auto"} | set(HW_ENCODERS)
def _get_codec_options(
vcodec: str,
g: int | None = 2,
crf: int | None = 30,
preset: int | None = None,
) -> dict:
"""Build codec-specific options dict for video encoding."""
options = {}
# GOP size (keyframe interval) - supported by VideoToolbox and software encoders
if g is not None and (vcodec in ("h264_videotoolbox", "hevc_videotoolbox") or vcodec not in HW_ENCODERS):
options["g"] = str(g)
# Quality control (codec-specific parameter names)
if crf is not None:
if vcodec in ("h264", "hevc", "libsvtav1"):
options["crf"] = str(crf)
elif vcodec in ("h264_videotoolbox", "hevc_videotoolbox"):
quality = max(1, min(100, int(100 - crf * 2)))
options["q:v"] = str(quality)
elif vcodec in ("h264_nvenc", "hevc_nvenc"):
options["rc"] = "constqp"
options["qp"] = str(crf)
elif vcodec in ("h264_vaapi",):
options["qp"] = str(crf)
elif vcodec in ("h264_qsv",):
options["global_quality"] = str(crf)
# Preset (only for libsvtav1)
if vcodec == "libsvtav1":
options["preset"] = str(preset) if preset is not None else "12"
return options
def detect_available_hw_encoders() -> list[str]:
    """Probe PyAV/FFmpeg for available hardware video encoders.

    Returns:
        The entries of ``HW_ENCODERS`` for which a write-mode codec could be
        constructed, in ``HW_ENCODERS`` order.
    """

    def _can_open(codec_name: str) -> bool:
        # Any failure while constructing the codec means "not available".
        try:
            av.codec.Codec(codec_name, "w")
        except Exception:  # nosec B110
            return False
        return True

    return [codec_name for codec_name in HW_ENCODERS if _can_open(codec_name)]
def resolve_vcodec(vcodec: str) -> str:
    """Validate vcodec and resolve 'auto' to best available HW encoder, fallback to libsvtav1.

    Raises:
        ValueError: If ``vcodec`` is not in ``VALID_VIDEO_CODECS``.
    """
    if vcodec not in VALID_VIDEO_CODECS:
        raise ValueError(f"Invalid vcodec '{vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")

    if vcodec == "auto":
        available = detect_available_hw_encoders()
        # HW_ENCODERS order is the preference order.
        encoder = next((name for name in HW_ENCODERS if name in available), None)
        if encoder is None:
            logging.info("No hardware encoder available, falling back to software encoder 'libsvtav1'")
            return "libsvtav1"
        logging.info(f"Auto-selected video codec: {encoder}")
        return encoder

    logging.info(f"Using video codec: {vcodec}")
    return vcodec
def get_safe_default_codec():
if importlib.util.find_spec("torchcodec"):
@@ -227,17 +146,16 @@ def decode_video_frames_torchvision(
min_, argmin_ = dist.min(1)
is_within_tol = min_ < tolerance_s
if not is_within_tol.all():
raise FrameTimestampError(
f"One or several query timestamps unexpectedly violate the tolerance ({min_[~is_within_tol]} > {tolerance_s=})."
" It means that the closest frame that can be loaded from the video is too far away in time."
" This might be due to synchronization issues with timestamps during data collection."
" To be safe, we advise to ignore this item during training."
f"\nqueried timestamps: {query_ts}"
f"\nloaded timestamps: {loaded_ts}"
f"\nvideo: {video_path}"
f"\nbackend: {backend}"
)
assert is_within_tol.all(), (
f"One or several query timestamps unexpectedly violate the tolerance ({min_[~is_within_tol]} > {tolerance_s=})."
"It means that the closest frame that can be loaded from the video is too far away in time."
"This might be due to synchronization issues with timestamps during data collection."
"To be safe, we advise to ignore this item during training."
f"\nqueried timestamps: {query_ts}"
f"\nloaded timestamps: {loaded_ts}"
f"\nvideo: {video_path}"
f"\nbackend: {backend}"
)
# get closest frames to the query timestamps
closest_frames = torch.stack([loaded_frames[idx] for idx in argmin_])
@@ -249,11 +167,7 @@ def decode_video_frames_torchvision(
# convert to the pytorch format which is float32 in [0,1] range (and channel first)
closest_frames = closest_frames.type(torch.float32) / 255
if len(timestamps) != len(closest_frames):
raise FrameTimestampError(
f"Number of retrieved frames ({len(closest_frames)}) does not match "
f"number of queried timestamps ({len(timestamps)})"
)
assert len(timestamps) == len(closest_frames)
return closest_frames
@@ -358,16 +272,15 @@ def decode_video_frames_torchcodec(
min_, argmin_ = dist.min(1)
is_within_tol = min_ < tolerance_s
if not is_within_tol.all():
raise FrameTimestampError(
f"One or several query timestamps unexpectedly violate the tolerance ({min_[~is_within_tol]} > {tolerance_s=})."
" It means that the closest frame that can be loaded from the video is too far away in time."
" This might be due to synchronization issues with timestamps during data collection."
" To be safe, we advise to ignore this item during training."
f"\nqueried timestamps: {query_ts}"
f"\nloaded timestamps: {loaded_ts}"
f"\nvideo: {video_path}"
)
assert is_within_tol.all(), (
f"One or several query timestamps unexpectedly violate the tolerance ({min_[~is_within_tol]} > {tolerance_s=})."
"It means that the closest frame that can be loaded from the video is too far away in time."
"This might be due to synchronization issues with timestamps during data collection."
"To be safe, we advise to ignore this item during training."
f"\nqueried timestamps: {query_ts}"
f"\nloaded timestamps: {loaded_ts}"
f"\nvideo: {video_path}"
)
# get closest frames to the query timestamps
closest_frames = torch.stack([loaded_frames[idx] for idx in argmin_])
@@ -396,13 +309,14 @@ def encode_video_frames(
g: int | None = 2,
crf: int | None = 30,
fast_decode: int = 0,
log_level: int | None = av.logging.WARNING,
log_level: int | None = av.logging.ERROR,
overwrite: bool = False,
preset: int | None = None,
encoder_threads: int | None = None,
) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
vcodec = resolve_vcodec(vcodec)
# Check encoder availability
if vcodec not in ["h264", "hevc", "libsvtav1"]:
raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")
video_path = Path(video_path)
imgs_dir = Path(imgs_dir)
@@ -433,22 +347,21 @@ def encode_video_frames(
width, height = dummy_image.size
# Define video codec options
video_options = _get_codec_options(vcodec, g, crf, preset)
video_options = {}
if g is not None:
video_options["g"] = str(g)
if crf is not None:
video_options["crf"] = str(crf)
if fast_decode:
key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
video_options[key] = value
if encoder_threads is not None:
if vcodec == "libsvtav1":
lp_param = f"lp={encoder_threads}"
if "svtav1-params" in video_options:
video_options["svtav1-params"] += f":{lp_param}"
else:
video_options["svtav1-params"] = lp_param
else:
video_options["threads"] = str(encoder_threads)
if vcodec == "libsvtav1":
video_options["preset"] = str(preset) if preset is not None else "12"
# Set logging level
if log_level is not None:
@@ -567,348 +480,6 @@ def concatenate_video_files(
Path(tmp_concatenate_path).unlink()
class _CameraEncoderThread(threading.Thread):
    """A thread that encodes video frames streamed via a queue into an MP4 file.

    One instance is created per camera per episode. Frames are received as numpy arrays
    from the main thread, encoded in real-time using PyAV (which releases the GIL during
    encoding), and written to disk. Stats are computed incrementally using
    RunningQuantileStats and returned via result_queue.

    Exactly one tuple is put on ``result_queue`` when ``run`` ends:
    ``("ok", stats_or_None)`` on success or ``("error", message)`` on failure.
    """

    def __init__(
        self,
        video_path: Path,
        fps: int,
        vcodec: str,
        pix_fmt: str,
        g: int | None,
        crf: int | None,
        preset: int | None,
        frame_queue: queue.Queue,
        result_queue: queue.Queue,
        stop_event: threading.Event,
        encoder_threads: int | None = None,
    ):
        """Store the encoding parameters and communication primitives; the thread is a daemon."""
        super().__init__(daemon=True)
        self.video_path = video_path
        self.fps = fps
        self.vcodec = vcodec
        self.pix_fmt = pix_fmt
        self.g = g
        self.crf = crf
        self.preset = preset
        self.frame_queue = frame_queue
        self.result_queue = result_queue
        self.stop_event = stop_event
        self.encoder_threads = encoder_threads

    def run(self) -> None:
        """Consume frames until a ``None`` sentinel (or stop event), encode them, then report the result."""
        from lerobot.datasets.compute_stats import RunningQuantileStats, auto_downsample_height_width

        container = None
        output_stream = None
        stats_tracker = RunningQuantileStats()
        frame_count = 0

        try:
            logging.getLogger("libav").setLevel(av.logging.WARNING)

            while True:
                try:
                    frame_data = self.frame_queue.get(timeout=1)
                except queue.Empty:
                    # The stop event is only checked when the queue is empty,
                    # so already-queued frames are still encoded first.
                    if self.stop_event.is_set():
                        break
                    continue

                if frame_data is None:
                    # Sentinel: flush and close
                    break

                # Ensure HWC uint8 numpy array
                if isinstance(frame_data, np.ndarray):
                    if frame_data.ndim == 3 and frame_data.shape[0] == 3:
                        # CHW -> HWC
                        frame_data = frame_data.transpose(1, 2, 0)
                    if frame_data.dtype != np.uint8:
                        # NOTE(review): assumes non-uint8 frames are in [0, 1]; values are not clipped - confirm.
                        frame_data = (frame_data * 255).astype(np.uint8)

                # Open container on first frame (to get width/height)
                if container is None:
                    height, width = frame_data.shape[:2]
                    video_options = _get_codec_options(self.vcodec, self.g, self.crf, self.preset)
                    if self.encoder_threads is not None:
                        if self.vcodec == "libsvtav1":
                            # libsvtav1 takes its thread setting through "svtav1-params".
                            lp_param = f"lp={self.encoder_threads}"
                            if "svtav1-params" in video_options:
                                video_options["svtav1-params"] += f":{lp_param}"
                            else:
                                video_options["svtav1-params"] = lp_param
                        else:
                            video_options["threads"] = str(self.encoder_threads)
                    Path(self.video_path).parent.mkdir(parents=True, exist_ok=True)
                    container = av.open(str(self.video_path), "w")
                    output_stream = container.add_stream(self.vcodec, self.fps, options=video_options)
                    output_stream.pix_fmt = self.pix_fmt
                    output_stream.width = width
                    output_stream.height = height
                    output_stream.time_base = Fraction(1, self.fps)

                # Encode frame with explicit timestamps
                pil_img = Image.fromarray(frame_data)
                video_frame = av.VideoFrame.from_image(pil_img)
                video_frame.pts = frame_count
                video_frame.time_base = Fraction(1, self.fps)
                packet = output_stream.encode(video_frame)
                if packet:
                    container.mux(packet)

                # Update stats with downsampled frame (per-channel stats like compute_episode_stats)
                img_chw = frame_data.transpose(2, 0, 1)  # HWC -> CHW
                img_downsampled = auto_downsample_height_width(img_chw)
                # Reshape CHW to (H*W, C) for per-channel stats
                channels = img_downsampled.shape[0]
                img_for_stats = img_downsampled.transpose(1, 2, 0).reshape(-1, channels)
                stats_tracker.update(img_for_stats)
                frame_count += 1

            # Flush encoder
            if output_stream is not None:
                packet = output_stream.encode()
                if packet:
                    container.mux(packet)
            if container is not None:
                container.close()

            av.logging.restore_default_callback()

            # Get stats and put on result queue
            if frame_count >= 2:
                stats = stats_tracker.get_statistics()
                self.result_queue.put(("ok", stats))
            else:
                # Fewer than two frames: report success without statistics.
                self.result_queue.put(("ok", None))
        except Exception as e:
            logging.error(f"Encoder thread error: {e}")
            if container is not None:
                # Best-effort close; a failure here must not mask the original error.
                with contextlib.suppress(Exception):
                    container.close()
            self.result_queue.put(("error", str(e)))
class StreamingVideoEncoder:
"""Manages per-camera encoder threads for real-time video encoding during recording.
Instead of writing frames as PNG images and then encoding to MP4 at episode end,
this class streams frames directly to encoder threads, eliminating the
PNG round-trip and making save_episode() near-instant.
Uses threading instead of multiprocessing to avoid the overhead of pickling large
numpy arrays through multiprocessing.Queue. PyAV's encode() releases the GIL,
so encoding runs in parallel with the main recording loop.
"""
def __init__(
    self,
    fps: int,
    vcodec: str = "libsvtav1",
    pix_fmt: str = "yuv420p",
    g: int | None = 2,
    crf: int | None = 30,
    preset: int | None = None,
    queue_maxsize: int = 30,
    encoder_threads: int | None = None,
):
    """Store the encoding settings and create empty per-camera bookkeeping.

    Args:
        fps: Frame rate handed to each encoder thread.
        vcodec: Codec name; passed through ``resolve_vcodec`` before being stored.
        pix_fmt: Pixel format handed to each encoder thread.
        g: GOP size handed to each encoder thread.
        crf: Quality value handed to each encoder thread.
        preset: Encoder preset handed to each encoder thread.
        queue_maxsize: Capacity of each per-camera frame queue.
        encoder_threads: Thread-count setting handed to each encoder thread.
    """
    self.fps = fps
    self.vcodec = resolve_vcodec(vcodec)
    self.pix_fmt = pix_fmt
    self.g = g
    self.crf = crf
    self.preset = preset
    self.queue_maxsize = queue_maxsize
    self.encoder_threads = encoder_threads
    # Per-camera state, keyed by video feature key; filled by start_episode().
    self._frame_queues: dict[str, queue.Queue] = {}
    self._result_queues: dict[str, queue.Queue] = {}
    self._threads: dict[str, _CameraEncoderThread] = {}
    self._stop_events: dict[str, threading.Event] = {}
    self._video_paths: dict[str, Path] = {}
    # Count of frames dropped per camera because its queue was full.
    self._dropped_frames: dict[str, int] = {}
    self._episode_active = False
def start_episode(self, video_keys: list[str], temp_dir: Path) -> None:
    """Start encoder threads for a new episode.

    An episode that is still active is cancelled first. One encoder thread,
    frame queue, result queue and stop event is created per video key, and each
    camera writes to its own freshly created temporary directory.

    Args:
        video_keys: List of video feature keys (e.g. ["observation.images.laptop"])
        temp_dir: Base directory for temporary MP4 files
    """
    if self._episode_active:
        self.cancel_episode()
    self._dropped_frames.clear()

    for video_key in video_keys:
        frames: queue.Queue = queue.Queue(maxsize=self.queue_maxsize)
        outcome: queue.Queue = queue.Queue(maxsize=1)
        stop_flag = threading.Event()
        scratch_dir = Path(tempfile.mkdtemp(dir=temp_dir))
        # Slashes in the key are replaced so it is usable as a file name.
        output_path = scratch_dir / f"{video_key.replace('/', '_')}_streaming.mp4"

        worker = _CameraEncoderThread(
            video_path=output_path,
            fps=self.fps,
            vcodec=self.vcodec,
            pix_fmt=self.pix_fmt,
            g=self.g,
            crf=self.crf,
            preset=self.preset,
            frame_queue=frames,
            result_queue=outcome,
            stop_event=stop_flag,
            encoder_threads=self.encoder_threads,
        )
        worker.start()

        self._frame_queues[video_key] = frames
        self._result_queues[video_key] = outcome
        self._threads[video_key] = worker
        self._stop_events[video_key] = stop_flag
        self._video_paths[video_key] = output_path

    self._episode_active = True
def feed_frame(self, video_key: str, image: np.ndarray) -> None:
    """Feed a frame to the encoder for a specific camera.

    The image is copied before it is enqueued so later writes to the caller's
    buffer cannot affect the queued frame. When the queue stays full for the
    whole put timeout, the frame is dropped and counted instead of raising.

    Args:
        video_key: The video feature key
        image: numpy array in (H,W,C) or (C,H,W) format, uint8 or float

    Raises:
        RuntimeError: If no episode is active or the encoder thread is no longer alive.
    """
    if not self._episode_active:
        raise RuntimeError("No active episode. Call start_episode() first.")

    if not self._threads[video_key].is_alive():
        # Prefer the thread's own error report when it left one behind.
        try:
            status, msg = self._result_queues[video_key].get_nowait()
        except queue.Empty:
            pass
        else:
            if status == "error":
                raise RuntimeError(f"Encoder thread for {video_key} crashed: {msg}")
        raise RuntimeError(f"Encoder thread for {video_key} is not alive")

    try:
        self._frame_queues[video_key].put(image.copy(), timeout=0.1)
    except queue.Full:
        dropped = self._dropped_frames.get(video_key, 0) + 1
        self._dropped_frames[video_key] = dropped
        # Log periodically to avoid spam (1st, then every 10th)
        if dropped == 1 or dropped % 10 == 0:
            logging.warning(
                f"Encoder queue full for {video_key}, dropped {dropped} frame(s). "
                f"Consider using vcodec='auto' for hardware encoding or increasing encoder_queue_maxsize."
            )
def finish_episode(self) -> dict[str, tuple[Path, dict | None]]:
"""Finish encoding the current episode.
Sends sentinel values, waits for encoder threads to complete,
and collects results.
Returns:
Dict mapping video_key to (mp4_path, stats_dict_or_None)
"""
if not self._episode_active:
raise RuntimeError("No active episode to finish.")
results = {}
# Report dropped frames
for video_key, count in self._dropped_frames.items():
if count > 0:
logging.warning(f"Episode finished with {count} dropped frame(s) for {video_key}.")
# Send sentinel to all queues
for video_key in self._frame_queues:
self._frame_queues[video_key].put(None)
# Wait for all threads and collect results
for video_key in self._threads:
self._threads[video_key].join(timeout=120)
if self._threads[video_key].is_alive():
logging.error(f"Encoder thread for {video_key} did not finish in time")
self._stop_events[video_key].set()
self._threads[video_key].join(timeout=5)
results[video_key] = (self._video_paths[video_key], None)
continue
try:
status, data = self._result_queues[video_key].get(timeout=5)
if status == "error":
raise RuntimeError(f"Encoder thread for {video_key} failed: {data}")
results[video_key] = (self._video_paths[video_key], data)
except queue.Empty:
logging.error(f"No result from encoder thread for {video_key}")
results[video_key] = (self._video_paths[video_key], None)
self._cleanup()
self._episode_active = False
return results
def cancel_episode(self) -> None:
    """Abort the current episode: stop encoder threads and discard their output.

    Does nothing when no episode is active. Otherwise every stop event is
    set, each thread is joined for up to 5 seconds, and for every video file
    that exists on disk its parent directory is removed (errors ignored).
    """
    if not self._episode_active:
        return

    # Signal all threads to stop
    for stop_event in self._stop_events.values():
        stop_event.set()

    # Wait for threads to finish, then drop whatever they wrote
    for video_key, worker in self._threads.items():
        worker.join(timeout=5)
        partial_video = self._video_paths.get(video_key)
        if partial_video is None or not partial_video.exists():
            continue
        shutil.rmtree(str(partial_video.parent), ignore_errors=True)

    self._cleanup()
    self._episode_active = False
def close(self) -> None:
    """Shut the encoder down, cancelling the episode if one is still running."""
    if not self._episode_active:
        return
    self.cancel_episode()
def _cleanup(self) -> None:
"""Clean up queues and thread tracking dicts."""
for q in self._frame_queues.values():
with contextlib.suppress(Exception):
while not q.empty():
q.get_nowait()
self._frame_queues.clear()
self._result_queues.clear()
self._threads.clear()
self._stop_events.clear()
self._video_paths.clear()
@dataclass
class VideoFrame:
# TODO(rcadene, lhoestq): move to Hugging Face `datasets` repo
@@ -943,7 +514,7 @@ with warnings.catch_warnings():
def get_audio_info(video_path: Path | str) -> dict:
# Set logging level
logging.getLogger("libav").setLevel(av.logging.WARNING)
logging.getLogger("libav").setLevel(av.logging.ERROR)
# Getting audio stream information
audio_info = {}
@@ -975,7 +546,7 @@ def get_audio_info(video_path: Path | str) -> dict:
def get_video_info(video_path: Path | str) -> dict:
# Set logging level
logging.getLogger("libav").setLevel(av.logging.WARNING)
logging.getLogger("libav").setLevel(av.logging.ERROR)
# Getting video stream information
video_info = {}
@@ -1061,15 +632,8 @@ class VideoEncodingManager:
return self
def __exit__(self, exc_type, exc_val, exc_tb):
streaming_encoder = getattr(self.dataset, "_streaming_encoder", None)
if streaming_encoder is not None:
# Handle streaming encoder cleanup
if exc_type is not None:
streaming_encoder.cancel_episode()
streaming_encoder.close()
elif self.dataset.episodes_since_last_encoding > 0:
# Handle any remaining episodes that haven't been batch encoded
# Handle any remaining episodes that haven't been batch encoded
if self.dataset.episodes_since_last_encoding > 0:
if exc_type is not None:
logging.info("Exception occurred. Encoding remaining episodes before exit...")
else:
@@ -1086,8 +650,8 @@ class VideoEncodingManager:
# Finalize the dataset to properly close all writers
self.dataset.finalize()
# Clean up episode images if recording was interrupted (only for non-streaming mode)
if exc_type is not None and streaming_encoder is None:
# Clean up episode images if recording was interrupted
if exc_type is not None:
interrupted_episode_index = self.dataset.num_episodes
for key in self.dataset.meta.video_keys:
img_dir = self.dataset._get_image_file_path(
@@ -1101,12 +665,14 @@ class VideoEncodingManager:
# Clean up any remaining images directory if it's empty
img_dir = self.dataset.root / "images"
if img_dir.exists():
png_files = list(img_dir.rglob("*.png"))
if len(png_files) == 0:
# Check for any remaining PNG files
png_files = list(img_dir.rglob("*.png"))
if len(png_files) == 0:
# Only remove the images directory if no PNG files remain
if img_dir.exists():
shutil.rmtree(img_dir)
logging.debug("Cleaned up empty images directory")
else:
logging.debug(f"Images directory is not empty, containing {len(png_files)} PNG files")
else:
logging.debug(f"Images directory is not empty, containing {len(png_files)} PNG files")
return False # Don't suppress the original exception
-99
View File
@@ -346,105 +346,6 @@ class LiberoEnv(EnvConfig):
return kwargs
@EnvConfig.register_subclass("libero_plus")
@dataclass
class LiberoPlusEnv(LiberoEnv):
    """Config alias selecting the LIBERO-plus benchmarks (`env.type=libero_plus`).

    LIBERO-plus ships under the same Python package/module names as LIBERO,
    so this class inherits everything from ``LiberoEnv`` and only exists to
    make the intent explicit in experiment configs.
    """

    task: str = "libero_spatial"
@EnvConfig.register_subclass("robocasa")
@dataclass
class RoboCasaEnv(EnvConfig):
    """Configuration for RoboCasa kitchen composite-task environments.

    Wraps ``robocasa.wrappers.gym_wrapper.RoboCasaGymEnv`` with a flat 12-D Box
    action space and a structured pixel + state observation dict.

    Selected benchmark tasks (3 short + 2 long):
        Short: PickPlaceCounterToCabinet, PrepareToast, CoffeeSetupMug
        Long: PrepareCoffee, RestockPantry
    """

    task: str = "PickPlaceCounterToCabinet"
    tasks: list[str] | None = None  # multi-task: list of task names (without robocasa/ prefix)
    fps: int = 20
    episode_length: int = 500
    image_size: int = 128
    split: str = "target"  # "pretrain" or "target"
    # Only the action feature is declared here; camera and state features
    # are filled in by __post_init__ because they depend on image_size.
    features: dict[str, PolicyFeature] = field(
        default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
    )
    features_map: dict[str, str] = field(
        default_factory=lambda: {
            ACTION: ACTION,
            **{
                cam: f"{OBS_IMAGES}.{cam}"
                for cam in ("agentview_left", "agentview_right", "eye_in_hand")
            },
            "robot_state": OBS_STATE,
        }
    )

    def __post_init__(self):
        """Register the three square camera features and the 16-D state feature."""
        frame_shape = (self.image_size, self.image_size, 3)
        self.features.update(
            {
                cam: PolicyFeature(type=FeatureType.VISUAL, shape=frame_shape)
                for cam in ("agentview_left", "agentview_right", "eye_in_hand")
            }
        )
        self.features["robot_state"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))

    @property
    def gym_kwargs(self) -> dict:
        """Keyword arguments exposed for environment construction (only ``split``)."""
        return dict(split=self.split)
@EnvConfig.register_subclass("robomme")
@dataclass
class RoboMMEEnv(EnvConfig):
    """Configuration for the RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN).

    16 tasks across 4 suites: Counting, Permanence, Reference, Imitation.
    Uses BenchmarkEnvBuilder from the robomme package.
    """

    task: str = "PickXtimes"
    fps: int = 10
    episode_length: int = 300
    action_space: str = "joint_angle"
    dataset_split: str = "test"
    task_ids: list[int] | None = None
    features: dict[str, PolicyFeature] = field(
        default_factory=lambda: {
            ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(8,)),
            **{
                cam: PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3))
                for cam in ("front_rgb", "wrist_rgb")
            },
            OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,)),
        }
    )
    features_map: dict[str, str] = field(
        default_factory=lambda: {
            ACTION: ACTION,
            "front_rgb": f"{OBS_IMAGES}.front",
            "wrist_rgb": f"{OBS_IMAGES}.wrist",
            OBS_STATE: OBS_STATE,
        }
    )

    @property
    def gym_kwargs(self) -> dict:
        """Keyword arguments exposed for environment construction.

        Note the key rename: ``dataset_split`` is exported under ``"dataset"``.
        """
        return dict(action_space=self.action_space, dataset=self.dataset_split)
@EnvConfig.register_subclass("metaworld")
@dataclass
class MetaworldEnv(EnvConfig):
+3 -50
View File
@@ -20,21 +20,11 @@ import gymnasium as gym
from gymnasium.envs.registration import registry as gym_registry
from lerobot.configs.policies import PreTrainedConfig
from lerobot.envs.configs import (
AlohaEnv,
EnvConfig,
HubEnvConfig,
IsaaclabArenaEnv,
LiberoEnv,
LiberoPlusEnv,
PushtEnv,
RoboCasaEnv,
RoboMMEEnv,
)
from lerobot.envs.configs import AlohaEnv, EnvConfig, HubEnvConfig, IsaaclabArenaEnv, LiberoEnv, PushtEnv
from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
from lerobot.processor import ProcessorStep
from lerobot.processor.env_processor import IsaaclabArenaProcessorStep, LiberoProcessorStep, RoboCasaProcessorStep
from lerobot.processor.env_processor import IsaaclabArenaProcessorStep, LiberoProcessorStep
from lerobot.processor.pipeline import PolicyProcessorPipeline
@@ -45,12 +35,6 @@ def make_env_config(env_type: str, **kwargs) -> EnvConfig:
return PushtEnv(**kwargs)
elif env_type == "libero":
return LiberoEnv(**kwargs)
elif env_type == "libero_plus":
return LiberoPlusEnv(**kwargs)
elif env_type == "robocasa":
return RoboCasaEnv(**kwargs)
elif env_type == "robomme":
return RoboMMEEnv(**kwargs)
else:
raise ValueError(f"Policy type '{env_type}' is not available.")
@@ -86,13 +70,9 @@ def make_env_pre_post_processors(
return make_xvla_libero_pre_post_processors()
# For LIBERO environments, add the LiberoProcessorStep to preprocessor
if isinstance(env_cfg, (LiberoEnv, LiberoPlusEnv)) or "libero" in env_cfg.type:
if isinstance(env_cfg, LiberoEnv) or "libero" in env_cfg.type:
preprocessor_steps.append(LiberoProcessorStep())
# For RoboCasa environments, add the RoboCasaProcessorStep to preprocessor
if isinstance(env_cfg, RoboCasaEnv) or "robocasa" in env_cfg.type:
preprocessor_steps.append(RoboCasaProcessorStep())
# For Isaaclab Arena environments, add the IsaaclabArenaProcessorStep
if isinstance(env_cfg, IsaaclabArenaEnv) or "isaaclab_arena" in env_cfg.type:
# Parse comma-separated keys (handle None for state-based policies)
@@ -201,33 +181,6 @@ def make_env(
control_mode=cfg.control_mode,
episode_length=cfg.episode_length,
)
elif "robocasa" in cfg.type:
from lerobot.envs.robocasa import create_robocasa_envs
tasks = cfg.tasks if cfg.tasks else [cfg.task]
return create_robocasa_envs(
tasks=tasks,
n_envs=n_envs,
image_size=cfg.image_size,
split=cfg.split,
episode_length=cfg.episode_length,
gym_kwargs=cfg.gym_kwargs,
env_cls=env_cls,
)
elif "robomme" in cfg.type:
from lerobot.envs.robomme import create_robomme_envs
return create_robomme_envs(
task=cfg.task,
n_envs=n_envs,
action_space_type=cfg.action_space,
dataset=cfg.dataset_split,
episode_length=cfg.episode_length,
task_ids=cfg.task_ids,
env_cls=env_cls,
)
elif "metaworld" in cfg.type:
from lerobot.envs.metaworld import create_metaworld_envs
+4 -15
View File
@@ -26,14 +26,8 @@ import gymnasium as gym
import numpy as np
import torch
from gymnasium import spaces
try:
from libero.libero import benchmark, get_libero_path
from libero.libero.envs import OffScreenRenderEnv
except ImportError:
# LIBERO-plus may be installed from source with an extra nested package level.
from libero.libero.libero import benchmark, get_libero_path
from libero.libero.libero.envs import OffScreenRenderEnv
from libero.libero import benchmark, get_libero_path
from libero.libero.envs import OffScreenRenderEnv
from lerobot.processor import RobotObservation
@@ -118,7 +112,6 @@ class LiberoEnv(gym.Env):
visualization_height: int = 480,
init_states: bool = True,
episode_index: int = 0,
n_envs: int = 1,
camera_name_mapping: dict[str, str] | None = None,
num_steps_wait: int = 10,
control_mode: str = "relative",
@@ -152,9 +145,7 @@ class LiberoEnv(gym.Env):
self.episode_length = episode_length
# Load once and keep
self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None
self._reset_stride = n_envs # when performing a reset, append `_reset_stride` to `init_state_id`.
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state
self._init_state_id = self.episode_index # tie each sub-env to a fixed init state
self._env = self._make_envs_task(task_suite, self.task_id)
default_steps = 500
@@ -304,8 +295,7 @@ class LiberoEnv(gym.Env):
self._env.seed(seed)
raw_obs = self._env.reset()
if self.init_states and self._init_states is not None:
raw_obs = self._env.set_init_state(self._init_states[self.init_state_id % len(self._init_states)])
self.init_state_id += self._reset_stride # Change init_state_id when reset
raw_obs = self._env.set_init_state(self._init_states[self._init_state_id])
# After reset, objects may be unstable (slightly floating, intersecting, etc.).
# Step the simulator with a no-op action for a few frames so everything settles.
@@ -383,7 +373,6 @@ def _make_env_fns(
init_states=init_states,
episode_length=episode_length,
episode_index=episode_index,
n_envs=n_envs,
control_mode=control_mode,
**local_kwargs,
)
-273
View File
@@ -1,273 +0,0 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from collections import defaultdict
from collections.abc import Callable, Sequence
from functools import partial
from typing import Any
import gymnasium as gym
import numpy as np
from gymnasium import spaces
# Action layout (flat 12D, normalized to [-1, 1]):
#   [0:3]   end_effector_position (delta x, y, z)
#   [3:6]   end_effector_rotation (delta roll, pitch, yaw)
#   [6:7]   gripper_close         (open=-1, close=+1)
#   [7:11]  base_motion           (x, y, theta, torso_height)
#   [11:12] control_mode          (arm=-1, base=+1)
ACTION_DIM = 12
ACTION_LOW = -1.0
ACTION_HIGH = 1.0

# Proprioceptive state layout (flat 16D):
#   [0:2]   gripper_qpos
#   [2:5]   base_position
#   [5:9]   base_rotation (quaternion)
#   [9:12]  end_effector_position_relative
#   [12:16] end_effector_rotation_relative (quaternion)
STATE_DIM = 16

# Obs dict keys from RoboCasaGymEnv.get_observation()
_CAM_KEYS = (
    "video.robot0_agentview_left",
    "video.robot0_agentview_right",
    "video.robot0_eye_in_hand",
)

# Order matters: the flat state vector is the concatenation in this order.
_STATE_KEYS_ORDERED = (
    "state.gripper_qpos",  # (2,)
    "state.base_position",  # (3,)
    "state.base_rotation",  # (4,)
    "state.end_effector_position_relative",  # (3,)
    "state.end_effector_rotation_relative",  # (4,)
)

# Mapping from video.* key → short image name used in features_map
# (the short name is the key with its "video.robot0_" prefix removed).
CAM_KEY_TO_NAME = {cam_key: cam_key.removeprefix("video.robot0_") for cam_key in _CAM_KEYS}
def _flat_to_action_dict(flat: np.ndarray) -> dict[str, np.ndarray]:
"""Convert a 12D flat action array to the Dict format expected by RoboCasaGymEnv."""
return {
"action.end_effector_position": flat[0:3],
"action.end_effector_rotation": flat[3:6],
"action.gripper_close": flat[6:7],
"action.base_motion": flat[7:11],
"action.control_mode": flat[11:12],
}
class RoboCasaEnv(gym.Env):
    """Thin wrapper around RoboCasaGymEnv that provides a flat Box action space
    and a structured observation dict compatible with LeRobot policies.

    Observations returned by step/reset:
        {
            "pixels": {
                "agentview_left": (H, W, 3) uint8,
                "agentview_right": (H, W, 3) uint8,
                "eye_in_hand": (H, W, 3) uint8,
            },
            "robot_state": (16,) float32,
        }

    Actions: flat float32 ndarray of shape (12,), normalized to [-1, 1].
    """

    metadata = {"render_modes": ["rgb_array"], "render_fps": 20}

    def __init__(
        self,
        task: str,
        split: str = "target",
        image_size: int = 128,
        render_mode: str = "rgb_array",
        episode_length: int = 500,
        **gym_kwargs: Any,
    ):
        """Create the underlying ``robocasa/<task>`` gym environment.

        Args:
            task: Task name without the ``robocasa/`` prefix.
            split: Forwarded to ``gym.make`` as ``split``.
            image_size: Used for both camera width and height.
            render_mode: Only ``"rgb_array"`` makes ``render()`` return a frame.
            episode_length: Step count after which ``step`` reports truncation.
            **gym_kwargs: Extra keyword arguments forwarded to ``gym.make``.
        """
        super().__init__()
        # Lazy import — robocasa is optional
        import robocasa.environments  # noqa: F401 — registers all gym envs

        self.task = task
        self.render_mode = render_mode
        self.image_size = image_size
        self._max_episode_steps = episode_length
        self._step_count = 0

        self._env = gym.make(
            f"robocasa/{task}",
            split=split,
            camera_widths=image_size,
            camera_heights=image_size,
            **gym_kwargs,
        )

        # Flat 12D Box action space
        self.action_space = spaces.Box(
            low=ACTION_LOW,
            high=ACTION_HIGH,
            shape=(ACTION_DIM,),
            dtype=np.float32,
        )

        images = {
            name: spaces.Box(low=0, high=255, shape=(image_size, image_size, 3), dtype=np.uint8)
            for name in CAM_KEY_TO_NAME.values()
        }
        self.observation_space = spaces.Dict(
            {
                "pixels": spaces.Dict(images),
                "robot_state": spaces.Box(low=-np.inf, high=np.inf, shape=(STATE_DIM,), dtype=np.float32),
            }
        )

    def _format_obs(self, raw_obs: dict) -> dict:
        """Convert a raw observation dict into ``{"pixels": ..., "robot_state": ...}``.

        Camera and state keys that are absent from ``raw_obs`` are skipped;
        if no state key is present at all, a zero vector of length
        ``STATE_DIM`` is returned instead.
        """
        pixels = {CAM_KEY_TO_NAME[k]: raw_obs[k] for k in _CAM_KEYS if k in raw_obs}
        state_parts = [np.asarray(raw_obs[k], dtype=np.float32) for k in _STATE_KEYS_ORDERED if k in raw_obs]
        robot_state = np.concatenate(state_parts) if state_parts else np.zeros(STATE_DIM, dtype=np.float32)
        return {"pixels": pixels, "robot_state": robot_state}

    def reset(self, seed: int | None = None, **kwargs) -> tuple[dict, dict]:
        """Reset the wrapped environment and the internal step counter.

        Args:
            seed: Forwarded to both ``gym.Env.reset`` and the wrapped env.
            **kwargs: An ``options`` entry, when not ``None``, is forwarded to
                the wrapped env's ``reset``; other entries are ignored.

        Returns:
            ``(observation, info)`` where ``info`` always carries ``"task"``
            and an ``"is_success"`` flag (defaulting to ``False``).
        """
        super().reset(seed=seed)
        self._step_count = 0
        options = kwargs.get("options")
        if options is None:
            raw_obs, info = self._env.reset(seed=seed)
        else:
            # Previously dropped silently; pass reset options through.
            raw_obs, info = self._env.reset(seed=seed, options=options)
        info.setdefault("is_success", False)
        info["task"] = self.task
        return self._format_obs(raw_obs), info

    def step(self, action: np.ndarray) -> tuple[dict, float, bool, bool, dict]:
        """Apply one flat 12D action.

        Args:
            action: Array-like of shape ``(12,)``; it is converted to a
                float32 ndarray before validation.

        Returns:
            The usual ``(obs, reward, terminated, truncated, info)`` tuple.
            ``terminated`` is also set when ``info["success"]`` is truthy, and
            ``truncated`` once the step budget is used up.

        Raises:
            ValueError: If the action is not 1-D with ``ACTION_DIM`` entries.
        """
        # Accept lists/tuples as well as ndarrays so that the shape check
        # below reports a ValueError instead of failing on a missing ``ndim``.
        action = np.asarray(action, dtype=np.float32)
        if action.ndim != 1 or action.shape[0] != ACTION_DIM:
            raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")

        action_dict = _flat_to_action_dict(action)
        raw_obs, reward, terminated, truncated, info = self._env.step(action_dict)
        self._step_count += 1

        is_success = bool(info.get("success", False))
        terminated = terminated or is_success
        if self._step_count >= self._max_episode_steps:
            truncated = True

        info.update({"task": self.task, "is_success": is_success})
        obs = self._format_obs(raw_obs)
        if terminated or truncated:
            info["final_info"] = {"task": self.task, "is_success": is_success}
        return obs, reward, terminated, truncated, info

    def render(self) -> np.ndarray | None:
        """Return the wrapped env's rendering in ``rgb_array`` mode, else ``None``."""
        if self.render_mode == "rgb_array":
            return self._env.render()
        return None

    def close(self) -> None:
        """Close the wrapped environment."""
        self._env.close()
def _make_env_fns(
*,
task: str,
n_envs: int,
image_size: int,
split: str,
episode_length: int,
gym_kwargs: dict[str, Any],
) -> list[Callable[[], RoboCasaEnv]]:
"""Build n_envs factory callables for a single task."""
def _make(episode_index: int) -> RoboCasaEnv: # noqa: ARG001
return RoboCasaEnv(
task=task,
split=split,
image_size=image_size,
episode_length=episode_length,
**gym_kwargs,
)
return [partial(_make, i) for i in range(n_envs)]
def create_robocasa_envs(
    tasks: str | Sequence[str],
    n_envs: int,
    image_size: int = 128,
    split: str = "target",
    episode_length: int = 500,
    gym_kwargs: dict[str, Any] | None = None,
    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
) -> dict[str, dict[int, Any]]:
    """Create vectorized RoboCasa environments.

    Args:
        tasks: A single task name, a comma-separated string of task names, or a
            sequence of task names (without "robocasa/" prefix).
            E.g. "PickPlaceCounterToCabinet" or ["BoilPot", "PrepareCoffee"].
        n_envs: Number of parallel envs per task.
        image_size: Square image resolution for all cameras.
        split: RoboCasa dataset split "pretrain" or "target".
        episode_length: Max steps per episode before truncation.
        gym_kwargs: Extra kwargs forwarded to each RoboCasaEnv.
        env_cls: Callable to wrap list of factory fns (SyncVectorEnv or AsyncVectorEnv).

    Returns:
        ``{"robocasa": {index: vec_env}}`` where ``index`` is the position of
        the task in the (cleaned) task list. The outer key is always the
        literal suite name ``"robocasa"``, not the task name.

    Raises:
        ValueError: If ``env_cls`` is missing or not callable, ``n_envs`` is
            not a positive int, or no non-empty task name is given.
    """
    if env_cls is None or not callable(env_cls):
        raise ValueError("env_cls must be a callable wrapping a list of env factory callables.")
    if not isinstance(n_envs, int) or n_envs <= 0:
        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")

    # Normalise to a list of stripped, non-empty names.
    raw_names = tasks.split(",") if isinstance(tasks, str) else [str(t) for t in tasks]
    task_list = [name.strip() for name in raw_names if name.strip()]
    if not task_list:
        raise ValueError("`tasks` must contain at least one task name.")

    gym_kwargs = dict(gym_kwargs or {})
    vec_envs: dict[int, Any] = {}

    print(f"Creating RoboCasa envs | tasks={task_list} | n_envs(per task)={n_envs} | split={split}")
    for index, task in enumerate(task_list):
        fns = _make_env_fns(
            task=task,
            n_envs=n_envs,
            image_size=image_size,
            split=split,
            episode_length=episode_length,
            gym_kwargs=gym_kwargs,
        )
        vec_envs[index] = env_cls(fns)
        print(f" Built vec env | task={task} | n_envs={n_envs}")

    return {"robocasa": vec_envs}
-154
View File
@@ -1,154 +0,0 @@
"""RoboMME environment wrapper for LeRobot evaluation.
Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible
``VectorEnv`` suitable for ``lerobot_eval``.
RoboMME tasks:
Counting: BinFill, PickXtimes, SwingXtimes, StopCube
Permanence: VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap
Reference: PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder
Imitation: MoveCube, InsertPeg, PatternLock, RouteStick
Install: pip install robomme (or from source: https://github.com/RoboMME/robomme_benchmark)
"""
from __future__ import annotations
from typing import Any
import gymnasium as gym
import numpy as np
from gymnasium import spaces
# The 16 RoboMME task names, grouped by suite (four tasks each).
ROBOMME_TASKS = [
    # Counting
    "BinFill",
    "PickXtimes",
    "SwingXtimes",
    "StopCube",
    # Permanence
    "VideoUnmask",
    "VideoUnmaskSwap",
    "ButtonUnmask",
    "ButtonUnmaskSwap",
    # Reference
    "PickHighlight",
    "VideoRepick",
    "VideoPlaceButton",
    "VideoPlaceOrder",
    # Imitation
    "MoveCube",
    "InsertPeg",
    "PatternLock",
    "RouteStick",
]
class RoboMMEGymEnv(gym.Env):
    """Thin Gymnasium wrapper around a single RoboMME episode env."""

    metadata = {"render_modes": ["rgb_array"]}

    def __init__(
        self,
        task: str = "PickXtimes",
        action_space_type: str = "joint_angle",
        dataset: str = "test",
        episode_idx: int = 0,
        max_steps: int = 300,
    ):
        """Prepare the episode builder; the actual env is created in ``reset``.

        Args:
            task: RoboMME task name, passed to the builder as ``env_id``.
            action_space_type: ``"joint_angle"`` yields an 8-D action space,
                any other value a 7-D one.
            dataset: Dataset split name passed to the builder.
            episode_idx: Episode index used for every ``reset``.
            max_steps: Step budget passed to the builder.
        """
        super().__init__()
        # Lazy import so the module can be imported without robomme installed.
        from robomme.env_record_wrapper import BenchmarkEnvBuilder

        self._task = task
        self._action_space_type = action_space_type
        self._dataset = dataset
        self._episode_idx = episode_idx
        self._max_steps = max_steps

        self._builder = BenchmarkEnvBuilder(
            env_id=task,
            dataset=dataset,
            action_space=action_space_type,
            gui_render=False,
            max_steps=max_steps,
        )
        self._env = None

        action_dim = 8 if action_space_type == "joint_angle" else 7
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
        self.observation_space = spaces.Dict(
            {
                "front_rgb": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
                "wrist_rgb": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
                "state": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
            }
        )

    @staticmethod
    def _close_env(env) -> None:
        """Call ``env.close()`` when the object provides one."""
        closer = getattr(env, "close", None)
        if callable(closer):
            closer()

    @staticmethod
    def _latest(value):
        """Return the last element of a list, or the value itself otherwise."""
        return value[-1] if isinstance(value, list) else value

    def reset(self, *, seed=None, options=None):
        """Build a fresh episode env, reset it, and return ``(obs, info)``."""
        super().reset(seed=seed)
        previous = self._env
        self._env = self._builder.make_env_for_episode(
            episode_idx=self._episode_idx,
            max_steps=self._max_steps,
        )
        # Release the env of the previous episode so repeated resets do not
        # accumulate live envs. Skipped if the builder handed back the same
        # object. NOTE(review): assumes closing an old episode env does not
        # affect the new one — confirm against BenchmarkEnvBuilder.
        if previous is not None and previous is not self._env:
            self._close_env(previous)
        obs, info = self._env.reset()
        return self._convert_obs(obs), self._convert_info(info)

    def step(self, action):
        """Step the episode env and convert its outputs to plain Python types.

        Raises:
            RuntimeError: If called before the first ``reset()``.
        """
        if self._env is None:
            raise RuntimeError("RoboMMEGymEnv.step() called before reset().")

        obs, reward, terminated, truncated, info = self._env.step(action)

        # The flags may come back as array/tensor scalars; unwrap them.
        terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
        truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)

        status = info.get("status", "ongoing")
        is_success = status == "success"

        conv_info = self._convert_info(info)
        conv_info["is_success"] = is_success

        return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info

    def close(self):
        """Close the current episode env, if any."""
        if self._env is not None:
            env, self._env = self._env, None
            self._close_env(env)
        super().close()

    def _convert_obs(self, obs: dict) -> dict:
        """Reduce the ``*_list`` observation entries to their latest value.

        The state vector is the first 7 joint values followed by the first
        gripper value.
        """
        front_rgb = np.asarray(self._latest(obs["front_rgb_list"]), dtype=np.uint8)
        wrist_rgb = np.asarray(self._latest(obs["wrist_rgb_list"]), dtype=np.uint8)
        joint = np.asarray(self._latest(obs["joint_state_list"]), dtype=np.float32).flatten()[:7]
        gripper = np.asarray(self._latest(obs["gripper_state_list"]), dtype=np.float32).flatten()[:1]
        state = np.concatenate([joint, gripper])

        return {
            "front_rgb": front_rgb,
            "wrist_rgb": wrist_rgb,
            "state": state,
        }

    def _convert_info(self, info: dict) -> dict:
        """Keep only ``status`` and ``task_goal`` from the raw info dict."""
        return {
            "status": info.get("status", "ongoing"),
            "task_goal": info.get("task_goal", ""),
        }
def create_robomme_envs(
    task: str,
    n_envs: int = 1,
    action_space_type: str = "joint_angle",
    dataset: str = "test",
    episode_length: int = 300,
    task_ids: list[int] | None = None,
    env_cls=None,
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
    """Build one vectorized RoboMME env per requested task id.

    Each id in ``task_ids`` (default ``[0]``) is used as the episode index of
    all ``n_envs`` sub-envs of its vector env. ``env_cls`` defaults to
    ``gym.vector.SyncVectorEnv`` and is called with same-step autoreset.

    Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format.
    """
    vector_cls = gym.vector.SyncVectorEnv if env_cls is None else env_cls
    episode_ids = [0] if task_ids is None else task_ids

    def _factory_for(ep_idx):
        """Return a zero-argument constructor bound to one episode index."""

        def _build():
            return RoboMMEGymEnv(
                task=task,
                action_space_type=action_space_type,
                dataset=dataset,
                episode_idx=ep_idx,
                max_steps=episode_length,
            )

        return _build

    return {
        "robomme": {
            task_id: vector_cls(
                [_factory_for(task_id)] * n_envs,
                autoreset_mode=gym.vector.AutoresetMode.SAME_STEP,
            )
            for task_id in episode_ids
        }
    }
+4 -6
View File
@@ -221,7 +221,7 @@ class RangeFinderGUI:
self.bus = bus
self.groups = groups if groups is not None else {"all": list(bus.motors)}
self.group_names = list(self.groups)
self.group_names = list(groups)
self.current_group = self.group_names[0]
if not bus.is_connected:
@@ -230,20 +230,18 @@ class RangeFinderGUI:
self.calibration = bus.read_calibration()
self.res_table = bus.model_resolution_table
self.present_cache = {
m: bus.read("Present_Position", m, normalize=False)
for motors in self.groups.values()
for m in motors
m: bus.read("Present_Position", m, normalize=False) for motors in groups.values() for m in motors
}
pygame.init()
self.font = pygame.font.Font(None, FONT_SIZE)
label_pad = max(self.font.size(m)[0] for ms in self.groups.values() for m in ms)
label_pad = max(self.font.size(m)[0] for ms in groups.values() for m in ms)
self.label_pad = label_pad
width = 40 + label_pad + BAR_LEN + 6 + BTN_W + 10 + SAVE_W + 10
self.controls_bottom = 10 + SAVE_H
self.base_y = self.controls_bottom + TOP_GAP
height = self.base_y + PADDING_Y * len(self.groups[self.current_group]) + 40
height = self.base_y + PADDING_Y * len(groups[self.current_group]) + 40
self.screen = pygame.display.set_mode((width, height))
pygame.display.set_caption("Motors range finder")
+15 -41
View File
@@ -23,7 +23,6 @@ from copy import deepcopy
from functools import cached_property
from typing import TYPE_CHECKING, Any, TypedDict
from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
from lerobot.utils.import_utils import _can_available
if TYPE_CHECKING or _can_available:
@@ -37,6 +36,7 @@ else:
import numpy as np
from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
from lerobot.utils.robot_utils import precise_sleep
from lerobot.utils.utils import enter_pressed, move_cursor_up
@@ -155,7 +155,6 @@ class DamiaoMotorsBus(MotorsBusBase):
"""Check if the CAN bus is connected."""
return self._is_connected and self.canbus is not None
@check_if_already_connected
def connect(self, handshake: bool = True) -> None:
"""
Open the CAN bus and initialize communication.
@@ -163,6 +162,10 @@ class DamiaoMotorsBus(MotorsBusBase):
Args:
handshake: If True, ping all motors to verify they're present
"""
if self.is_connected:
raise DeviceAlreadyConnectedError(
f"{self.__class__.__name__}('{self.port}') is already connected."
)
try:
# Auto-detect interface type based on port name
@@ -208,9 +211,6 @@ class DamiaoMotorsBus(MotorsBusBase):
logger.info("Starting handshake with motors...")
# Drain any pending messages
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
while self.canbus.recv(timeout=0.01):
pass
@@ -246,7 +246,6 @@ class DamiaoMotorsBus(MotorsBusBase):
)
logger.info("Handshake successful. All motors ready.")
@check_if_not_connected
def disconnect(self, disable_torque: bool = True) -> None:
"""
Close the CAN bus connection.
@@ -254,6 +253,8 @@ class DamiaoMotorsBus(MotorsBusBase):
Args:
disable_torque: If True, disable torque on all motors before disconnecting
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self.__class__.__name__}('{self.port}') is not connected.")
if disable_torque:
try:
@@ -282,10 +283,6 @@ class DamiaoMotorsBus(MotorsBusBase):
recv_id = self._get_motor_recv_id(motor)
data = [0xFF] * 7 + [command_byte]
msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False, is_fd=self.use_can_fd)
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
self.canbus.send(msg)
if msg := self._recv_motor_response(expected_recv_id=recv_id):
self._process_response(motor_name, msg)
@@ -344,10 +341,6 @@ class DamiaoMotorsBus(MotorsBusBase):
recv_id = self._get_motor_recv_id(motor)
data = [motor_id & 0xFF, (motor_id >> 8) & 0xFF, CAN_CMD_REFRESH, 0, 0, 0, 0, 0]
msg = can.Message(arbitration_id=CAN_PARAM_ID, data=data, is_extended_id=False, is_fd=self.use_can_fd)
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
self.canbus.send(msg)
return self._recv_motor_response(expected_recv_id=recv_id)
@@ -363,10 +356,6 @@ class DamiaoMotorsBus(MotorsBusBase):
Returns:
CAN message if received, None otherwise
"""
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
try:
start_time = time.time()
messages_seen = []
@@ -405,13 +394,10 @@ class DamiaoMotorsBus(MotorsBusBase):
Returns:
Dictionary mapping recv_id to CAN message
"""
responses: dict[int, can.Message] = {}
responses = {}
expected_set = set(expected_recv_ids)
start_time = time.time()
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
try:
while len(responses) < len(expected_recv_ids) and (time.time() - start_time) < timeout:
# 100us poll timeout
@@ -475,9 +461,6 @@ class DamiaoMotorsBus(MotorsBusBase):
motor_name = self._get_motor_name(motor)
motor_type = self._motor_types[motor_name]
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
data = self._encode_mit_packet(motor_type, kp, kd, position_degrees, velocity_deg_per_sec, torque)
msg = can.Message(arbitration_id=motor_id, data=data, is_extended_id=False, is_fd=self.use_can_fd)
self.canbus.send(msg)
@@ -505,9 +488,6 @@ class DamiaoMotorsBus(MotorsBusBase):
recv_id_to_motor: dict[int, str] = {}
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
# Step 1: Send all MIT control commands
for motor, (kp, kd, position_degrees, velocity_deg_per_sec, torque) in commands.items():
motor_id = self._get_motor_id(motor)
@@ -582,9 +562,10 @@ class DamiaoMotorsBus(MotorsBusBase):
except Exception as e:
logger.warning(f"Failed to decode response from {motor}: {e}")
@check_if_not_connected
def read(self, data_name: str, motor: str) -> Value:
"""Read a value from a single motor. Positions are always in degrees."""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
# Refresh motor to get latest state
msg = self._refresh_motor(motor)
@@ -614,7 +595,6 @@ class DamiaoMotorsBus(MotorsBusBase):
raise ValueError(f"Unknown data_name: {data_name}")
return mapping[data_name]
@check_if_not_connected
def write(
self,
data_name: str,
@@ -625,6 +605,8 @@ class DamiaoMotorsBus(MotorsBusBase):
Write a value to a single motor. Positions are always in degrees.
Can write 'Goal_Position', 'Kp', or 'Kd'.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if data_name in ("Kp", "Kd"):
self._gains[motor][data_name.lower()] = float(value)
@@ -674,10 +656,6 @@ class DamiaoMotorsBus(MotorsBusBase):
def _batch_refresh(self, motors: list[str]) -> None:
"""Internal helper to refresh a list of motors and update cache."""
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
# Send refresh commands
for motor in motors:
motor_id = self._get_motor_id(motor)
@@ -700,12 +678,10 @@ class DamiaoMotorsBus(MotorsBusBase):
else:
logger.warning(f"Packet drop: {motor} (ID: 0x{recv_id:02X}). Using last known state.")
@check_if_not_connected
def sync_write(self, data_name: str, values: dict[str, Value]) -> None:
def sync_write(self, data_name: str, values: Value | dict[str, Value]) -> None:
"""
Write values to multiple motors simultaneously. Positions are always in degrees.
"""
if data_name in ("Kp", "Kd"):
key = data_name.lower()
for motor, val in values.items():
@@ -714,8 +690,6 @@ class DamiaoMotorsBus(MotorsBusBase):
elif data_name == "Goal_Position":
# Step 1: Send all MIT control commands
recv_id_to_motor: dict[int, str] = {}
if self.canbus is None:
raise RuntimeError("CAN bus is not initialized.")
for motor, value_degrees in values.items():
motor_id = self._get_motor_id(motor)
motor_name = self._get_motor_name(motor)
@@ -758,9 +732,9 @@ class DamiaoMotorsBus(MotorsBusBase):
def record_ranges_of_motion(
self,
motors: str | list[str] | None = None,
motors: NameOrID | list[NameOrID] | None = None,
display_values: bool = True,
) -> tuple[dict[str, Value], dict[str, Value]]:
) -> tuple[dict[NameOrID, Value], dict[NameOrID, Value]]:
"""
Interactively record the min/max values of each motor in degrees.
+8 -8
View File
@@ -181,10 +181,10 @@ class DynamixelMotorsBus(SerialMotorsBus):
for motor, m in self.motors.items():
calibration[motor] = MotorCalibration(
id=m.id,
drive_mode=int(drive_modes[motor]),
homing_offset=int(offsets[motor]),
range_min=int(mins[motor]),
range_max=int(maxes[motor]),
drive_mode=drive_modes[motor],
homing_offset=offsets[motor],
range_min=mins[motor],
range_max=maxes[motor],
)
return calibration
@@ -198,7 +198,7 @@ class DynamixelMotorsBus(SerialMotorsBus):
if cache:
self.calibration = calibration_dict
def disable_torque(self, motors: int | str | list[str] | None = None, num_retry: int = 0) -> None:
def disable_torque(self, motors: str | list[str] | None = None, num_retry: int = 0) -> None:
for motor in self._get_motors_list(motors):
self.write("Torque_Enable", motor, TorqueMode.DISABLED.value, num_retry=num_retry)
@@ -206,7 +206,7 @@ class DynamixelMotorsBus(SerialMotorsBus):
addr, length = get_address(self.model_ctrl_table, model, "Torque_Enable")
self._write(addr, length, motor, TorqueMode.DISABLED.value, num_retry=num_retry)
def enable_torque(self, motors: int | str | list[str] | None = None, num_retry: int = 0) -> None:
def enable_torque(self, motors: str | list[str] | None = None, num_retry: int = 0) -> None:
for motor in self._get_motors_list(motors):
self.write("Torque_Enable", motor, TorqueMode.ENABLED.value, num_retry=num_retry)
@@ -235,7 +235,7 @@ class DynamixelMotorsBus(SerialMotorsBus):
On Dynamixel Motors:
Present_Position = Actual_Position + Homing_Offset
"""
half_turn_homings: dict[NameOrID, Value] = {}
half_turn_homings = {}
for motor, pos in positions.items():
model = self._get_motor_model(motor)
max_res = self.model_resolution_table[model] - 1
@@ -258,6 +258,6 @@ class DynamixelMotorsBus(SerialMotorsBus):
if raise_on_error:
raise ConnectionError(self.packet_handler.getTxRxResult(comm))
return None
return
return {id_: data[0] for id_, data in data_list.items()}
+9 -9
View File
@@ -126,7 +126,7 @@ class FeetechMotorsBus(SerialMotorsBus):
self.port_handler = scs.PortHandler(self.port)
# HACK: monkeypatch
self.port_handler.setPacketTimeout = patch_setPacketTimeout.__get__( # type: ignore[method-assign]
self.port_handler.setPacketTimeout = patch_setPacketTimeout.__get__(
self.port_handler, scs.PortHandler
)
self.packet_handler = scs.PacketHandler(protocol_version)
@@ -262,9 +262,9 @@ class FeetechMotorsBus(SerialMotorsBus):
calibration[motor] = MotorCalibration(
id=m.id,
drive_mode=0,
homing_offset=int(offsets[motor]),
range_min=int(mins[motor]),
range_max=int(maxes[motor]),
homing_offset=offsets[motor],
range_min=mins[motor],
range_max=maxes[motor],
)
return calibration
@@ -284,7 +284,7 @@ class FeetechMotorsBus(SerialMotorsBus):
On Feetech Motors:
Present_Position = Actual_Position - Homing_Offset
"""
half_turn_homings: dict[NameOrID, Value] = {}
half_turn_homings = {}
for motor, pos in positions.items():
model = self._get_motor_model(motor)
max_res = self.model_resolution_table[model] - 1
@@ -292,7 +292,7 @@ class FeetechMotorsBus(SerialMotorsBus):
return half_turn_homings
def disable_torque(self, motors: int | str | list[str] | None = None, num_retry: int = 0) -> None:
def disable_torque(self, motors: str | list[str] | None = None, num_retry: int = 0) -> None:
for motor in self._get_motors_list(motors):
self.write("Torque_Enable", motor, TorqueMode.DISABLED.value, num_retry=num_retry)
self.write("Lock", motor, 0, num_retry=num_retry)
@@ -303,7 +303,7 @@ class FeetechMotorsBus(SerialMotorsBus):
addr, length = get_address(self.model_ctrl_table, model, "Lock")
self._write(addr, length, motor, 0, num_retry=num_retry)
def enable_torque(self, motors: int | str | list[str] | None = None, num_retry: int = 0) -> None:
def enable_torque(self, motors: str | list[str] | None = None, num_retry: int = 0) -> None:
for motor in self._get_motors_list(motors):
self.write("Torque_Enable", motor, TorqueMode.ENABLED.value, num_retry=num_retry)
self.write("Lock", motor, 1, num_retry=num_retry)
@@ -334,7 +334,7 @@ class FeetechMotorsBus(SerialMotorsBus):
def _broadcast_ping(self) -> tuple[dict[int, int], int]:
import scservo_sdk as scs
data_list: dict[int, int] = {}
data_list = {}
status_length = 6
@@ -414,7 +414,7 @@ class FeetechMotorsBus(SerialMotorsBus):
if not self._is_comm_success(comm):
if raise_on_error:
raise ConnectionError(self.packet_handler.getTxRxResult(comm))
return None
return
ids_errors = {id_: status for id_, status in ids_status.items() if self._is_error(status)}
if ids_errors:
+94 -97
View File
@@ -23,13 +23,12 @@ from __future__ import annotations
import abc
import logging
from collections.abc import Sequence
from contextlib import contextmanager
from dataclasses import dataclass
from enum import Enum
from functools import cached_property
from pprint import pformat
from typing import Protocol
from typing import Protocol, TypeAlias
import serial
from deepdiff import DeepDiff
@@ -38,8 +37,8 @@ from tqdm import tqdm
from lerobot.utils.decorators import check_if_already_connected, check_if_not_connected
from lerobot.utils.utils import enter_pressed, move_cursor_up
type NameOrID = str | int
type Value = int | float
NameOrID: TypeAlias = str | int
Value: TypeAlias = int | float
logger = logging.getLogger(__name__)
@@ -94,7 +93,7 @@ class MotorsBusBase(abc.ABC):
pass
@abc.abstractmethod
def sync_write(self, data_name: str, values: dict[str, Value]) -> None:
def sync_write(self, data_name: str, values: Value | dict[str, Value]) -> None:
"""Write values to multiple motors."""
pass
@@ -180,16 +179,15 @@ class Motor:
class PortHandler(Protocol):
is_open: bool
baudrate: int
packet_start_time: float
packet_timeout: float
tx_time_per_byte: float
is_using: bool
port_name: str
ser: serial.Serial
def __init__(self, port_name: str) -> None: ...
def __init__(self, port_name):
self.is_open: bool
self.baudrate: int
self.packet_start_time: float
self.packet_timeout: float
self.tx_time_per_byte: float
self.is_using: bool
self.port_name: str
self.ser: serial.Serial
def openPort(self): ...
def closePort(self): ...
@@ -242,22 +240,19 @@ class PacketHandler(Protocol):
def regWriteTxRx(self, port, id, address, length, data): ...
def syncReadTx(self, port, start_address, data_length, param, param_length): ...
def syncWriteTxOnly(self, port, start_address, data_length, param, param_length): ...
def broadcastPing(self, port): ...
class GroupSyncRead(Protocol):
port: str
ph: PortHandler
start_address: int
data_length: int
last_result: bool
is_param_changed: bool
param: list
data_dict: dict
def __init__(self, port, ph, start_address, data_length):
self.port: str
self.ph: PortHandler
self.start_address: int
self.data_length: int
self.last_result: bool
self.is_param_changed: bool
self.param: list
self.data_dict: dict
def __init__(
self, port: PortHandler, ph: PacketHandler, start_address: int, data_length: int
) -> None: ...
def makeParam(self): ...
def addParam(self, id): ...
def removeParam(self, id): ...
@@ -270,17 +265,15 @@ class GroupSyncRead(Protocol):
class GroupSyncWrite(Protocol):
port: str
ph: PortHandler
start_address: int
data_length: int
is_param_changed: bool
param: list
data_dict: dict
def __init__(self, port, ph, start_address, data_length):
self.port: str
self.ph: PortHandler
self.start_address: int
self.data_length: int
self.is_param_changed: bool
self.param: list
self.data_dict: dict
def __init__(
self, port: PortHandler, ph: PacketHandler, start_address: int, data_length: int
) -> None: ...
def makeParam(self): ...
def addParam(self, id, data): ...
def removeParam(self, id): ...
@@ -407,7 +400,7 @@ class SerialMotorsBus(MotorsBusBase):
else:
raise TypeError(f"'{motor}' should be int, str.")
def _get_motor_model(self, motor: NameOrID) -> str:
def _get_motor_model(self, motor: NameOrID) -> int:
if isinstance(motor, str):
return self.motors[motor].model
elif isinstance(motor, int):
@@ -415,19 +408,17 @@ class SerialMotorsBus(MotorsBusBase):
else:
raise TypeError(f"'{motor}' should be int, str.")
def _get_motors_list(self, motors: NameOrID | Sequence[NameOrID] | None) -> list[str]:
def _get_motors_list(self, motors: str | list[str] | None) -> list[str]:
if motors is None:
return list(self.motors)
elif isinstance(motors, str):
return [motors]
elif isinstance(motors, int):
return [self._id_to_name(motors)]
elif isinstance(motors, Sequence):
return [m if isinstance(m, str) else self._id_to_name(m) for m in motors]
elif isinstance(motors, list):
return motors.copy()
else:
raise TypeError(motors)
def _get_ids_values_dict(self, values: Value | dict[str, Value] | None) -> dict[int, Value]:
def _get_ids_values_dict(self, values: Value | dict[str, Value] | None) -> list[str]:
if isinstance(values, (int | float)):
return dict.fromkeys(self.ids, values)
elif isinstance(values, dict):
@@ -649,19 +640,18 @@ class SerialMotorsBus(MotorsBusBase):
pass
@abc.abstractmethod
def enable_torque(self, motors: int | str | list[str] | None = None, num_retry: int = 0) -> None:
def enable_torque(self, motors: str | list[str] | None = None, num_retry: int = 0) -> None:
"""Enable torque on selected motors.
Args:
motors (int | str | list[str] | None, optional): Same semantics as :pymeth:`disable_torque`.
Defaults to `None`.
motor (int): Same semantics as :pymeth:`disable_torque`. Defaults to `None`.
num_retry (int, optional): Number of additional retry attempts on communication failure.
Defaults to 0.
"""
pass
@contextmanager
def torque_disabled(self, motors: str | list[str] | None = None):
def torque_disabled(self, motors: int | str | list[str] | None = None):
"""Context-manager that guarantees torque is re-enabled.
This helper is useful to temporarily disable torque when configuring motors.
@@ -738,19 +728,24 @@ class SerialMotorsBus(MotorsBusBase):
"""
pass
def reset_calibration(self, motors: NameOrID | Sequence[NameOrID] | None = None) -> None:
def reset_calibration(self, motors: NameOrID | list[NameOrID] | None = None) -> None:
"""Restore factory calibration for the selected motors.
Homing offset is set to ``0`` and min/max position limits are set to the full usable range.
The in-memory :pyattr:`calibration` is cleared.
Args:
motors (NameOrID | Sequence[NameOrID] | None, optional): Selection of motors. `None` (default)
motors (NameOrID | list[NameOrID] | None, optional): Selection of motors. `None` (default)
resets every motor.
"""
motor_names = self._get_motors_list(motors)
if motors is None:
motors = list(self.motors)
elif isinstance(motors, (str | int)):
motors = [motors]
elif not isinstance(motors, list):
raise TypeError(motors)
for motor in motor_names:
for motor in motors:
model = self._get_motor_model(motor)
max_res = self.model_resolution_table[model] - 1
self.write("Homing_Offset", motor, 0, normalize=False)
@@ -759,9 +754,7 @@ class SerialMotorsBus(MotorsBusBase):
self.calibration = {}
def set_half_turn_homings(
self, motors: NameOrID | Sequence[NameOrID] | None = None
) -> dict[NameOrID, Value]:
def set_half_turn_homings(self, motors: NameOrID | list[NameOrID] | None = None) -> dict[NameOrID, Value]:
"""Centre each motor range around its current position.
The function computes and writes a homing offset such that the present position becomes exactly one
@@ -771,12 +764,17 @@ class SerialMotorsBus(MotorsBusBase):
motors (NameOrID | list[NameOrID] | None, optional): Motors to adjust. Defaults to all motors (`None`).
Returns:
dict[str, Value]: Mapping *motor name written homing offset*.
dict[NameOrID, Value]: Mapping *motor written homing offset*.
"""
motor_names = self._get_motors_list(motors)
if motors is None:
motors = list(self.motors)
elif isinstance(motors, (str | int)):
motors = [motors]
elif not isinstance(motors, list):
raise TypeError(motors)
self.reset_calibration(motor_names)
actual_positions = self.sync_read("Present_Position", motor_names, normalize=False)
self.reset_calibration(motors)
actual_positions = self.sync_read("Present_Position", motors, normalize=False)
homing_offsets = self._get_half_turn_homings(actual_positions)
for motor, offset in homing_offsets.items():
self.write("Homing_Offset", motor, offset)
@@ -788,8 +786,8 @@ class SerialMotorsBus(MotorsBusBase):
pass
def record_ranges_of_motion(
self, motors: NameOrID | Sequence[NameOrID] | None = None, display_values: bool = True
) -> tuple[dict[str, Value], dict[str, Value]]:
self, motors: NameOrID | list[NameOrID] | None = None, display_values: bool = True
) -> tuple[dict[NameOrID, Value], dict[NameOrID, Value]]:
"""Interactively record the min/max encoder values of each motor.
Move the joints by hand (with torque disabled) while the method streams live positions. Press
@@ -801,25 +799,30 @@ class SerialMotorsBus(MotorsBusBase):
display_values (bool, optional): When `True` (default) a live table is printed to the console.
Returns:
tuple[dict[str, Value], dict[str, Value]]: Two dictionaries *mins* and *maxes* with the
tuple[dict[NameOrID, Value], dict[NameOrID, Value]]: Two dictionaries *mins* and *maxes* with the
extreme values observed for each motor.
"""
motor_names = self._get_motors_list(motors)
if motors is None:
motors = list(self.motors)
elif isinstance(motors, (str | int)):
motors = [motors]
elif not isinstance(motors, list):
raise TypeError(motors)
start_positions = self.sync_read("Present_Position", motor_names, normalize=False)
start_positions = self.sync_read("Present_Position", motors, normalize=False)
mins = start_positions.copy()
maxes = start_positions.copy()
user_pressed_enter = False
while not user_pressed_enter:
positions = self.sync_read("Present_Position", motor_names, normalize=False)
positions = self.sync_read("Present_Position", motors, normalize=False)
mins = {motor: min(positions[motor], min_) for motor, min_ in mins.items()}
maxes = {motor: max(positions[motor], max_) for motor, max_ in maxes.items()}
if display_values:
print("\n-------------------------------------------")
print(f"{'NAME':<15} | {'MIN':>6} | {'POS':>6} | {'MAX':>6}")
for motor in motor_names:
for motor in motors:
print(f"{motor:<15} | {mins[motor]:>6} | {positions[motor]:>6} | {maxes[motor]:>6}")
if enter_pressed():
@@ -827,9 +830,9 @@ class SerialMotorsBus(MotorsBusBase):
if display_values and not user_pressed_enter:
# Move cursor up to overwrite the previous output
move_cursor_up(len(motor_names) + 3)
move_cursor_up(len(motors) + 3)
same_min_max = [motor for motor in motor_names if mins[motor] == maxes[motor]]
same_min_max = [motor for motor in motors if mins[motor] == maxes[motor]]
if same_min_max:
raise ValueError(f"Some motors have the same min and max values:\n{pformat(same_min_max)}")
@@ -952,12 +955,12 @@ class SerialMotorsBus(MotorsBusBase):
if raise_on_error:
raise ConnectionError(self.packet_handler.getTxRxResult(comm))
else:
return None
return
if self._is_error(error):
if raise_on_error:
raise RuntimeError(self.packet_handler.getRxPacketError(error))
else:
return None
return
return model_number
@@ -1004,13 +1007,12 @@ class SerialMotorsBus(MotorsBusBase):
err_msg = f"Failed to read '{data_name}' on {id_=} after {num_retry + 1} tries."
value, _, _ = self._read(addr, length, id_, num_retry=num_retry, raise_on_error=True, err_msg=err_msg)
decoded = self._decode_sign(data_name, {id_: value})
id_value = self._decode_sign(data_name, {id_: value})
if normalize and data_name in self.normalized_data:
normalized = self._normalize(decoded)
return normalized[id_]
id_value = self._normalize(id_value)
return decoded[id_]
return id_value[id_]
def _read(
self,
@@ -1021,7 +1023,7 @@ class SerialMotorsBus(MotorsBusBase):
num_retry: int = 0,
raise_on_error: bool = True,
err_msg: str = "",
) -> tuple[int, int, int]:
) -> tuple[int, int]:
if length == 1:
read_fn = self.packet_handler.read1ByteTxRx
elif length == 2:
@@ -1071,14 +1073,13 @@ class SerialMotorsBus(MotorsBusBase):
model = self.motors[motor].model
addr, length = get_address(self.model_ctrl_table, model, data_name)
int_value = int(value)
if normalize and data_name in self.normalized_data:
int_value = self._unnormalize({id_: value})[id_]
value = self._unnormalize({id_: value})[id_]
int_value = self._encode_sign(data_name, {id_: int_value})[id_]
value = self._encode_sign(data_name, {id_: value})[id_]
err_msg = f"Failed to write '{data_name}' on {id_=} with '{int_value}' after {num_retry + 1} tries."
self._write(addr, length, id_, int_value, num_retry=num_retry, raise_on_error=True, err_msg=err_msg)
err_msg = f"Failed to write '{data_name}' on {id_=} with '{value}' after {num_retry + 1} tries."
self._write(addr, length, id_, value, num_retry=num_retry, raise_on_error=True, err_msg=err_msg)
def _write(
self,
@@ -1112,7 +1113,7 @@ class SerialMotorsBus(MotorsBusBase):
def sync_read(
self,
data_name: str,
motors: NameOrID | Sequence[NameOrID] | None = None,
motors: str | list[str] | None = None,
*,
normalize: bool = True,
num_retry: int = 0,
@@ -1121,7 +1122,7 @@ class SerialMotorsBus(MotorsBusBase):
Args:
data_name (str): Register name.
motors (NameOrID | Sequence[NameOrID] | None, optional): Motors to query. `None` (default) reads every motor.
motors (str | list[str] | None, optional): Motors to query. `None` (default) reads every motor.
normalize (bool, optional): Normalisation flag. Defaults to `True`.
num_retry (int, optional): Retry attempts. Defaults to `0`.
@@ -1142,17 +1143,16 @@ class SerialMotorsBus(MotorsBusBase):
addr, length = get_address(self.model_ctrl_table, model, data_name)
err_msg = f"Failed to sync read '{data_name}' on {ids=} after {num_retry + 1} tries."
raw_ids_values, _ = self._sync_read(
ids_values, _ = self._sync_read(
addr, length, ids, num_retry=num_retry, raise_on_error=True, err_msg=err_msg
)
decoded = self._decode_sign(data_name, raw_ids_values)
ids_values = self._decode_sign(data_name, ids_values)
if normalize and data_name in self.normalized_data:
normalized = self._normalize(decoded)
return {self._id_to_name(id_): value for id_, value in normalized.items()}
ids_values = self._normalize(ids_values)
return {self._id_to_name(id_): value for id_, value in decoded.items()}
return {self._id_to_name(id_): value for id_, value in ids_values.items()}
def _sync_read(
self,
@@ -1224,24 +1224,21 @@ class SerialMotorsBus(MotorsBusBase):
num_retry (int, optional): Retry attempts. Defaults to `0`.
"""
raw_ids_values = self._get_ids_values_dict(values)
models = [self._id_to_model(id_) for id_ in raw_ids_values]
ids_values = self._get_ids_values_dict(values)
models = [self._id_to_model(id_) for id_ in ids_values]
if self._has_different_ctrl_tables:
assert_same_address(self.model_ctrl_table, models, data_name)
model = next(iter(models))
addr, length = get_address(self.model_ctrl_table, model, data_name)
int_ids_values = {id_: int(val) for id_, val in raw_ids_values.items()}
if normalize and data_name in self.normalized_data:
int_ids_values = self._unnormalize(raw_ids_values)
ids_values = self._unnormalize(ids_values)
int_ids_values = self._encode_sign(data_name, int_ids_values)
ids_values = self._encode_sign(data_name, ids_values)
err_msg = f"Failed to sync write '{data_name}' with ids_values={int_ids_values} after {num_retry + 1} tries."
self._sync_write(
addr, length, int_ids_values, num_retry=num_retry, raise_on_error=True, err_msg=err_msg
)
err_msg = f"Failed to sync write '{data_name}' with {ids_values=} after {num_retry + 1} tries."
self._sync_write(addr, length, ids_values, num_retry=num_retry, raise_on_error=True, err_msg=err_msg)
def _sync_write(
self,
@@ -1277,4 +1274,4 @@ class SerialMotorsBus(MotorsBusBase):
# Backward compatibility alias
MotorsBus = SerialMotorsBus
MotorsBus: TypeAlias = SerialMotorsBus
File diff suppressed because it is too large Load Diff
-120
View File
@@ -1,120 +0,0 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Configuration tables for Damiao motors."""
from enum import IntEnum, unique
# Motor type definitions
@unique
class MotorType(IntEnum):
    """Identifier of a supported motor model.

    The integer values are used as keys of the per-model tables in this module
    (``MOTOR_LIMIT_PARAMS``, ``MODEL_NAMES``). ``@unique`` makes an accidental
    duplicate value fail at import time instead of silently creating an alias
    that would collapse two models onto the same table entry.
    """

    O0 = 0
    O1 = 1
    O2 = 2
    O3 = 3
    O4 = 4
    O5 = 5
    ELO5 = 6
    O6 = 7
@unique
class CommMode(IntEnum):
    """Communication protocol selector.

    ``@unique`` guards against two modes accidentally sharing one integer value.
    """

    # NOTE(review): "PrivateProtocole" looks like a misspelling of
    # "PrivateProtocol"; the name is kept as-is because callers may reference it.
    PrivateProtocole = 0
    CANopen = 1
    MIT = 2
# Control modes
@unique
class ControlMode(IntEnum):
    """Motor control mode selector.

    ``@unique`` guards against two modes accidentally sharing one integer value.
    """

    MIT = 0
    # presumably position-velocity and velocity control — TODO confirm against the motor manual
    POS_VEL = 1
    VEL = 2
# Per-model limit table, keyed by MotorType. Each value is the tuple
# (PMAX, VMAX, TMAX):
#   PMAX: maximum position (rad)
#   VMAX: maximum velocity (rad/s)
#   TMAX: maximum torque (N·m)
# Several entries are written as int literals although the annotation says
# float; they are left untouched so the stored values keep their exact type.
# NOTE(review): O6 lists PMAX = 112.5 while every other model uses 12.57 —
# confirm against the motor datasheet that this is intentional and not a typo.
MOTOR_LIMIT_PARAMS: dict[MotorType, tuple[float, float, float]] = {
    MotorType.O0: (12.57, 33, 14),
    MotorType.O1: (12.57, 44, 17),
    MotorType.O2: (12.57, 33, 20),
    MotorType.O3: (12.57, 33, 60),
    MotorType.O4: (12.57, 33, 120),
    MotorType.O5: (12.57, 50, 5.5),
    MotorType.ELO5: (12.57, 50, 6),
    MotorType.O6: (112.5, 50, 36),
}
# Model name string for each motor type. Every name is identical to the enum
# member's own name, so the table is derived from MotorType in definition order.
MODEL_NAMES = {motor_type: motor_type.name for motor_type in MotorType}
# Encoder counts per revolution, keyed by model name. All listed models share
# the same resolution of 65536 (2**16) counts.
MODEL_RESOLUTION = dict.fromkeys(
    ("O0", "O1", "O2", "O3", "O4", "O5", "ELO5", "O6"),
    65536,
)
# CAN baudrates supported by the motors (only 1 Mbps is listed).
# NOTE(review): the original comment said "Robstride motors" while the module
# docstring says "Damiao motors" — confirm which vendor this table describes.
AVAILABLE_BAUDRATES = [
    1000000,  # 1 Mbps (default); the original comment tagged this entry "4", presumably a baudrate index — TODO confirm
]
DEFAULT_BAUDRATE = 1000000

# Default timeout; 0 means the timeout is disabled.
# NOTE(review): the name says milliseconds, but the original note "20000 is 1s"
# implies a 50 µs tick rather than 1 ms — confirm the actual unit.
DEFAULT_TIMEOUT_MS = 0  # disabled by default, otherwise 20000 is 1s

# Names of the data fields that should be normalized.
NORMALIZED_DATA = ["Present_Position", "Goal_Position"]

# MIT control parameter ranges, as (min, max) pairs for the Kp and Kd gains.
MIT_KP_RANGE = (0.0, 500.0)
MIT_KD_RANGE = (0.0, 5.0)

# CAN frame command IDs.
CAN_CMD_ENABLE = 0xFC
CAN_CMD_DISABLE = 0xFD
CAN_CMD_SET_ZERO = 0xFE
CAN_CMD_CLEAR_FAULT = 0xFB
CAN_CMD_QUERY_PARAM = 0x33
CAN_CMD_WRITE_PARAM = 0x55
CAN_CMD_SAVE_PARAM = 0xAA

# CAN ID used for parameter operations (query / write / save).
CAN_PARAM_ID = 0x7FF

# Timeouts and cache lifetime; presumably all in seconds (only the TTL carries
# an explicit `_S` suffix) — TODO confirm against the call sites.
RUNNING_TIMEOUT = 0.001
PARAM_TIMEOUT = 0.01
STATE_CACHE_TTL_S = 0.02
@@ -55,16 +55,10 @@ class DiffusionConfig(PreTrainedConfig):
normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to
a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX)
vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
resize_shape: (H, W) shape to resize images to as a preprocessing step for the vision
backbone. If None, no resizing is done and the original image resolution is used.
crop_ratio: Ratio in (0, 1] used to derive the crop size from resize_shape
(crop_h = int(resize_shape[0] * crop_ratio), likewise for width).
Set to 1.0 to disable cropping. Only takes effect when resize_shape is not None.
crop_shape: (H, W) shape to crop images to. When resize_shape is set and crop_ratio < 1.0,
this is computed automatically. Can also be set directly for legacy configs that use
crop-only (without resize). If None and no derivation applies, no cropping is done.
crop_is_random: Whether the crop should be random at training time (it's always a center
crop in eval mode).
crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit
within the image size. If None, no cropping is done.
crop_is_random: Whether the crop should be random at training time (it's always a center crop in eval
mode).
pretrained_backbone_weights: Pretrained weights from torchvision to initialize the backbone.
`None` means no pretrained weights.
use_group_norm: Whether to replace batch normalization with group normalization in the backbone.
@@ -120,9 +114,7 @@ class DiffusionConfig(PreTrainedConfig):
# Architecture / modeling.
# Vision backbone.
vision_backbone: str = "resnet18"
resize_shape: tuple[int, int] | None = None
crop_ratio: float = 1.0
crop_shape: tuple[int, int] | None = None
crop_shape: tuple[int, int] | None = (84, 84)
crop_is_random: bool = True
pretrained_backbone_weights: str | None = None
use_group_norm: bool = True
@@ -147,10 +139,6 @@ class DiffusionConfig(PreTrainedConfig):
# Inference
num_inference_steps: int | None = None
# Optimization
compile_model: bool = False
compile_mode: str = "reduce-overhead"
# Loss computation
do_mask_loss_for_padding: bool = False
@@ -183,25 +171,6 @@ class DiffusionConfig(PreTrainedConfig):
f"Got {self.noise_scheduler_type}."
)
if self.resize_shape is not None and (
len(self.resize_shape) != 2 or any(d <= 0 for d in self.resize_shape)
):
raise ValueError(f"`resize_shape` must be a pair of positive integers. Got {self.resize_shape}.")
if not (0 < self.crop_ratio <= 1.0):
raise ValueError(f"`crop_ratio` must be in (0, 1]. Got {self.crop_ratio}.")
if self.resize_shape is not None:
if self.crop_ratio < 1.0:
self.crop_shape = (
int(self.resize_shape[0] * self.crop_ratio),
int(self.resize_shape[1] * self.crop_ratio),
)
else:
# Explicitly disable cropping for resize+ratio path when crop_ratio == 1.0.
self.crop_shape = None
if self.crop_shape is not None and (self.crop_shape[0] <= 0 or self.crop_shape[1] <= 0):
raise ValueError(f"`crop_shape` must have positive dimensions. Got {self.crop_shape}.")
# Check that the horizon size and U-Net downsampling is compatible.
# U-Net downsamples by 2 with each stage.
downsampling_factor = 2 ** len(self.down_dims)
@@ -229,12 +198,13 @@ class DiffusionConfig(PreTrainedConfig):
if len(self.image_features) == 0 and self.env_state_feature is None:
raise ValueError("You must provide at least one image or the environment state among the inputs.")
if self.resize_shape is None and self.crop_shape is not None:
if self.crop_shape is not None:
for key, image_ft in self.image_features.items():
if self.crop_shape[0] > image_ft.shape[1] or self.crop_shape[1] > image_ft.shape[2]:
raise ValueError(
f"`crop_shape` should fit within the image shapes. Got {self.crop_shape} "
f"for `crop_shape` and {image_ft.shape} for `{key}`."
f"`crop_shape` should fit within the images shapes. Got {self.crop_shape} "
f"for `crop_shape` and {image_ft.shape} for "
f"`{key}`."
)
# Check that all input images have the same shape.
@@ -142,9 +142,6 @@ class DiffusionPolicy(PreTrainedPolicy):
"""Run the batch through the model and compute the loss for training or validation."""
if self.config.image_features:
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
for key in self.config.image_features:
if self.config.n_obs_steps == 1 and batch[key].ndim == 4:
batch[key] = batch[key].unsqueeze(1)
batch[OBS_IMAGES] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
loss = self.diffusion.compute_loss(batch)
# no output_dict so returning None
@@ -185,11 +182,6 @@ class DiffusionModel(nn.Module):
self.unet = DiffusionConditionalUnet1d(config, global_cond_dim=global_cond_dim * config.n_obs_steps)
if config.compile_model:
# Compile the U-Net. "reduce-overhead" is preferred for the small-batch repetitive loops
# common in diffusion inference.
self.unet = torch.compile(self.unet, mode=config.compile_mode)
self.noise_scheduler = _make_noise_scheduler(
config.noise_scheduler_type,
num_train_timesteps=config.num_train_timesteps,
@@ -454,18 +446,12 @@ class DiffusionRgbEncoder(nn.Module):
def __init__(self, config: DiffusionConfig):
super().__init__()
# Set up optional preprocessing.
if config.resize_shape is not None:
self.resize = torchvision.transforms.Resize(config.resize_shape)
else:
self.resize = None
crop_shape = config.crop_shape
if crop_shape is not None:
if config.crop_shape is not None:
self.do_crop = True
# Always use center crop for eval
self.center_crop = torchvision.transforms.CenterCrop(crop_shape)
self.center_crop = torchvision.transforms.CenterCrop(config.crop_shape)
if config.crop_is_random:
self.maybe_random_crop = torchvision.transforms.RandomCrop(crop_shape)
self.maybe_random_crop = torchvision.transforms.RandomCrop(config.crop_shape)
else:
self.maybe_random_crop = self.center_crop
else:
@@ -491,16 +477,13 @@ class DiffusionRgbEncoder(nn.Module):
# Set up pooling and final layers.
# Use a dry run to get the feature map shape.
# The dummy shape mirrors the runtime preprocessing order: resize -> crop.
# The dummy input should take the number of image channels from `config.image_features` and it should
# use the height and width from `config.crop_shape` if it is provided, otherwise it should use the
# height and width from `config.image_features`.
# Note: we have a check in the config class to make sure all images have the same shape.
images_shape = next(iter(config.image_features.values())).shape
if config.crop_shape is not None:
dummy_shape_h_w = config.crop_shape
elif config.resize_shape is not None:
dummy_shape_h_w = config.resize_shape
else:
dummy_shape_h_w = images_shape[1:]
dummy_shape_h_w = config.crop_shape if config.crop_shape is not None else images_shape[1:]
dummy_shape = (1, images_shape[0], *dummy_shape_h_w)
feature_map_shape = get_output_shape(self.backbone, dummy_shape)[1:]
@@ -516,10 +499,7 @@ class DiffusionRgbEncoder(nn.Module):
Returns:
(B, D) image feature.
"""
# Preprocess: resize if configured, then crop if configured.
if self.resize is not None:
x = self.resize(x)
# Preprocess: maybe crop (if it was set up in the __init__).
if self.do_crop:
if self.training: # noqa: SIM108
x = self.maybe_random_crop(x)
+10 -1
View File
@@ -18,9 +18,10 @@ from __future__ import annotations
import importlib
import logging
from typing import Any, TypedDict, Unpack
from typing import Any, TypedDict
import torch
from typing_extensions import Unpack
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import FeatureType
@@ -33,6 +34,7 @@ from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot.policies.groot.configuration_groot import GrootConfig
from lerobot.policies.pi0.configuration_pi0 import PI0Config
from lerobot.policies.pi05.configuration_pi05 import PI05Config
from lerobot.policies.pi05_full.configuration_pi05 import PI05FullConfig
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.sac.configuration_sac import SACConfig
from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig
@@ -389,6 +391,13 @@ def make_pre_post_processors(
config=policy_cfg,
dataset_stats=kwargs.get("dataset_stats"),
)
elif isinstance(policy_cfg, PI05FullConfig):
from lerobot.policies.pi05_full.processor_pi05 import make_pi05_full_pre_post_processors
processors = make_pi05_full_pre_post_processors(
config=policy_cfg,
dataset_stats=kwargs.get("dataset_stats"),
)
else:
try:
@@ -4,16 +4,17 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import annotations
# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
from typing import Optional
from transformers.image_processing_utils import (
BatchFeature,
get_patch_output_size,
)
from transformers.image_processing_utils_fast import (
BaseImageProcessorFast,
ImagesKwargs,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
@@ -76,7 +77,7 @@ def crop(img: torch.Tensor, left: int, top: int, right: int, bottom: int) -> tor
return img[:, top:bottom, left:right]
class Eagle25VLFastImageProcessorKwargs(ImagesKwargs):
class Eagle25VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
max_dynamic_tiles: int | None
min_dynamic_tiles: int | None
use_thumbnail: bool | None
@@ -164,11 +165,11 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
def _resize_for_patching(
self,
image: torch.Tensor,
image: "torch.Tensor",
target_resolution: tuple,
interpolation: F.InterpolationMode,
interpolation: "F.InterpolationMode",
input_data_format: ChannelDimension,
) -> torch.Tensor:
) -> "torch.Tensor":
"""
Resizes an image to a target resolution while maintaining aspect ratio.
@@ -218,8 +219,8 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
return best_ratio
def _pad_for_patching(
self, image: torch.Tensor, target_resolution: tuple, input_data_format: ChannelDimension
) -> torch.Tensor:
self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
) -> "torch.Tensor":
"""
Pad an image to a target resolution while maintaining aspect ratio.
"""
@@ -235,15 +236,15 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
def _get_image_patches(
self,
image: torch.Tensor,
image: "torch.Tensor",
min_num: int,
max_num: int,
size: tuple,
tile_size: int,
use_thumbnail: bool,
interpolation: F.InterpolationMode,
interpolation: "F.InterpolationMode",
pad_during_tiling: bool,
) -> list[torch.Tensor]:
) -> list["torch.Tensor"]:
image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)
orig_height, orig_width = image_size
aspect_ratio = orig_width / orig_height
@@ -304,8 +305,8 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
def _pad_for_batching(
self,
pixel_values: list[torch.Tensor],
) -> list[torch.Tensor]:
pixel_values: list["torch.Tensor"],
) -> list["torch.Tensor"]:
"""
Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.
@@ -326,14 +327,14 @@ class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
def _preprocess(
self,
images: list[torch.Tensor],
images: list["torch.Tensor"],
do_resize: bool,
size: SizeDict,
max_dynamic_tiles: int,
min_dynamic_tiles: int,
use_thumbnail: bool,
pad_during_tiling: bool,
interpolation: F.InterpolationMode | None,
interpolation: Optional["F.InterpolationMode"],
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
+54 -68
View File
@@ -15,16 +15,16 @@
# limitations under the License.
import builtins
import copy
import logging
import math
from collections import deque
from pathlib import Path
from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
from typing import TYPE_CHECKING, Literal, TypedDict
import torch
import torch.nn.functional as F # noqa: N812
from torch import Tensor, nn
from typing_extensions import Unpack
from lerobot.utils.import_utils import _transformers_available
@@ -32,21 +32,13 @@ from lerobot.utils.import_utils import _transformers_available
if TYPE_CHECKING or _transformers_available:
from transformers.models.auto import CONFIG_MAPPING
from transformers.models.gemma import modeling_gemma
from lerobot.policies.pi_gemma import (
PaliGemmaForConditionalGenerationWithPiGemma,
PiGemmaForCausalLM,
_gated_residual,
layernorm_forward,
)
from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
else:
CONFIG_MAPPING = None
modeling_gemma = None
PiGemmaForCausalLM = None
_gated_residual = None
layernorm_forward = None
PaliGemmaForConditionalGenerationWithPiGemma = None
GemmaForCausalLM = None
PaliGemmaForConditionalGeneration = None
from lerobot.configs.policies import PreTrainedConfig
from lerobot.policies.pi0.configuration_pi0 import DEFAULT_IMAGE_SIZE, PI0Config
@@ -199,7 +191,7 @@ def resize_with_pad_torch( # see openpi `resize_with_pad_torch` (exact copy)
if images.dtype == torch.uint8:
resized_images = torch.round(resized_images).clamp(0, 255).to(torch.uint8)
elif images.dtype == torch.float32:
resized_images = resized_images.clamp(0.0, 1.0)
resized_images = resized_images.clamp(-1.0, 1.0)
else:
raise ValueError(f"Unsupported image dtype: {images.dtype}")
@@ -210,7 +202,7 @@ def resize_with_pad_torch( # see openpi `resize_with_pad_torch` (exact copy)
pad_w1 = pad_w0 + remainder_w
# Pad
constant_value = 0 if images.dtype == torch.uint8 else 0.0
constant_value = 0 if images.dtype == torch.uint8 else -1.0
padded_images = F.pad(
resized_images,
(pad_w0, pad_w1, pad_h0, pad_h1), # left, right, top, bottom
@@ -229,14 +221,14 @@ def resize_with_pad_torch( # see openpi `resize_with_pad_torch` (exact copy)
def compute_layer_complete(
layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond, paligemma, gemma_expert
):
models = [paligemma.model.language_model, gemma_expert.model]
models = [paligemma.language_model, gemma_expert.model]
query_states = []
key_states = []
value_states = []
gates = []
for i, hidden_states in enumerate(inputs_embeds):
layer = models[i].layers[layer_idx]
hidden_states, gate = layernorm_forward(layer.input_layernorm, hidden_states, adarms_cond[i])
hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i]) # noqa: PLW2901
gates.append(gate)
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
@@ -262,10 +254,10 @@ def compute_layer_complete(
query_states, key_states, cos, sin, unsqueeze_dim=1
)
batch_size = query_states.shape[0]
scaling = paligemma.model.language_model.layers[layer_idx].self_attn.scaling
scaling = paligemma.language_model.layers[layer_idx].self_attn.scaling
# Attention computation
att_output, _ = modeling_gemma.eager_attention_forward(
paligemma.model.language_model.layers[layer_idx].self_attn,
paligemma.language_model.layers[layer_idx].self_attn,
query_states,
key_states,
value_states,
@@ -273,7 +265,7 @@ def compute_layer_complete(
scaling,
)
# Get head_dim from the current layer, not from the model
head_dim = paligemma.model.language_model.layers[layer_idx].self_attn.head_dim
head_dim = paligemma.language_model.layers[layer_idx].self_attn.head_dim
att_output = att_output.reshape(batch_size, -1, 1 * 8 * head_dim)
# Process layer outputs
outputs_embeds = []
@@ -285,15 +277,15 @@ def compute_layer_complete(
att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
out_emb = layer.self_attn.o_proj(att_output[:, start_pos:end_pos])
# first residual
out_emb = _gated_residual(hidden_states, out_emb, gates[i])
out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i]) # noqa: SLF001
after_first_residual = out_emb.clone()
out_emb, gate = layernorm_forward(layer.post_attention_layernorm, out_emb, adarms_cond[i])
out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
# Convert to bfloat16 if the next layer (mlp) uses bfloat16
if layer.mlp.up_proj.weight.dtype == torch.bfloat16:
out_emb = out_emb.to(dtype=torch.bfloat16)
out_emb = layer.mlp(out_emb)
# second residual
out_emb = _gated_residual(after_first_residual, out_emb, gate)
out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate) # noqa: SLF001
outputs_embeds.append(out_emb)
start_pos = end_pos
return outputs_embeds
@@ -366,7 +358,7 @@ class PaliGemmaWithExpertModel(
vlm_config_hf.text_config.num_hidden_layers = vlm_config.depth
vlm_config_hf.text_config.num_key_value_heads = vlm_config.num_kv_heads
vlm_config_hf.text_config.hidden_activation = "gelu_pytorch_tanh"
vlm_config_hf.text_config.dtype = "float32"
vlm_config_hf.text_config.torch_dtype = "float32"
vlm_config_hf.text_config.vocab_size = 257152
vlm_config_hf.text_config.use_adarms = use_adarms[0]
vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None
@@ -374,7 +366,7 @@ class PaliGemmaWithExpertModel(
vlm_config_hf.vision_config.intermediate_size = 4304
vlm_config_hf.vision_config.projection_dim = 2048
vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast"
vlm_config_hf.vision_config.dtype = "float32"
vlm_config_hf.vision_config.torch_dtype = "float32"
action_expert_config_hf = CONFIG_MAPPING["gemma"](
head_dim=action_expert_config.head_dim,
@@ -385,13 +377,13 @@ class PaliGemmaWithExpertModel(
num_key_value_heads=action_expert_config.num_kv_heads,
vocab_size=257152,
hidden_activation="gelu_pytorch_tanh",
dtype="float32",
torch_dtype="float32",
use_adarms=use_adarms[1],
adarms_cond_dim=action_expert_config.width if use_adarms[1] else None,
)
self.paligemma = PaliGemmaForConditionalGenerationWithPiGemma(config=vlm_config_hf)
self.gemma_expert = PiGemmaForCausalLM(config=action_expert_config_hf)
self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf)
self.gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
self.gemma_expert.model.embed_tokens = None
self.to_bfloat16_for_selected_params(precision)
@@ -406,11 +398,10 @@ class PaliGemmaWithExpertModel(
else:
raise ValueError(f"Invalid precision: {precision}")
# Keep full vision path in float32 so we never toggle (toggle causes optimizer
# "same dtype" error). Align with PI05.
params_to_keep_float32 = [
"vision_tower",
"multi_modal_projector",
"vision_tower.vision_model.embeddings.patch_embedding.weight",
"vision_tower.vision_model.embeddings.patch_embedding.bias",
"vision_tower.vision_model.embeddings.position_embedding.weight",
"input_layernorm",
"post_attention_layernorm",
"model.norm",
@@ -422,8 +413,8 @@ class PaliGemmaWithExpertModel(
def _set_requires_grad(self):
if self.freeze_vision_encoder:
self.paligemma.model.vision_tower.eval()
for param in self.paligemma.model.vision_tower.parameters():
self.paligemma.vision_tower.eval()
for param in self.paligemma.vision_tower.parameters():
param.requires_grad = False
if self.train_expert_only:
self.paligemma.eval()
@@ -433,23 +424,15 @@ class PaliGemmaWithExpertModel(
def train(self, mode: bool = True):
super().train(mode)
if self.freeze_vision_encoder:
self.paligemma.model.vision_tower.eval()
self.paligemma.vision_tower.eval()
if self.train_expert_only:
self.paligemma.eval()
def embed_image(self, image: torch.Tensor):
# Vision tower and multi_modal_projector are kept in float32 (params_to_keep_float32). Align with PI05.
out_dtype = image.dtype
if image.dtype != torch.float32:
image = image.to(torch.float32)
image_outputs = self.paligemma.model.get_image_features(image)
features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
if features.dtype != out_dtype:
features = features.to(out_dtype)
return features
return self.paligemma.model.get_image_features(image)
def embed_language_tokens(self, tokens: torch.Tensor):
return self.paligemma.model.language_model.embed_tokens(tokens)
return self.paligemma.language_model.embed_tokens(tokens)
def forward(
self,
@@ -463,7 +446,7 @@ class PaliGemmaWithExpertModel(
if adarms_cond is None:
adarms_cond = [None, None]
if inputs_embeds[1] is None:
prefix_output = self.paligemma.model.language_model.forward(
prefix_output = self.paligemma.language_model.forward(
inputs_embeds=inputs_embeds[0],
attention_mask=attention_mask,
position_ids=position_ids,
@@ -487,7 +470,7 @@ class PaliGemmaWithExpertModel(
prefix_output = None
prefix_past_key_values = None
else:
models = [self.paligemma.model.language_model, self.gemma_expert.model]
models = [self.paligemma.language_model, self.gemma_expert.model]
num_layers = self.paligemma.config.text_config.num_hidden_layers
# Check if gradient checkpointing is enabled for any of the models
@@ -527,7 +510,7 @@ class PaliGemmaWithExpertModel(
def compute_final_norms(inputs_embeds, adarms_cond):
outputs_embeds = []
for i, hidden_states in enumerate(inputs_embeds):
out_emb, _ = layernorm_forward(models[i].norm, hidden_states, adarms_cond[i])
out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
outputs_embeds.append(out_emb)
return outputs_embeds
@@ -593,19 +576,29 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch`
# Also compile the main forward pass used during training
self.forward = torch.compile(self.forward, mode=config.compile_mode)
msg = """An incorrect transformer version is used, please create an issue on https://github.com/huggingface/lerobot/issues"""
try:
from transformers.models.siglip import check
if not check.check_whether_transformers_replace_is_installed_correctly():
raise ValueError(msg)
except ImportError:
raise ValueError(msg) from None
def gradient_checkpointing_enable(self):
"""Enable gradient checkpointing for memory optimization."""
self.gradient_checkpointing_enabled = True
self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing = True
self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing = True
self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = True
logging.info("Enabled gradient checkpointing for PI0Pytorch model")
def gradient_checkpointing_disable(self):
"""Disable gradient checkpointing."""
self.gradient_checkpointing_enabled = False
self.paligemma_with_expert.paligemma.model.language_model.gradient_checkpointing = False
self.paligemma_with_expert.paligemma.model.vision_tower.gradient_checkpointing = False
self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = False
logging.info("Disabled gradient checkpointing for PI0Pytorch model")
@@ -767,7 +760,7 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch`
suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(state, x_t, time)
if (
self.paligemma_with_expert.paligemma.model.language_model.layers[0].self_attn.q_proj.weight.dtype
self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
== torch.bfloat16
):
suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
@@ -841,7 +834,7 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch`
prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
self.paligemma_with_expert.paligemma.model.language_model.config._attn_implementation = "eager" # noqa: SLF001
self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager" # noqa: SLF001
_, past_key_values = self.paligemma_with_expert.forward(
attention_mask=prefix_att_2d_masks_4d,
@@ -915,7 +908,6 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch`
full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager" # noqa: SLF001
past_key_values = copy.deepcopy(past_key_values)
outputs_embeds, _ = self.paligemma_with_expert.forward(
attention_mask=full_att_2d_masks_4d,
position_ids=position_ids,
@@ -1005,12 +997,14 @@ class PI0Policy(PreTrainedPolicy):
# Check if dataset_stats were provided in kwargs
model = cls(config, **kwargs)
# Load state dict (expects keys with "model." prefix)
# Now manually load and remap the state dict
try:
# Try to load the pytorch_model.bin or model.safetensors file
print(f"Loading model from: {pretrained_name_or_path}")
try:
from transformers.utils import cached_file
# Try safetensors first
resolved_file = cached_file(
pretrained_name_or_path,
"model.safetensors",
@@ -1018,7 +1012,7 @@ class PI0Policy(PreTrainedPolicy):
force_download=kwargs.get("force_download", False),
resume_download=kwargs.get("resume_download"),
proxies=kwargs.get("proxies"),
token=kwargs.get("token"),
use_auth_token=kwargs.get("use_auth_token"),
revision=kwargs.get("revision"),
local_files_only=kwargs.get("local_files_only", False),
)
@@ -1031,7 +1025,7 @@ class PI0Policy(PreTrainedPolicy):
print("Returning model without loading pretrained weights")
return model
# First, fix any key differences (see openpi model.py, _fix_pytorch_state_dict_keys)
# First, fix any key differences # see openpi `model.py, _fix_pytorch_state_dict_keys`
fixed_state_dict = model._fix_pytorch_state_dict_keys(original_state_dict, model.config)
# Then add "model." prefix for all keys that don't already have it
@@ -1076,7 +1070,7 @@ class PI0Policy(PreTrainedPolicy):
print("All keys loaded successfully!")
except Exception as e:
print(f"Warning: Could not load state dict: {e}")
print(f"Warning: Could not remap state dict keys: {e}")
return model
@@ -1126,14 +1120,6 @@ class PI0Policy(PreTrainedPolicy):
# Some checkpoints might have this, but current model expects different structure
logging.warning(f"Vision embedding key might need handling: {key}")
if (
key == "model.paligemma_with_expert.paligemma.lm_head.weight"
or key == "paligemma_with_expert.paligemma.lm_head.weight"
):
fixed_state_dict[
"model.paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight"
] = value.clone()
fixed_state_dict[new_key] = value
return fixed_state_dict

Some files were not shown because too many files have changed in this diff Show More