Mirror of https://github.com/huggingface/lerobot.git (synced 2026-05-11 14:49:43 +00:00)
Compare commits
27 Commits
- 1396b9fab7
- 7c032f19fc
- e2f27bf71b
- ea36a4a176
- 399b3c9ba5
- 913041e753
- 2b541ddd4c
- 50a1e67e94
- d60a700d2b
- 8c3d4cf900
- b6e60a6e30
- 3596681d94
- 4dbbcca496
- 818892a38b
- 66fef25ded
- 2cf08b7a4b
- 15934d8d08
- 9300352876
- 720cf8e3a0
- 5d4fdf5088
- 3b185f7f9d
- 2e069b1c47
- 4e45acca52
- 975d89b38d
- 07502868e5
- aa9cc9bd43
- 123495250b
@@ -12,8 +12,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This workflow handles nightly testing & docker images publishing.
|
||||
name: Nightly
|
||||
# This workflow handles Docker image publishing & testing.
|
||||
name: Docker Publish & Test
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
@@ -39,8 +39,8 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
# This job builds a CPU image for testing & distribution
|
||||
build-docker-cpu-nightly:
|
||||
name: Build CPU Docker for Nightly
|
||||
build-docker-cpu:
|
||||
name: Build CPU Docker
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
if: github.repository == 'huggingface/lerobot'
|
||||
@@ -74,8 +74,8 @@ jobs:
|
||||
tags: ${{ env.DOCKER_IMAGE_NAME_CPU }}
|
||||
|
||||
# This job builds a GPU image for testing & distribution
|
||||
build-docker-gpu-nightly:
|
||||
name: Build GPU Docker for Nightly
|
||||
build-docker-gpu:
|
||||
name: Build GPU Docker
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
if: github.repository == 'huggingface/lerobot'
|
||||
@@ -109,9 +109,9 @@ jobs:
|
||||
tags: ${{ env.DOCKER_IMAGE_NAME_GPU }}
|
||||
|
||||
# This job runs the E2E tests + pytest with all extras in the CPU image
|
||||
nightly-cpu-tests:
|
||||
name: Nightly CPU Tests
|
||||
needs: [build-docker-cpu-nightly]
|
||||
cpu-tests:
|
||||
name: CPU Tests
|
||||
needs: [build-docker-cpu]
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
container:
|
||||
image: ${{ needs.build-docker-cpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
|
||||
image: ${{ needs.build-docker-cpu.outputs.image_tag }} # zizmor: ignore[unpinned-images]
|
||||
options: --shm-size "16gb"
|
||||
credentials:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
@@ -142,9 +142,9 @@ jobs:
|
||||
run: make test-end-to-end
|
||||
|
||||
# This job runs the E2E tests + pytest with all extras in the GPU image
|
||||
nightly-gpu-tests:
|
||||
name: Nightly GPU Tests
|
||||
needs: [build-docker-gpu-nightly]
|
||||
gpu-tests:
|
||||
name: GPU Tests
|
||||
needs: [build-docker-gpu]
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
@@ -154,7 +154,7 @@ jobs:
|
||||
TRITON_CACHE_DIR: /home/user_lerobot/.cache/triton
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
container:
|
||||
image: ${{ needs.build-docker-gpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
|
||||
image: ${{ needs.build-docker-gpu.outputs.image_tag }} # zizmor: ignore[unpinned-images]
|
||||
options: --gpus all --shm-size "16gb"
|
||||
credentials:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
@@ -175,9 +175,9 @@ jobs:
|
||||
run: make test-end-to-end
|
||||
|
||||
# This job runs multi-GPU training tests with 4 GPUs
|
||||
nightly-multi-gpu-tests:
|
||||
name: Nightly Multi-GPU Tests
|
||||
needs: [build-docker-gpu-nightly]
|
||||
multi-gpu-tests:
|
||||
name: Multi-GPU Tests
|
||||
needs: [build-docker-gpu]
|
||||
runs-on:
|
||||
group: aws-g4dn-12xlarge # Instance with 4 GPUs
|
||||
env:
|
||||
@@ -188,7 +188,7 @@ jobs:
|
||||
CUDA_VISIBLE_DEVICES: "0,1,2,3"
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
container:
|
||||
image: ${{ needs.build-docker-gpu-nightly.outputs.image_tag }} # zizmor: ignore[unpinned-images]
|
||||
image: ${{ needs.build-docker-gpu.outputs.image_tag }} # zizmor: ignore[unpinned-images]
|
||||
options: --gpus all --shm-size "16gb"
|
||||
credentials:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
@@ -33,7 +33,7 @@ jobs:
|
||||
github.event.workflow_run.event == 'pull_request' &&
|
||||
github.event.workflow_run.conclusion == 'success' &&
|
||||
github.repository == 'huggingface/lerobot'
|
||||
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
|
||||
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
|
||||
with:
|
||||
package_name: lerobot
|
||||
secrets:
|
||||
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
github.repository == 'huggingface/lerobot'
|
||||
permissions:
|
||||
contents: read
|
||||
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
|
||||
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
|
||||
with:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: lerobot
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
|
||||
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
|
||||
with:
|
||||
commit_sha: ${{ github.event.pull_request.head.sha }}
|
||||
pr_number: ${{ github.event.number }}
|
||||
|
||||
@@ -27,6 +27,7 @@ on:
|
||||
- "tests/**"
|
||||
- ".github/workflows/**"
|
||||
- "pyproject.toml"
|
||||
- "uv.lock"
|
||||
- "Makefile"
|
||||
push:
|
||||
branches:
|
||||
@@ -36,6 +37,7 @@ on:
|
||||
- "tests/**"
|
||||
- ".github/workflows/**"
|
||||
- "pyproject.toml"
|
||||
- "uv.lock"
|
||||
- "Makefile"
|
||||
|
||||
permissions:
|
||||
@@ -63,7 +65,7 @@ jobs:
|
||||
HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
@@ -81,14 +83,14 @@ jobs:
|
||||
libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev
|
||||
|
||||
- name: Setup uv and Python
|
||||
uses: astral-sh/setup-uv@v6 # zizmor: ignore[unpinned-uses]
|
||||
uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6
|
||||
with:
|
||||
enable-cache: true
|
||||
version: ${{ env.UV_VERSION }}
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install lerobot with test extras
|
||||
run: uv sync --extra "test"
|
||||
run: uv sync --locked --extra "test"
|
||||
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
|
||||
@@ -29,6 +29,7 @@ on:
|
||||
- "tests/**"
|
||||
- ".github/workflows/**"
|
||||
- "pyproject.toml"
|
||||
- "uv.lock"
|
||||
- "Makefile"
|
||||
|
||||
permissions:
|
||||
@@ -62,7 +63,7 @@ jobs:
|
||||
HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
lfs: true
|
||||
persist-credentials: false
|
||||
@@ -79,14 +80,14 @@ jobs:
|
||||
speech-dispatcher libgeos-dev portaudio19-dev
|
||||
|
||||
- name: Setup uv and Python
|
||||
uses: astral-sh/setup-uv@v6 # zizmor: ignore[unpinned-uses]
|
||||
uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6
|
||||
with:
|
||||
enable-cache: true
|
||||
version: ${{ env.UV_VERSION }}
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install lerobot with all extras
|
||||
run: uv sync --extra all # TODO(Steven): Make flash-attn optional
|
||||
run: uv sync --locked --extra all # TODO(Steven): Make flash-attn optional
|
||||
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
@@ -136,21 +137,21 @@ jobs:
|
||||
sudo apt-get update
|
||||
sudo apt-get install git-lfs
|
||||
git lfs install
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
lfs: true
|
||||
persist-credentials: false
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
|
||||
with:
|
||||
cache-binary: false
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
with:
|
||||
context: .
|
||||
file: ./docker/Dockerfile.internal
|
||||
|
||||
+139
-37
@@ -12,38 +12,81 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This workflow handles full testing with unboud dependencies versions.
|
||||
name: Unbound Dependency Tests
|
||||
# This workflow tests the project against the latest upstream dependencies
|
||||
# (within pyproject.toml constraints) and opens a PR to update uv.lock
|
||||
# if the tests pass and the lockfile has changed.
|
||||
name: Latest Dependency Tests
|
||||
|
||||
on:
|
||||
# Allows running this workflow manually from the Actions tab
|
||||
workflow_dispatch:
|
||||
|
||||
# Run on the 1st and 15th of every month at 09:00 UTC
|
||||
# schedule:
|
||||
# - cron: '0 2 1,15 * *'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
# Runs at 03:00 UTC
|
||||
schedule:
|
||||
- cron: "0 3 * * *"
|
||||
|
||||
# Sets up the environment variables
|
||||
env:
|
||||
UV_VERSION: "0.8.0"
|
||||
PYTHON_VERSION: "3.12"
|
||||
DOCKER_IMAGE_NAME: huggingface/lerobot-gpu:unbound
|
||||
DOCKER_IMAGE_NAME: huggingface/lerobot-gpu:latest-deps
|
||||
|
||||
# Ensures that only the latest action is built, canceling older runs.
|
||||
# Ensures that only the latest run is active, canceling older runs.
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
|
||||
# This job runs the E2E tests + pytest with all unbound extras
|
||||
full-tests:
|
||||
name: Full Unbound Tests
|
||||
# This job upgrades the lockfile and checks if dependencies have changed
|
||||
upgrade-lock:
|
||||
name: Upgrade Lockfile
|
||||
runs-on: ubuntu-latest
|
||||
if: github.repository == 'huggingface/lerobot'
|
||||
permissions:
|
||||
contents: read
|
||||
outputs:
|
||||
changed: ${{ steps.diff.outputs.changed }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Setup uv and Python
|
||||
uses: astral-sh/setup-uv@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Upgrade uv.lock
|
||||
run: uv lock --upgrade
|
||||
|
||||
- name: Check for changes
|
||||
id: diff
|
||||
run: |
|
||||
if git diff --quiet uv.lock; then
|
||||
echo "changed=false" >> "$GITHUB_OUTPUT"
|
||||
echo "uv.lock is up to date — no dependency changes."
|
||||
else
|
||||
echo "changed=true" >> "$GITHUB_OUTPUT"
|
||||
echo "uv.lock has changed — running tests."
|
||||
fi
|
||||
|
||||
- name: Upload updated lockfile
|
||||
if: steps.diff.outputs.changed == 'true'
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: uv-lock
|
||||
path: uv.lock
|
||||
|
||||
# This job runs the full test suite with the upgraded dependencies
|
||||
cpu-tests:
|
||||
name: CPU Tests (Latest Deps)
|
||||
needs: [upgrade-lock]
|
||||
if: needs.upgrade-lock.outputs.changed == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
env:
|
||||
MUJOCO_GL: egl
|
||||
HF_HOME: /mnt/cache/.cache/huggingface
|
||||
@@ -55,6 +98,11 @@ jobs:
|
||||
lfs: true
|
||||
persist-credentials: false
|
||||
|
||||
- name: Download updated lockfile
|
||||
uses: actions/download-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: uv-lock
|
||||
|
||||
# NOTE(Steven): Mount to `/mnt` to avoid the limited storage on `/home`. Consider cleaning default SDKs or using self-hosted runners for more space.
|
||||
# (As of 2024-06-10, the runner's `/home` has only 6.2 GB free—8% of its 72 GB total.)
|
||||
- name: Setup /mnt storage
|
||||
@@ -73,34 +121,32 @@ jobs:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Unbound dependencies
|
||||
run: |
|
||||
sed -i 's/,[[:space:]]*<[0-9\.]*//g' pyproject.toml
|
||||
echo "Dependencies unbound:" && cat pyproject.toml
|
||||
|
||||
- name: Install lerobot with all extras
|
||||
run: uv sync --extra all # TODO(Steven): Make flash-attn optional
|
||||
run: uv sync --locked --extra all # TODO(Steven): Make flash-attn optional
|
||||
|
||||
- name: Login to Hugging Face
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
uv run hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
uv run hf auth whoami
|
||||
|
||||
- name: Run pytest (all extras)
|
||||
run: uv run pytest tests -vv
|
||||
run: uv run pytest tests -vv --maxfail=10
|
||||
|
||||
- name: Run end-to-end tests
|
||||
run: uv run make test-end-to-end
|
||||
|
||||
# This job builds a GPU enabled image for testing
|
||||
# This job builds a GPU-enabled Docker image with the upgraded dependencies
|
||||
build-and-push-docker:
|
||||
name: Build and Push Docker
|
||||
needs: [upgrade-lock]
|
||||
if: needs.upgrade-lock.outputs.changed == 'true'
|
||||
permissions:
|
||||
contents: read
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
if: github.repository == 'huggingface/lerobot'
|
||||
outputs:
|
||||
image_tag: ${{ env.DOCKER_IMAGE_NAME }}
|
||||
env:
|
||||
GITHUB_REF: ${{ github.ref }}
|
||||
steps:
|
||||
- name: Install Git LFS
|
||||
run: |
|
||||
@@ -111,6 +157,12 @@ jobs:
|
||||
with:
|
||||
lfs: true
|
||||
persist-credentials: false
|
||||
|
||||
- name: Download updated lockfile
|
||||
uses: actions/download-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: uv-lock
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
@@ -127,14 +179,13 @@ jobs:
|
||||
file: ./docker/Dockerfile.internal
|
||||
push: true
|
||||
tags: ${{ env.DOCKER_IMAGE_NAME }}
|
||||
build-args: |
|
||||
UNBOUND_DEPS=true
|
||||
|
||||
# This job runs pytest with all unbound extras in a GPU enabled host
|
||||
# It runs everytime a test image is created
|
||||
# This job runs pytest with all extras on a GPU-enabled host
|
||||
gpu-tests:
|
||||
name: GPU Unbound Tests
|
||||
name: GPU Tests (Latest Deps)
|
||||
needs: [build-and-push-docker]
|
||||
permissions:
|
||||
contents: read
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
@@ -159,17 +210,69 @@ jobs:
|
||||
run: |
|
||||
hf auth login --token "$HF_USER_TOKEN" --add-to-git-credential
|
||||
hf auth whoami
|
||||
- name: Fix ptxas permissions
|
||||
run: chmod +x /lerobot/.venv/lib/python3.12/site-packages/triton/backends/nvidia/bin/ptxas
|
||||
- name: Run pytest on GPU
|
||||
run: pytest tests -vv
|
||||
run: pytest tests -vv --maxfail=10
|
||||
- name: Run end-to-end tests
|
||||
run: make test-end-to-end
|
||||
|
||||
# This job deletes the test image recently created
|
||||
# It runs everytime after the gpu-tests have finished
|
||||
delete-unbound-image:
|
||||
name: Delete Unbound Image
|
||||
# This job creates or updates a PR with the upgraded lockfile
|
||||
open-pr:
|
||||
name: Open PR
|
||||
needs: [cpu-tests, gpu-tests, upgrade-lock]
|
||||
if: success() && needs.upgrade-lock.outputs.changed == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.UPDATE_LOCK_TOKEN }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Download updated lockfile
|
||||
uses: actions/download-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: uv-lock
|
||||
|
||||
- name: Create or update PR
|
||||
run: |
|
||||
set -euo pipefail
|
||||
BRANCH="auto/update-uv-lock"
|
||||
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
git remote set-url origin "https://x-access-token:${GH_TOKEN}@github.com/${{ github.repository }}.git"
|
||||
|
||||
git checkout -B "$BRANCH"
|
||||
git add uv.lock
|
||||
git commit -m "chore(dependencies): update uv.lock"
|
||||
git push --force origin "$BRANCH"
|
||||
|
||||
# Create PR only if one doesn't already exist for this branch
|
||||
EXISTING_PR=$(gh pr list --head "$BRANCH" --state open --json number --jq '.[0].number')
|
||||
if [ -z "$EXISTING_PR" ]; then
|
||||
gh pr create \
|
||||
--title "chore(dependencies): update uv.lock" \
|
||||
--body "Automated update of \`uv.lock\` after successful latest dependency tests (CPU + GPU).
|
||||
|
||||
This PR upgrades all dependencies to their latest versions within the ranges specified in \`pyproject.toml\`." \
|
||||
--head "$BRANCH" \
|
||||
--base main
|
||||
else
|
||||
echo "PR #$EXISTING_PR already exists, branch has been updated."
|
||||
fi
|
||||
|
||||
# This job deletes the temporary Docker image after tests complete
|
||||
cleanup-docker:
|
||||
name: Cleanup Docker Image
|
||||
needs: [gpu-tests, build-and-push-docker]
|
||||
if: always() && needs.build-and-push-docker.result == 'success'
|
||||
permissions:
|
||||
contents: read
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Get Docker Hub Token and Delete Image
|
||||
@@ -180,8 +283,7 @@ jobs:
|
||||
IMAGE_FULL: ${{ needs.build-and-push-docker.outputs.image_tag }}
|
||||
run: |
|
||||
IMAGE_NAME=$(echo "$IMAGE_FULL" | cut -d':' -f1)
|
||||
IMAGE_TAG=$(echo "$IMAGE_FULL" | cut -d':' -f2)
|
||||
|
||||
IMAGE_TAG=$(echo "$IMAGE_FULL" | cut -d':' -f2-)
|
||||
echo "Attempting to delete image: $IMAGE_NAME:$IMAGE_TAG"
|
||||
|
||||
TOKEN=$(curl -s -H "Content-Type: application/json" \
|
||||
@@ -43,16 +43,16 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Run pre-commit hooks
|
||||
uses: pre-commit/action@v3.0.1 # zizmor: ignore[unpinned-uses]
|
||||
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
|
||||
with:
|
||||
extra_args: --all-files --show-diff-on-failure --color=always
|
||||
|
||||
@@ -38,12 +38,12 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
@@ -104,7 +104,7 @@ jobs:
|
||||
- name: Publish to TestPyPI for pre-releases
|
||||
# True for tags like 'v0.2.0-rc1'
|
||||
if: startsWith(github.ref, 'refs/tags/v') && contains(github.ref, '-')
|
||||
uses: pypa/gh-action-pypi-publish@v1.13.0 # zizmor: ignore[unpinned-uses, use-trusted-publishing]
|
||||
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
|
||||
with:
|
||||
repository-url: https://test.pypi.org/legacy/
|
||||
verbose: true
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
|
||||
- name: Publish to PyPI
|
||||
if: startsWith(github.ref, 'refs/tags/v') && !contains(github.ref, '-')
|
||||
uses: pypa/gh-action-pypi-publish@v1.13.0 # zizmor: ignore[unpinned-uses, use-trusted-publishing]
|
||||
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
|
||||
with:
|
||||
verbose: true
|
||||
print-hash: true
|
||||
@@ -127,7 +127,7 @@ jobs:
|
||||
env:
|
||||
MUJOCO_GL: egl
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
lfs: true
|
||||
persist-credentials: false
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
git curl libglib2.0-0 libegl1-mesa-dev ffmpeg libusb-1.0-0-dev \
|
||||
speech-dispatcher libgeos-dev portaudio19-dev
|
||||
- name: Setup uv and Python
|
||||
uses: astral-sh/setup-uv@v6 # zizmor: ignore[unpinned-uses]
|
||||
uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6
|
||||
with:
|
||||
enable-cache: true # zizmor: ignore[cache-poisoning]
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
@@ -43,12 +43,12 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6 # zizmor: ignore[unpinned-uses]
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: false
|
||||
|
||||
- name: Secret Scanning
|
||||
uses: trufflesecurity/trufflehog@v3.90.0 # zizmor: ignore[unpinned-uses]
|
||||
uses: trufflesecurity/trufflehog@eafb8c5f6a06175141c27f17bcc17941853d0047 # v3.90.0
|
||||
with:
|
||||
extra_args: --only-verified
|
||||
|
||||
@@ -25,7 +25,6 @@ node_modules/
|
||||
|
||||
# Lock files
|
||||
poetry.lock
|
||||
uv.lock
|
||||
Pipfile.lock
|
||||
|
||||
### Build & Distribution ###
|
||||
|
||||
@@ -4,7 +4,8 @@
|
||||
|
||||
<div align="center">
|
||||
|
||||
[](https://github.com/huggingface/lerobot/actions/workflows/nightly.yml?query=branch%3Amain)
|
||||
[](https://github.com/huggingface/lerobot/actions/workflows/latest_deps_tests.yml?query=branch%3Amain)
|
||||
[](https://github.com/huggingface/lerobot/actions/workflows/docker_publish.yml?query=branch%3Amain)
|
||||
[](https://www.python.org/downloads/)
|
||||
[](https://github.com/huggingface/lerobot/blob/main/LICENSE)
|
||||
[](https://pypi.org/project/lerobot/)
|
||||
@@ -100,11 +101,11 @@ lerobot-train \
|
||||
--dataset.repo_id=lerobot/aloha_mobile_cabinet
|
||||
```
|
||||
|
||||
| Category | Models |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| **Imitation Learning** | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md) |
|
||||
| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon) |
|
||||
| **VLAs Models** | [Pi0Fast](./docs/source/pi0fast.mdx), [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx) |
|
||||
| Category | Models |
|
||||
| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| **Imitation Learning** | [ACT](./docs/source/policy_act_README.md), [Diffusion](./docs/source/policy_diffusion_README.md), [VQ-BeT](./docs/source/policy_vqbet_README.md), [Multitask DiT Policy](./docs/source/policy_multi_task_dit_README.md) |
|
||||
| **Reinforcement Learning** | [HIL-SERL](./docs/source/hilserl.mdx), [TDMPC](./docs/source/policy_tdmpc_README.md) & QC-FQL (coming soon) |
|
||||
| **VLAs Models** | [Pi0Fast](./docs/source/pi0fast.mdx), [Pi0.5](./docs/source/pi05.mdx), [GR00T N1.5](./docs/source/policy_groot_README.md), [SmolVLA](./docs/source/policy_smolvla_README.md), [XVLA](./docs/source/xvla.mdx) |
|
||||
|
||||
Similarly to the hardware, you can easily implement your own policy & leverage LeRobot's data collection, training, and visualization tools, and share your model to the HF Hub
|
||||
|
||||
|
||||
@@ -73,17 +73,10 @@ ENV HOME=/home/user_lerobot \
|
||||
RUN uv venv --python python${PYTHON_VERSION}
|
||||
|
||||
# Install Python dependencies for caching
|
||||
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml README.md MANIFEST.in ./
|
||||
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
|
||||
COPY --chown=user_lerobot:user_lerobot src/ src/
|
||||
|
||||
ARG UNBOUND_DEPS=false
|
||||
|
||||
RUN if [ "$UNBOUND_DEPS" = "true" ]; then \
|
||||
sed -i 's/,[[:space:]]*<[0-9\.]*//g' pyproject.toml; \
|
||||
echo "Dependencies unbound:" && cat pyproject.toml; \
|
||||
fi
|
||||
|
||||
RUN uv pip install --no-cache ".[all]"
|
||||
RUN uv sync --locked --extra all --no-cache
|
||||
|
||||
RUN chmod +x /lerobot/.venv/lib/python${PYTHON_VERSION}/site-packages/triton/backends/nvidia/bin/ptxas
|
||||
|
||||
|
||||
@@ -61,17 +61,10 @@ ENV HOME=/home/user_lerobot \
|
||||
RUN uv venv
|
||||
|
||||
# Install Python dependencies for caching
|
||||
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml README.md MANIFEST.in ./
|
||||
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
|
||||
COPY --chown=user_lerobot:user_lerobot src/ src/
|
||||
|
||||
ARG UNBOUND_DEPS=false
|
||||
|
||||
RUN if [ "$UNBOUND_DEPS" = "true" ]; then \
|
||||
sed -i 's/,[[:space:]]*<[0-9\.]*//g' pyproject.toml; \
|
||||
echo "Dependencies unbound:" && cat pyproject.toml; \
|
||||
fi
|
||||
|
||||
RUN uv pip install --no-cache ".[all]"
|
||||
RUN uv sync --locked --extra all --no-cache
|
||||
|
||||
# Copy the rest of the application code
|
||||
# Make sure to have the git-LFS files for testing
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
# Docker
|
||||
|
||||
This directory contains Dockerfiles for running LeRobot in containerized environments. Both images are **built nightly from `main`** and published to Docker Hub with the full environment pre-baked — no dependency setup required.
|
||||
|
||||
## Pre-built Images
|
||||
|
||||
```bash
|
||||
# CPU-only image (based on Dockerfile.user)
|
||||
docker pull huggingface/lerobot-cpu:latest
|
||||
|
||||
# GPU image with CUDA support (based on Dockerfile.internal)
|
||||
docker pull huggingface/lerobot-gpu:latest
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
The fastest way to start training is to pull the GPU image and run `lerobot-train` directly. This is the same environment used for all of our CI, so it is a well-tested, batteries-included setup.
|
||||
|
||||
```bash
|
||||
docker run -it --rm --gpus all --shm-size 16gb huggingface/lerobot-gpu:latest
|
||||
|
||||
# inside the container:
|
||||
lerobot-train --policy.type=act --dataset.repo_id=lerobot/aloha_sim_transfer_cube_human
|
||||
```
|
||||
|
||||
## Dockerfiles
|
||||
|
||||
### `Dockerfile.user` (CPU)
|
||||
|
||||
A lightweight image based on `python:3.12-slim`. Includes all Python dependencies and system libraries but does not include CUDA — there is no GPU support. Useful for exploring the codebase, running scripts, or working with robots, but not practical for training.
|
||||
|
||||
### `Dockerfile.internal` (GPU)
|
||||
|
||||
A CUDA-enabled image based on `nvidia/cuda`. This is the image for training — mostly used for internal interactions with the GPU cluster.
|
||||
|
||||
## Usage
|
||||
|
||||
### Running a pre-built image
|
||||
|
||||
```bash
|
||||
# CPU
|
||||
docker run -it --rm huggingface/lerobot-cpu:latest
|
||||
|
||||
# GPU
|
||||
docker run -it --rm --gpus all --shm-size 16gb huggingface/lerobot-gpu:latest
|
||||
```
|
||||
|
||||
### Building locally
|
||||
|
||||
From the repo root:
|
||||
|
||||
```bash
|
||||
# CPU
|
||||
docker build -f docker/Dockerfile.user -t lerobot-user .
|
||||
docker run -it --rm lerobot-user
|
||||
|
||||
# GPU
|
||||
docker build -f docker/Dockerfile.internal -t lerobot-internal .
|
||||
docker run -it --rm --gpus all --shm-size 16gb lerobot-internal
|
||||
```
|
||||
|
||||
### Multi-GPU training
|
||||
|
||||
To select specific GPUs, set `CUDA_VISIBLE_DEVICES` when launching the container:
|
||||
|
||||
```bash
|
||||
# Use 4 GPUs
|
||||
docker run -it --rm --gpus all --shm-size 16gb \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3 \
|
||||
huggingface/lerobot-gpu:latest
|
||||
```
|
||||
|
||||
### USB device access (e.g. robots, cameras)
|
||||
|
||||
```bash
|
||||
docker run -it --device=/dev/ -v /dev/:/dev/ --rm huggingface/lerobot-cpu:latest
|
||||
```
|
||||
@@ -17,6 +17,8 @@
|
||||
title: Train RL in Simulation
|
||||
- local: multi_gpu_training
|
||||
title: Multi GPU training
|
||||
- local: hil_data_collection
|
||||
title: Human In the Loop Data Collection
|
||||
- local: peft_training
|
||||
title: Training with PEFT (e.g., LoRA)
|
||||
- local: rename_map
|
||||
@@ -49,6 +51,8 @@
|
||||
title: NVIDIA GR00T N1.5
|
||||
- local: xvla
|
||||
title: X-VLA
|
||||
- local: multi_task_dit
|
||||
title: Multitask DiT Policy
|
||||
- local: walloss
|
||||
title: WALL-OSS
|
||||
title: "Policies"
|
||||
@@ -67,13 +71,17 @@
|
||||
title: Environments from the Hub
|
||||
- local: envhub_leisaac
|
||||
title: Control & Train Robots in Sim (LeIsaac)
|
||||
title: "Simulation"
|
||||
- sections:
|
||||
- local: adding_benchmarks
|
||||
title: Adding a New Benchmark
|
||||
- local: libero
|
||||
title: LIBERO
|
||||
- local: metaworld
|
||||
title: Meta-World
|
||||
- local: envhub_isaaclab_arena
|
||||
title: NVIDIA IsaacLab Arena Environments
|
||||
- local: libero
|
||||
title: Using Libero
|
||||
- local: metaworld
|
||||
title: Using MetaWorld
|
||||
title: "Simulation"
|
||||
title: "Benchmarks"
|
||||
- sections:
|
||||
- local: introduction_processors
|
||||
title: Introduction to Robot Processors
|
||||
@@ -85,6 +93,8 @@
|
||||
title: Processors for Robots and Teleoperators
|
||||
- local: env_processor
|
||||
title: Environment Processors
|
||||
- local: action_representations
|
||||
title: Action Representations
|
||||
title: "Robot Processors"
|
||||
- sections:
|
||||
- local: so101
|
||||
|
||||
@@ -0,0 +1,223 @@
|
||||
# Action Representations
|
||||
|
||||
This guide explains the different ways robot actions can be represented in LeRobot, how they relate to each other, and when to use each one.
|
||||
|
||||
## Joint Space vs End-Effector Space
|
||||
|
||||
Before discussing action representations, it helps to understand the two coordinate spaces actions can live in.
|
||||
|
||||
### Joint Space
|
||||
|
||||
Joint-space actions directly specify target positions for each motor. For a 6-DOF arm with a gripper, a joint-space action might look like:
|
||||
|
||||
```
|
||||
action = [shoulder_pan: 45.0, shoulder_lift: -20.0, elbow: -30.0, wrist_pitch: 10.0, wrist_roll: 0.0, wrist_yaw: 5.0, gripper: 0.8]
|
||||
```
|
||||
|
||||
Joint space is the default in LeRobot. It is simple, requires no kinematics model, and maps directly to motor commands. Most beginner setups (SO-100, Koch) use joint-space actions.
|
||||
|
||||
### End-Effector (EE) Space
|
||||
|
||||
End-effector-space actions specify the desired position and orientation of the robot's tool tip (gripper) in Cartesian coordinates:
|
||||
|
||||
```
|
||||
action = [x: 0.25, y: -0.10, z: 0.15, wx: 0.0, wy: 0.0, wz: 0.1, gripper: 0.8]
|
||||
```
|
||||
|
||||
EE space is more intuitive for tasks like pick-and-place because it directly describes where the gripper should go, but it requires a kinematics model (URDF) to convert between EE poses and joint angles.
|
||||
|
||||
### Converting Between Spaces
|
||||
|
||||
LeRobot provides processor steps for converting between joint and EE spaces using forward and inverse kinematics. These are built on top of `RobotKinematics`, which loads a URDF model of your robot.
|
||||
|
||||
```python
|
||||
from lerobot.model.kinematics import RobotKinematics
|
||||
from lerobot.robots.so_follower.robot_kinematic_processor import (
|
||||
ForwardKinematicsJointsToEE,
|
||||
InverseKinematicsEEToJoints,
|
||||
)
|
||||
|
||||
kinematics = RobotKinematics(
|
||||
urdf_path="./SO101/so101_new_calib.urdf",
|
||||
target_frame_name="gripper_frame_link",
|
||||
joint_names=["shoulder", "elbow", "wrist_pitch", "wrist_roll", "wrist_yaw"],
|
||||
)
|
||||
|
||||
# Joints → EE (for observations: "where is my gripper?")
|
||||
fk_step = ForwardKinematicsJointsToEE(kinematics=kinematics, motor_names=[...])
|
||||
|
||||
# EE → Joints (for actions: "move my gripper here")
|
||||
ik_step = InverseKinematicsEEToJoints(kinematics=kinematics, motor_names=[...])
|
||||
```
|
||||
|
||||
See [`examples/so100_to_so100_EE/`](https://github.com/huggingface/lerobot/tree/main/examples/so100_to_so100_EE) for a complete working example of recording, replaying, and evaluating with EE-space actions on an SO-100 arm.
|
||||
|
||||
## Absolute, Relative, and Delta Actions
|
||||
|
||||
Regardless of whether you work in joint space or EE space, the action values can be expressed in three different ways. The terminology follows [UMI (Chi et al., 2024)](https://arxiv.org/abs/2402.10329).
|
||||
|
||||
### Absolute Actions (LeRobot default)
|
||||
|
||||
Each action specifies the target position directly.
|
||||
|
||||
**Example** (joint space, chunk of 4):
|
||||
|
||||
```
|
||||
current_state = [45.0, -30.0, 10.0]
|
||||
|
||||
action_chunk = [
|
||||
[46.0, -29.0, 11.0], # go to 46, -29, 11
|
||||
[47.5, -27.0, 12.0], # go to 47.5, -27, 12
|
||||
[49.0, -25.0, 13.5], # go to 49, -25, 13.5
|
||||
[50.0, -24.0, 15.0], # go to 50, -24, 15
|
||||
]
|
||||
```
|
||||
|
||||
Each value is a target position in the robot's coordinate frame. Simple and direct, but requires a consistent global coordinate frame. This is the default in LeRobot.
|
||||
|
||||
### Relative Actions (used by OpenPI / pi0)
|
||||
|
||||
Each action in the chunk is an offset from the **current state at the moment of prediction**. All actions in the chunk share the same reference point:
|
||||
|
||||
```
|
||||
current_state = [45.0, -30.0, 10.0]
|
||||
|
||||
relative_chunk = [
|
||||
[1.0, 1.0, 1.0], # +1 from current → target 46, -29, 11
|
||||
[2.5, 3.0, 2.0], # +2.5 from current → target 47.5, -27, 12
|
||||
[4.0, 5.0, 3.5], # +4 from current → target 49, -25, 13.5
|
||||
[5.0, 6.0, 5.0], # +5 from current → target 50, -24, 15
|
||||
]
|
||||
```
|
||||
|
||||
The conversion is straightforward: `relative = absolute - current_state`. To recover absolute: `absolute = relative + current_state`.
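
As a minimal sketch (plain NumPy, not a LeRobot API), the conversion for a whole chunk is a single broadcasted subtraction or addition:

```python
import numpy as np

current_state = np.array([45.0, -30.0, 10.0])
absolute_chunk = np.array([
    [46.0, -29.0, 11.0],
    [47.5, -27.0, 12.0],
    [49.0, -25.0, 13.5],
    [50.0, -24.0, 15.0],
])

# Every action in the chunk is referenced to the same current state.
relative_chunk = absolute_chunk - current_state
recovered = relative_chunk + current_state

assert np.allclose(recovered, absolute_chunk)
```
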
|
||||
|
||||
**Why use relative actions?** The model learns to predict offsets centered around zero, which is easier to normalize and leads to more stable training. Because every chunk references the same current state, there is no error accumulation across chunks.
|
||||
|
||||
### Delta Actions (sequential differences)
|
||||
|
||||
Each action is an offset from the **previous action** (or from the current state for the first step):
|
||||
|
||||
```
|
||||
current_state = [45.0, -30.0, 10.0]
|
||||
|
||||
delta_chunk = [
|
||||
[1.0, 1.0, 1.0], # current → 46, -29, 11
|
||||
[1.5, 2.0, 1.0], # previous action → 47.5, -27, 12
|
||||
[1.5, 2.0, 1.5], # previous action → 49, -25, 13.5
|
||||
[1.0, 1.0, 1.5], # previous action → 50, -24, 15
|
||||
]
|
||||
```
|
||||
|
||||
Here each step is relative to the one before it. To recover absolute positions you must sum all previous deltas, which means errors accumulate over time. UMI explicitly argues against this representation for this reason.
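
For contrast, here is a minimal NumPy sketch (again not a LeRobot API) of how absolute targets are recovered from delta actions with a cumulative sum, which is exactly why any per-step error compounds:

```python
import numpy as np

current_state = np.array([45.0, -30.0, 10.0])
delta_chunk = np.array([
    [1.0, 1.0, 1.0],
    [1.5, 2.0, 1.0],
    [1.5, 2.0, 1.5],
    [1.0, 1.0, 1.5],
])

# Each absolute target is the current state plus the running sum of all preceding deltas.
absolute_targets = current_state + np.cumsum(delta_chunk, axis=0)
# absolute_targets[-1] -> [50., -24., 15.]
```
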
|
||||
|
||||
### Visual Comparison
|
||||
|
||||
The figure below (based on a figure from [UMI, Chi et al., 2024](https://arxiv.org/abs/2402.10329)) illustrates the key difference. With **relative trajectory**, every action in the chunk points back to the same origin (current state), so a new inference step cleanly resets the reference. With **delta**, each action depends on the previous one, so errors accumulate. **Absolute** actions require a consistent global coordinate frame.
|
||||
|
||||
<img
|
||||
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/action_representations_umi.png"
|
||||
alt="Relative Trajectory as Action Representation (UMI, Chi et al., 2024)"
|
||||
width="85%"
|
||||
/>
|
||||
|
||||
## Using Relative Actions in LeRobot
|
||||
|
||||
LeRobot provides `RelativeActionsProcessorStep` to convert between absolute and relative actions inside the processor pipeline. This is how pi0, pi0.5, and pi0_fast support relative actions.
|
||||
|
||||
> **Note:** All pi models (pi0, pi0.5, pi0_fast) apply relative conversion _before_ normalization (`relative → normalize`), so the normalizer always sees relative (offset) values. This means **relative action stats are required** for all of them when training with `use_relative_actions=true`. In pi0_fast the `RelativeActionsProcessorStep` only modifies the action — the state observation is unchanged — so `NormalizerProcessorStep` still runs before the state tokenizer and the tokenizer continues to receive normalized state as expected.
|
||||
|
||||
### How it works
|
||||
|
||||
During **training** (preprocessing), actions are converted from absolute to relative before the model sees them:
|
||||
|
||||
```
|
||||
raw absolute action → RelativeActionsProcessorStep → normalize → model
|
||||
```
|
||||
|
||||
During **inference** (postprocessing), model predictions are converted back to absolute before being sent to the robot:
|
||||
|
||||
```
|
||||
model output → unnormalize → AbsoluteActionsProcessorStep → robot
|
||||
```
|
||||
|
||||
The `AbsoluteActionsProcessorStep` reads the cached current state from its paired `RelativeActionsProcessorStep`, so the two must be wired together (handled automatically by the policy factory).
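
Conceptually, the pairing behaves like the toy sketch below. The class names echo the real processor steps, but the code is illustrative only and is not the LeRobot implementation:

```python
import numpy as np


class ToyRelativeStep:
    """Stand-in for RelativeActionsProcessorStep: caches the state it converted against."""

    def __init__(self):
        self.cached_state = None

    def __call__(self, state, absolute_action):
        self.cached_state = np.asarray(state, dtype=np.float64)
        return np.asarray(absolute_action, dtype=np.float64) - self.cached_state


class ToyAbsoluteStep:
    """Stand-in for AbsoluteActionsProcessorStep: reads its paired step's cached state."""

    def __init__(self, paired: ToyRelativeStep):
        self.paired = paired

    def __call__(self, relative_action):
        return np.asarray(relative_action, dtype=np.float64) + self.paired.cached_state


pre = ToyRelativeStep()
post = ToyAbsoluteStep(paired=pre)  # the two steps must be wired together
rel = pre([45.0, -30.0, 10.0], [46.0, -29.0, 11.0])
print(post(rel))  # [ 46. -29.  11.]
```
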
|
||||
|
||||
### Enabling relative actions for the pi family (pi0, pi0.5, pi0_fast)
|
||||
|
||||
**Step 1**: Precompute relative action statistics for your dataset:
|
||||
|
||||
```bash
|
||||
lerobot-edit-dataset \
|
||||
--repo_id your_dataset \
|
||||
--operation.type recompute_stats \
|
||||
--operation.relative_action true \
|
||||
--operation.chunk_size 50 \
|
||||
--operation.relative_exclude_joints "['gripper']"
|
||||
```
|
||||
|
||||
**Step 2**: Train with relative actions enabled:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=your_dataset \
|
||||
--policy.type=pi0 \
|
||||
--policy.use_relative_actions=true \
|
||||
--policy.relative_exclude_joints='["gripper"]'
|
||||
```
|
||||
|
||||
The `relative_exclude_joints` parameter specifies joints that should remain in absolute space. For example, gripper commands are typically binary (open/close) and don't benefit from relative encoding.
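
As an illustrative NumPy sketch of that idea (not the LeRobot implementation), excluding a joint simply means keeping its absolute value after the relative conversion:

```python
import numpy as np

joint_names = ["shoulder_pan", "shoulder_lift", "elbow", "wrist_pitch", "wrist_roll", "wrist_yaw", "gripper"]
exclude = np.array([name == "gripper" for name in joint_names])

current_state   = np.array([45.0, -30.0, 10.0, 5.0, 0.0, 2.0, 0.0])
absolute_action = np.array([46.0, -29.0, 11.0, 6.0, 0.0, 3.0, 0.8])

relative_action = absolute_action - current_state
relative_action[exclude] = absolute_action[exclude]  # gripper command stays absolute (0.8 = mostly open)
```
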
|
||||
|
||||
### Combining relative actions with RTC
|
||||
|
||||
[RTC](https://arxiv.org/abs/2506.07339) runs policy inference at high frequency and sends actions to the robot as they are predicted rather than waiting for a full chunk. Relative actions and RTC are fully compatible: because every chunk in relative mode references the **same** current state (captured at the start of inference), each predicted action in the chunk remains a valid offset even if the robot has already moved. No special handling is needed — `RelativeActionsProcessorStep` caches the state once per inference call and `AbsoluteActionsProcessorStep` applies it to every action in the streamed output.
|
||||
|
||||
### Combining relative actions with EE space
|
||||
|
||||
Relative actions work in both joint space and EE space. For example, if your dataset stores EE actions, relative encoding converts them to offsets from the current EE pose:
|
||||
|
||||
```
|
||||
current_ee_state = [x: 0.25, y: -0.10, z: 0.15, gripper: 0.8]
|
||||
|
||||
absolute_ee_chunk = [
|
||||
[0.26, -0.09, 0.16, 0.8],
|
||||
[0.28, -0.07, 0.18, 0.8],
|
||||
]
|
||||
|
||||
relative_ee_chunk = [
|
||||
[0.01, 0.01, 0.01, 0.0], # offset from current EE pose
|
||||
[0.03, 0.03, 0.03, 0.0], # offset from current EE pose
|
||||
]
|
||||
```
|
||||
|
||||
## Processing Pipeline Summary
|
||||
|
||||
Here is how the different processors compose. Each arrow is a processor step, and they can be chained in a `RobotProcessorPipeline` or `PolicyProcessorPipeline`:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────┐
|
||||
Action Space │ Joint Space ←──IK──→ EE Space │
|
||||
│ ForwardKinematicsJointsToEE │
|
||||
│ InverseKinematicsEEToJoints │
|
||||
└─────────────────────────────────────────┘
|
||||
|
||||
┌─────────────────────────────────────────┐
|
||||
Representation │ Absolute ←────→ Relative │
|
||||
│ RelativeActionsProcessorStep (pre) │
|
||||
│ AbsoluteActionsProcessorStep (post) │
|
||||
└─────────────────────────────────────────┘
|
||||
|
||||
┌─────────────────────────────────────────┐
|
||||
Normalization │ Raw ←────→ Normalized │
|
||||
│ NormalizerProcessorStep (pre) │
|
||||
│ UnnormalizerProcessorStep (post) │
|
||||
└─────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
A typical training preprocessor might chain: `raw absolute joint actions → relative → normalize`. A typical inference postprocessor: `unnormalize → absolute → (optionally IK to joints)`.
|
||||
|
||||
## References
|
||||
|
||||
- [Universal Manipulation Interface (UMI)](https://arxiv.org/abs/2402.10329) - Chi et al., 2024. Defines the relative trajectory action representation and compares it with absolute and delta actions.
|
||||
- [Introduction to Processors](./introduction_processors) - How processor pipelines work in LeRobot.
|
||||
- [`examples/so100_to_so100_EE/`](https://github.com/huggingface/lerobot/tree/main/examples/so100_to_so100_EE) - Complete example of recording and evaluating with EE-space actions.
|
||||
@@ -0,0 +1,329 @@
|
||||
# Adding a New Benchmark
|
||||
|
||||
This guide walks you through adding a new simulation benchmark to LeRobot. Follow the steps in order and use the existing benchmarks as templates.
|
||||
|
||||
A benchmark in LeRobot is a set of [Gymnasium](https://gymnasium.farama.org/) environments that wrap a third-party simulator (like LIBERO or Meta-World) behind a standard `gym.Env` interface. The `lerobot-eval` CLI then runs evaluation uniformly across all benchmarks.
|
||||
|
||||
## Existing benchmarks at a glance
|
||||
|
||||
Before diving in, here is what is already integrated:
|
||||
|
||||
| Benchmark | Env file | Config class | Tasks | Action dim | Processor |
|
||||
| -------------- | ------------------- | ------------------ | ------------------- | ------------ | ---------------------------- |
|
||||
| LIBERO | `envs/libero.py` | `LiberoEnv` | 130 across 5 suites | 7 | `LiberoProcessorStep` |
|
||||
| Meta-World | `envs/metaworld.py` | `MetaworldEnv` | 50 (MT50) | 4 | None |
|
||||
| IsaacLab Arena | Hub-hosted | `IsaaclabArenaEnv` | Configurable | Configurable | `IsaaclabArenaProcessorStep` |
|
||||
|
||||
Use `src/lerobot/envs/libero.py` and `src/lerobot/envs/metaworld.py` as reference implementations.
|
||||
|
||||
## How it all fits together
|
||||
|
||||
### Data flow
|
||||
|
||||
During evaluation, data moves through four stages:
|
||||
|
||||
```
|
||||
1. gym.Env ──→ raw observations (numpy dicts)
|
||||
|
||||
2. Preprocessing ──→ standard LeRobot keys + task description
|
||||
(preprocess_observation, add_envs_task in envs/utils.py)
|
||||
|
||||
3. Processors ──→ env-specific then policy-specific transforms
|
||||
(env_preprocessor, policy_preprocessor)
|
||||
|
||||
4. Policy ──→ select_action() ──→ action tensor
|
||||
then reverse: policy_postprocessor → env_postprocessor → numpy action → env.step()
|
||||
```
|
||||
|
||||
Most benchmarks only need to care about stage 1 (producing observations in the right format) and optionally stage 3 (if env-specific transforms are needed).
|
||||
|
||||
### Environment structure
|
||||
|
||||
`make_env()` returns a nested dict of vectorized environments:
|
||||
|
||||
```python
|
||||
dict[str, dict[int, gym.vector.VectorEnv]]
|
||||
# ^suite ^task_id
|
||||
```
|
||||
|
||||
A single-task env (e.g. PushT) looks like `{"pusht": {0: vec_env}}`.
|
||||
A multi-task benchmark (e.g. LIBERO) looks like `{"libero_spatial": {0: vec0, 1: vec1, ...}, ...}`.
|
||||
|
||||
### How evaluation runs
|
||||
|
||||
All benchmarks are evaluated the same way by `lerobot-eval`:
|
||||
|
||||
1. `make_env()` builds the nested `{suite: {task_id: VectorEnv}}` dict.
|
||||
2. `eval_policy_all()` iterates over every suite and task.
|
||||
3. For each task, it runs `n_episodes` rollouts via `rollout()`.
|
||||
4. Results are aggregated hierarchically: episode, task, suite, overall.
|
||||
5. Metrics include `pc_success` (success rate), `avg_sum_reward`, and `avg_max_reward`.
|
||||
|
||||
The critical piece: your env must return `info["is_success"]` on every `step()` call. This is how the eval loop knows whether a task was completed.
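
To make that concrete, here is a heavily condensed sketch of what the eval loop does with the nested structure. Apart from `select_action()` and `is_success`, the names are placeholders, and the real `eval_policy_all()` / `rollout()` handle batching, rendering, and metric aggregation in much more detail:

```python
def evaluate_all(envs, policy, n_episodes: int) -> dict:
    """Sketch only. `envs` is the {suite: {task_id: VectorEnv}} dict returned by make_env()."""
    pc_success = {}
    for suite, tasks in envs.items():
        for task_id, vec_env in tasks.items():
            wins = 0
            for _ in range(n_episodes):
                obs, info = vec_env.reset()
                done, success = False, False
                while not done:
                    action = policy.select_action(obs)  # stage 4 of the data flow above
                    obs, reward, terminated, truncated, info = vec_env.step(action)
                    done = bool(terminated[0] or truncated[0])
                    success = success or bool(info["is_success"][0])  # the key the eval loop relies on
                wins += int(success)
            pc_success[(suite, task_id)] = 100.0 * wins / n_episodes
    return pc_success
```
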
|
||||
|
||||
## What your environment must provide
|
||||
|
||||
LeRobot does not enforce a strict observation schema. Instead it relies on a set of conventions that all benchmarks follow.
|
||||
|
||||
### Env attributes
|
||||
|
||||
Your `gym.Env` must set these attributes:
|
||||
|
||||
| Attribute | Type | Why |
|
||||
| -------------------- | ----- | ---------------------------------------------------- |
|
||||
| `_max_episode_steps` | `int` | `rollout()` uses this to cap episode length |
|
||||
| `task_description` | `str` | Passed to VLA policies as a language instruction |
|
||||
| `task` | `str` | Fallback identifier if `task_description` is not set |
|
||||
|
||||
### Success reporting
|
||||
|
||||
Your `step()` and `reset()` must include `"is_success"` in the `info` dict:
|
||||
|
||||
```python
|
||||
info = {"is_success": True} # or False
|
||||
return observation, reward, terminated, truncated, info
|
||||
```
|
||||
|
||||
### Observations
|
||||
|
||||
The simplest approach is to map your simulator's outputs to the standard keys that `preprocess_observation()` already understands. Do this inside your `gym.Env` (e.g. in a `_format_raw_obs()` helper):
|
||||
|
||||
| Your env should output | LeRobot maps it to | What it is |
|
||||
| ------------------------- | -------------------------- | ------------------------------------- |
|
||||
| `"pixels"` (single array) | `observation.image` | Single camera image, HWC uint8 |
|
||||
| `"pixels"` (dict) | `observation.images.<cam>` | Multiple cameras, each HWC uint8 |
|
||||
| `"agent_pos"` | `observation.state` | Proprioceptive state vector |
|
||||
| `"environment_state"` | `observation.env_state` | Full environment state (e.g. PushT) |
|
||||
| `"robot_state"` | `observation.robot_state` | Nested robot state dict (e.g. LIBERO) |
|
||||
|
||||
If your simulator uses different key names, you have two options:
|
||||
|
||||
1. **Recommended:** Rename them to the standard keys inside your `gym.Env` wrapper (see the sketch after this list).
|
||||
2. **Alternative:** Write an env processor to transform observations after `preprocess_observation()` runs (see step 4 below).
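
For the recommended option, here is a minimal sketch of the kind of `_format_raw_obs()` helper the existing envs use; the simulator-side key names (`rgb_front`, `rgb_wrist`, `proprio`) are hypothetical:

```python
import numpy as np


def _format_raw_obs(raw_obs: dict) -> dict:
    """Rename simulator-specific keys to the standard keys preprocess_observation() expects."""
    return {
        "pixels": {
            # multiple cameras -> dict of HWC uint8 images
            "front": raw_obs["rgb_front"].astype(np.uint8),
            "wrist": raw_obs["rgb_wrist"].astype(np.uint8),
        },
        "agent_pos": raw_obs["proprio"].astype(np.float32),  # proprioceptive state vector
    }
```
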
|
||||
|
||||
### Actions
|
||||
|
||||
Actions are continuous numpy arrays in a `gym.spaces.Box`. The dimensionality depends on your benchmark (7 for LIBERO, 4 for Meta-World, etc.). Policies adapt to different action dimensions through their `input_features` / `output_features` config.
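
For example, a 7-dimensional continuous action space could be declared in your wrapper's `__init__` like this (the bounds are illustrative):

```python
import numpy as np
from gymnasium import spaces

action_space = spaces.Box(low=-1.0, high=1.0, shape=(7,), dtype=np.float32)
sample = action_space.sample()  # shape (7,), dtype float32
```
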
|
||||
|
||||
### Feature declaration
|
||||
|
||||
Each `EnvConfig` subclass declares two dicts that tell the policy what to expect:
|
||||
|
||||
- `features` — maps feature names to `PolicyFeature(type, shape)` (e.g. action dim, image shape).
|
||||
- `features_map` — maps raw observation keys to LeRobot convention keys (e.g. `"agent_pos"` to `"observation.state"`).
|
||||
|
||||
## Step by step
|
||||
|
||||
<Tip>
|
||||
At minimum, you need three files: a **gym.Env wrapper**, an **EnvConfig
|
||||
subclass**, and a **factory dispatch branch**. Everything else is optional or
|
||||
documentation.
|
||||
</Tip>
|
||||
|
||||
### Checklist
|
||||
|
||||
| File | Required | Why |
|
||||
| ---------------------------------------- | -------- | ----------------------------------------- |
|
||||
| `src/lerobot/envs/<benchmark>.py` | Yes | Wraps the simulator as a standard gym.Env |
|
||||
| `src/lerobot/envs/configs.py` | Yes | Registers your benchmark for the CLI |
|
||||
| `src/lerobot/envs/factory.py` | Yes | Tells `make_env()` how to build your envs |
|
||||
| `src/lerobot/processor/env_processor.py` | Optional | Custom observation/action transforms |
|
||||
| `src/lerobot/envs/utils.py` | Optional | Only if you need new raw observation keys |
|
||||
| `pyproject.toml` | Yes | Declares benchmark-specific dependencies |
|
||||
| `docs/source/<benchmark>.mdx` | Yes | User-facing documentation page |
|
||||
| `docs/source/_toctree.yml` | Yes | Adds your page to the docs sidebar |
|
||||
|
||||
### 1. The gym.Env wrapper (`src/lerobot/envs/<benchmark>.py`)
|
||||
|
||||
Create a `gym.Env` subclass that wraps the third-party simulator:
|
||||
|
||||
```python
|
||||
class MyBenchmarkEnv(gym.Env):
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": <fps>}
|
||||
|
||||
def __init__(self, task_suite, task_id, ...):
|
||||
super().__init__()
|
||||
self.task = <task_name_string>
|
||||
self.task_description = <natural_language_instruction>
|
||||
self._max_episode_steps = <max_steps>
|
||||
self.observation_space = spaces.Dict({...})
|
||||
self.action_space = spaces.Box(low=..., high=..., shape=(...,), dtype=np.float32)
|
||||
|
||||
def reset(self, seed=None, **kwargs):
|
||||
... # return (observation, info) — info must contain {"is_success": False}
|
||||
|
||||
def step(self, action: np.ndarray):
|
||||
... # return (obs, reward, terminated, truncated, info) — info must contain {"is_success": <bool>}
|
||||
|
||||
def render(self):
|
||||
... # return RGB image as numpy array
|
||||
|
||||
def close(self):
|
||||
...
|
||||
```
|
||||
|
||||
Also provide a factory function that returns the nested dict structure:
|
||||
|
||||
```python
|
||||
def create_mybenchmark_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
gym_kwargs: dict | None = None,
|
||||
env_cls: type | None = None,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""Create {suite_name: {task_id: VectorEnv}} for MyBenchmark."""
|
||||
...
|
||||
```
|
||||
|
||||
See `create_libero_envs()` (multi-suite, multi-task) and `create_metaworld_envs()` (difficulty-grouped tasks) for reference.
|
||||
|
||||
### 2. The config (`src/lerobot/envs/configs.py`)
|
||||
|
||||
Register a config dataclass so users can select your benchmark with `--env.type=<name>`:
|
||||
|
||||
```python
|
||||
@EnvConfig.register_subclass("<benchmark_name>")
|
||||
@dataclass
|
||||
class MyBenchmarkEnvConfig(EnvConfig):
|
||||
task: str = "<default_task>"
|
||||
fps: int = <fps>
|
||||
obs_type: str = "pixels_agent_pos"
|
||||
|
||||
features: dict[str, PolicyFeature] = field(default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(<action_dim>,)),
|
||||
})
|
||||
features_map: dict[str, str] = field(default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"agent_pos": OBS_STATE,
|
||||
"pixels": OBS_IMAGE,
|
||||
})
|
||||
|
||||
def __post_init__(self):
|
||||
... # populate features based on obs_type
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {"obs_type": self.obs_type, "render_mode": self.render_mode}
|
||||
```
|
||||
|
||||
Key points:
|
||||
|
||||
- The `register_subclass` name is what users pass on the CLI (`--env.type=<name>`).
|
||||
- `features` tells the policy what the environment produces.
|
||||
- `features_map` maps raw observation keys to LeRobot convention keys.
|
||||
|
||||
### 3. The factory dispatch (`src/lerobot/envs/factory.py`)
|
||||
|
||||
Add a branch in `make_env()` to call your factory function:
|
||||
|
||||
```python
|
||||
elif "<benchmark_name>" in cfg.type:
|
||||
from lerobot.envs.<benchmark> import create_<benchmark>_envs
|
||||
|
||||
if cfg.task is None:
|
||||
raise ValueError("<BenchmarkName> requires a task to be specified")
|
||||
|
||||
return create_<benchmark>_envs(
|
||||
task=cfg.task,
|
||||
n_envs=n_envs,
|
||||
gym_kwargs=cfg.gym_kwargs,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
```
|
||||
|
||||
If your benchmark needs an env processor, add it in `make_env_pre_post_processors()`:
|
||||
|
||||
```python
|
||||
if isinstance(env_cfg, MyBenchmarkEnvConfig) or "<benchmark_name>" in env_cfg.type:
|
||||
preprocessor_steps.append(MyBenchmarkProcessorStep())
|
||||
```
|
||||
|
||||
### 4. Env processor (optional — `src/lerobot/processor/env_processor.py`)
|
||||
|
||||
Only needed if your benchmark requires observation transforms beyond what `preprocess_observation()` handles (e.g. image flipping, coordinate conversion):
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
@ProcessorStepRegistry.register(name="<benchmark>_processor")
|
||||
class MyBenchmarkProcessorStep(ObservationProcessorStep):
|
||||
def _process_observation(self, observation):
|
||||
processed = observation.copy()
|
||||
# your transforms here
|
||||
return processed
|
||||
|
||||
def transform_features(self, features):
|
||||
return features # update if shapes change
|
||||
|
||||
def observation(self, observation):
|
||||
return self._process_observation(observation)
|
||||
```
|
||||
|
||||
See `LiberoProcessorStep` for a full example (image rotation, quaternion-to-axis-angle conversion).
|
||||
|
||||
### 5. Dependencies (`pyproject.toml`)
|
||||
|
||||
Add a new optional-dependency group:
|
||||
|
||||
```toml
|
||||
mybenchmark = ["my-benchmark-pkg==1.2.3", "lerobot[scipy-dep]"]
|
||||
```
|
||||
|
||||
Pinning rules:
|
||||
|
||||
- **Always pin** benchmark packages to exact versions for reproducibility (e.g. `metaworld==3.0.0`).
|
||||
- **Add platform markers** when needed (e.g. `; sys_platform == 'linux'`).
|
||||
- **Pin fragile transitive deps** if known (e.g. `gymnasium==1.1.0` for Meta-World).
|
||||
- **Document constraints** in your benchmark doc page.
|
||||
|
||||
Users install with:
|
||||
|
||||
```bash
|
||||
pip install -e ".[mybenchmark]"
|
||||
```
|
||||
|
||||
### 6. Documentation (`docs/source/<benchmark>.mdx`)
|
||||
|
||||
Write a user-facing page following the template in the next section. See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for full examples.
|
||||
|
||||
### 7. Table of contents (`docs/source/_toctree.yml`)
|
||||
|
||||
Add your benchmark to the "Benchmarks" section:
|
||||
|
||||
```yaml
|
||||
- sections:
|
||||
- local: libero
|
||||
title: LIBERO
|
||||
- local: metaworld
|
||||
title: Meta-World
|
||||
- local: envhub_isaaclab_arena
|
||||
title: NVIDIA IsaacLab Arena Environments
|
||||
- local: <your_benchmark>
|
||||
title: <Your Benchmark Name>
|
||||
title: "Benchmarks"
|
||||
```
|
||||
|
||||
## Verifying your integration
|
||||
|
||||
After completing the steps above, confirm that everything works:
|
||||
|
||||
1. **Install** — `pip install -e ".[mybenchmark]"` and verify the dependency group installs cleanly.
|
||||
2. **Smoke test env creation** — call `make_env()` with your config in Python, check that the returned dict has the expected `{suite: {task_id: VectorEnv}}` shape, and that `reset()` returns observations with the right keys (a sketch follows this list).
|
||||
3. **Run a full eval** — `lerobot-eval --env.type=<name> --env.task=<task> --eval.n_episodes=1 --eval.batch_size=1 --policy.path=<any_compatible_policy>` to exercise the full pipeline end-to-end.
|
||||
4. **Check success detection** — verify that `info["is_success"]` flips to `True` when the task is actually completed. This is what the eval loop uses to compute success rates.
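
A minimal sketch for the smoke test in step 2. The config class name and task string are placeholders for your benchmark, and the exact `make_env()` signature may differ slightly:

```python
from lerobot.envs.factory import make_env
from lerobot.envs.configs import MyBenchmarkEnvConfig  # your new config class

cfg = MyBenchmarkEnvConfig(task="<default_task>")
envs = make_env(cfg, n_envs=1)

# Expect the nested {suite: {task_id: VectorEnv}} structure described above.
for suite, tasks in envs.items():
    for task_id, vec_env in tasks.items():
        obs, info = vec_env.reset(seed=0)
        print(suite, task_id, sorted(obs.keys()), info.get("is_success"))
```
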
|
||||
|
||||
## Writing a benchmark doc page
|
||||
|
||||
Each benchmark `.mdx` page should include:
|
||||
|
||||
- **Title and description** — 1-2 paragraphs on what the benchmark tests and why it matters.
|
||||
- **Links** — paper, GitHub repo, project website (if available).
|
||||
- **Overview image or GIF.**
|
||||
- **Available tasks** — table of task suites with counts and brief descriptions.
|
||||
- **Installation** — `pip install -e ".[<benchmark>]"` plus any extra steps (env vars, system packages).
|
||||
- **Evaluation** — recommended `lerobot-eval` command with `n_episodes` and `batch_size` for reproducible results. Include single-task and multi-task examples if applicable.
|
||||
- **Policy inputs and outputs** — observation keys with shapes, action space description.
|
||||
- **Recommended evaluation episodes** — how many episodes per task is standard.
|
||||
- **Training** — example `lerobot-train` command.
|
||||
- **Reproducing published results** — link to pretrained model, eval command, results table (if available).
|
||||
|
||||
See `docs/source/libero.mdx` and `docs/source/metaworld.mdx` for complete examples.
|
||||
@@ -41,13 +41,15 @@ requires = # your-build-system
|
||||
|
||||
## Step 2: Define the Policy Configuration
|
||||
|
||||
Create a configuration class that inherits from `PreTrainedConfig` and registers your policy type:
|
||||
Create a configuration class that inherits from [`PreTrainedConfig`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/configs/policies.py) and registers your policy type:
|
||||
Here is a template to get you started; customize the parameters and methods as needed for your policy's architecture and training requirements.
|
||||
|
||||
```python
|
||||
# configuration_my_custom_policy.py
|
||||
from dataclasses import dataclass, field
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import NormalizationMode
|
||||
from lerobot.optim.optimizers import AdamWConfig
|
||||
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
||||
|
||||
@PreTrainedConfig.register_subclass("my_custom_policy")
|
||||
@dataclass
|
||||
@@ -61,22 +63,56 @@ class MyCustomPolicyConfig(PreTrainedConfig):
|
||||
hidden_dim: Hidden dimension for the policy network
|
||||
# Add your policy-specific parameters here
|
||||
"""
|
||||
# ...PreTrainedConfig fields...
|
||||
pass
|
||||
|
||||
horizon: int = 50
|
||||
n_action_steps: int = 50
|
||||
hidden_dim: int = 256
|
||||
|
||||
optimizer_lr: float = 1e-4
|
||||
optimizer_weight_decay: float = 1e-4
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
# Add any validation logic here
|
||||
if self.n_action_steps > self.horizon:
|
||||
raise ValueError("n_action_steps cannot exceed horizon")
|
||||
|
||||
def validate_features(self) -> None:
|
||||
"""Validate input/output feature compatibility."""
|
||||
# Implement validation logic for your policy's requirements
|
||||
pass
|
||||
if not self.image_features:
|
||||
raise ValueError("MyCustomPolicy requires at least one image feature.")
|
||||
if self.action_feature is None:
|
||||
raise ValueError("MyCustomPolicy requires 'action' in output_features.")
|
||||
|
||||
def get_optimizer_preset(self) -> AdamWConfig:
|
||||
return AdamWConfig(lr=self.optimizer_lr, weight_decay=self.optimizer_weight_decay)
|
||||
|
||||
def get_scheduler_preset(self):
|
||||
return None
|
||||
|
||||
@property
|
||||
def observation_delta_indices(self) -> list[int] | None:
|
||||
"""Relative timestep offsets the dataset loader provides per observation.
|
||||
|
||||
Return `None` for single-frame policies. For temporal policies that consume
|
||||
multiple past or future frames, return a list of offsets, e.g. `[-20, -10, 0, 10]` for
two past frames, the current frame, and one future frame at stride 10.
|
||||
"""
|
||||
return None
|
||||
|
||||
@property
|
||||
def action_delta_indices(self) -> list[int]:
|
||||
"""Relative timestep offsets for the action chunk the dataset loader returns.
|
||||
"""
|
||||
return list(range(self.horizon))
|
||||
|
||||
@property
|
||||
def reward_delta_indices(self) -> None:
|
||||
return None
|
||||
```
|
||||
|
||||
## Step 3: Implement the Policy Class
|
||||
|
||||
Create your policy implementation by inheriting from LeRobot's base `PreTrainedPolicy` class:
|
||||
Create your policy implementation by inheriting from [`PreTrainedPolicy`](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/pretrained.py):
|
||||
|
||||
```python
|
||||
# modeling_my_custom_policy.py
|
||||
@@ -85,38 +121,74 @@ import torch.nn as nn
|
||||
from typing import Any
|
||||
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.utils.constants import ACTION
|
||||
from .configuration_my_custom_policy import MyCustomPolicyConfig
|
||||
|
||||
class MyCustomPolicy(PreTrainedPolicy):
|
||||
config_class = MyCustomPolicyConfig
|
||||
config_class = MyCustomPolicyConfig # must match the string in @register_subclass
|
||||
name = "my_custom_policy"
|
||||
|
||||
def __init__(self, config: MyCustomPolicyConfig, dataset_stats: dict[str, Any] = None):
|
||||
super().__init__(config, dataset_stats)
|
||||
config.validate_features() # not called automatically by the base class
|
||||
self.config = config
|
||||
self.model = ... # your nn.Module here
|
||||
|
||||
def reset(self):
|
||||
"""Reset episode state."""
|
||||
...
|
||||
|
||||
def get_optim_params(self) -> dict:
|
||||
"""Return parameters to pass to the optimizer (e.g. with per-group lr/wd)."""
|
||||
return {"params": self.parameters()}
|
||||
|
||||
def predict_action_chunk(self, batch: dict[str, torch.Tensor], **kwargs) -> torch.Tensor:
|
||||
"""Return the full action chunk (B, chunk_size, action_dim) for the current observation."""
|
||||
...
|
||||
|
||||
def select_action(self, batch: dict[str, torch.Tensor], **kwargs) -> torch.Tensor:
|
||||
"""Return a single action for the current timestep (called at inference)."""
|
||||
...
|
||||
|
||||
def forward(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
|
||||
"""Compute the training loss.
|
||||
|
||||
`batch["action_is_pad"]` is a bool mask of shape (B, horizon) that marks
|
||||
timesteps padded because the episode ended before `horizon` steps, you
|
||||
can exclude those from your loss.
|
||||
"""
|
||||
actions = batch[ACTION]
|
||||
action_is_pad = batch.get("action_is_pad")
|
||||
...
|
||||
return {"loss": ...}
|
||||
```
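If you want to exclude the padded timesteps from the loss, here is a minimal sketch; `pred_actions` is an assumed model output with the same shape as `actions`:

```python
import torch.nn.functional as F


def masked_action_loss(pred_actions, actions, action_is_pad=None):
    # Mean-squared error over the action chunk, ignoring padded timesteps.
    loss = F.mse_loss(pred_actions, actions, reduction="none")  # (B, horizon, action_dim)
    if action_is_pad is None:
        return loss.mean()
    mask = (~action_is_pad).unsqueeze(-1).expand_as(loss)       # True on valid timesteps
    return (loss * mask).sum() / mask.sum().clamp(min=1)
```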
|
||||
|
||||
## Step 4: Add Data Processors
|
||||
|
||||
Create processor functions:
|
||||
Create processor functions. For a concrete reference, see [processor_act.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/act/processor_act.py) or [processor_diffusion.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/diffusion/processor_diffusion.py).
|
||||
|
||||
```python
|
||||
# processor_my_custom_policy.py
|
||||
from typing import Any
|
||||
import torch
|
||||
|
||||
from lerobot.processor import PolicyAction, PolicyProcessorPipeline
|
||||
|
||||
|
||||
def make_my_custom_policy_pre_post_processors(
|
||||
config,
|
||||
dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
|
||||
) -> tuple[
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction],
|
||||
]:
|
||||
"""Create preprocessing and postprocessing functions for your policy."""
|
||||
pass # Define your preprocessing and postprocessing logic here
|
||||
|
||||
preprocessor = ... # build your PolicyProcessorPipeline for inputs
|
||||
postprocessor = ... # build your PolicyProcessorPipeline for outputs
|
||||
return preprocessor, postprocessor
|
||||
```
|
||||
|
||||
**Important - function naming:** LeRobot discovers your processor by name. The function **must** be called `make_{policy_name}_pre_post_processors` (matching the string you passed to `@PreTrainedConfig.register_subclass`).
|
||||
|
||||
## Step 5: Package Initialization
|
||||
|
||||
Expose your classes in the package's `__init__.py`:
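A minimal sketch of what this could look like, using the module and class names from the previous steps (adjust to the names you actually defined):

```python
# __init__.py — a minimal sketch, assuming the files created in Steps 2-4
from .configuration_my_custom_policy import MyCustomPolicyConfig
from .modeling_my_custom_policy import MyCustomPolicy
from .processor_my_custom_policy import make_my_custom_policy_pre_post_processors

__all__ = [
    "MyCustomPolicyConfig",
    "MyCustomPolicy",
    "make_my_custom_policy_pre_post_processors",
]
```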
|
||||
|
||||
@@ -131,4 +131,4 @@ lerobot-record \
|
||||
|
||||
## License
|
||||
|
||||
This model follows the **Apache 2.0 License**, consistent with the original [GR00T repository](https://github.com/NVIDIA/Isaac-GR00T).
|
||||
This model follows NVIDIA's proprietary license, consistent with the original [GR00T repository](https://github.com/NVIDIA/Isaac-GR00T). Future versions (starting from N1.7) will follow the **Apache 2.0 License**.
|
||||
|
||||
@@ -0,0 +1,269 @@
|
||||
# Human-In-the-Loop Data Collection
|
||||
|
||||
Human-In-the-Loop (HIL) data collection lets you improve a trained policy by deploying it on a real robot while a human operator monitors and intervenes when needed. The intervention data (recovery movements and corrections) is recorded alongside autonomous segments, producing a richer training dataset that teaches the policy how to handle failures.
|
||||
|
||||
---
|
||||
|
||||
## Why Human-In-the-Loop?
|
||||
|
||||
Standard behavioral cloning trains policies on successful demonstrations only. During deployment, small errors can compound and push the robot into states never seen during training (distribution shift). HIL data collection addresses this by:
|
||||
|
||||
- Running the trained policy on the real robot
|
||||
- Having a human intervene when the robot is about to fail
|
||||
- Recording the human's recovery and correction as training data
|
||||
- Fine-tuning the policy on the combined dataset
|
||||
|
||||
This produces a policy that not only knows how to perform the task, but also how to recover when things go wrong.
|
||||
|
||||
---
|
||||
|
||||
## How It Works
|
||||
|
||||
During a HIL session, the human operator follows this loop within each episode:
|
||||
|
||||
1. **Watch** the policy run autonomously
|
||||
2. **Pause** when failure is imminent; the robot holds its position
3. **Take control** and teleoperate the robot back to a good state (recovery), then correct the behavior
4. **Return control to the policy**; the policy resumes autonomous execution
|
||||
5. Repeat steps 2–4 as many times as needed during the episode
|
||||
6. **End the episode** when the task is complete, save and move on to the next rollout
|
||||
|
||||
Both autonomous and human-controlled segments are recorded. The policy and human can alternate control multiple times within a single episode, and the episode continues from the current state after each handoff (no reset required just because intervention happened). This captures autonomous execution, recovery, and correction in one continuous trajectory. After collection, the combined dataset (original demonstrations + HIL data) is used to fine-tune the policy.
|
||||
|
||||
This process can be repeated iteratively: deploy, collect, fine-tune, repeat. Each round targets the current policy's failure modes.
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ Policy v0 (trained on demos) │
|
||||
│ ↓ │
|
||||
│ HIL Collection (target current failure modes) → Fine-tune → Policy v1 │
|
||||
│ ↓ │
|
||||
│ HIL Collection (target new failure modes) → Fine-tune → Policy v2 │
|
||||
│ ↓ │
|
||||
│ ... (repeat until satisfactory performance) │
|
||||
└─────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
### Teleoperator Requirements
|
||||
|
||||
The HIL scripts in `examples/hil` require **teleoperators with active motors** that can:
|
||||
|
||||
- Enable/disable torque programmatically
|
||||
- Move to target positions (to mirror the robot state when pausing)
|
||||
|
||||
**Compatible teleoperators in the current `examples/hil` scripts:**
|
||||
|
||||
- `openarm_mini` - OpenArm Mini
|
||||
- `so_leader` - SO100 / SO101 leader arm
|
||||
|
||||
> [!IMPORTANT]
|
||||
> The provided `examples/hil` commands default to `bi_openarm_follower` + `openarm_mini`.
|
||||
> `so_follower` + `so_leader` configs are also registered and can be used via CLI flags.
|
||||
|
||||
---
|
||||
|
||||
## Script
|
||||
|
||||
A single script handles both synchronous and RTC-based inference. Toggle RTC with `--rtc.enabled=true`:
|
||||
|
||||
| Mode | Flag | Models |
|
||||
| ------------------------ | -------------------- | --------------------- |
|
||||
| Standard (default) | _(no flag needed)_ | ACT, Diffusion Policy |
|
||||
| Real-Time Chunking (RTC) | `--rtc.enabled=true` | Pi0, Pi0.5, SmolVLA |
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step Guide
|
||||
|
||||
### Step 1: Pre-train a Base Policy
|
||||
|
||||
First, train a policy on your demonstration dataset:
|
||||
|
||||
```bash
|
||||
python src/lerobot/scripts/lerobot_train.py \
|
||||
--dataset.repo_id=your-username/demo-dataset \
|
||||
--policy.type=pi0 \
|
||||
--output_dir=outputs/pretrain \
|
||||
--batch_size=32 \
|
||||
--steps=50000
|
||||
```
|
||||
|
||||
### Step 2: Collect HIL Data
|
||||
|
||||
**Standard inference (ACT, Diffusion Policy):**
|
||||
|
||||
```bash
|
||||
python examples/hil/hil_data_collection.py \
|
||||
--robot.type=bi_openarm_follower \
|
||||
--robot.left_arm_config.port=can1 \
|
||||
--robot.left_arm_config.side=left \
|
||||
--robot.right_arm_config.port=can0 \
|
||||
--robot.right_arm_config.side=right \
|
||||
--robot.cameras='{left_wrist: {type: opencv, index_or_path: "/dev/video0", width: 1280, height: 720, fps: 30}, right_wrist: {type: opencv, index_or_path: "/dev/video4", width: 1280, height: 720, fps: 30}, base: {type: opencv, index_or_path: "/dev/video2", width: 640, height: 480, fps: 30}}' \
|
||||
--teleop.type=openarm_mini \
|
||||
--teleop.port_left=/dev/ttyACM0 \
|
||||
--teleop.port_right=/dev/ttyACM1 \
|
||||
--policy.path=outputs/pretrain/checkpoints/last/pretrained_model \
|
||||
--dataset.repo_id=your-username/hil-dataset \
|
||||
--dataset.single_task="Fold the T-shirt properly" \
|
||||
--dataset.fps=30 \
|
||||
--dataset.episode_time_s=1000 \
|
||||
--dataset.num_episodes=50 \
|
||||
--interpolation_multiplier=2
|
||||
```
|
||||
|
||||
**With RTC for large models (Pi0, Pi0.5, SmolVLA):**
|
||||
|
||||
For models with high inference latency, enable RTC for smooth execution:
|
||||
|
||||
```bash
|
||||
python examples/hil/hil_data_collection.py \
|
||||
--rtc.enabled=true \
|
||||
--rtc.execution_horizon=20 \
|
||||
--rtc.max_guidance_weight=5.0 \
|
||||
--rtc.prefix_attention_schedule=LINEAR \
|
||||
--robot.type=bi_openarm_follower \
|
||||
--robot.left_arm_config.port=can1 \
|
||||
--robot.left_arm_config.side=left \
|
||||
--robot.right_arm_config.port=can0 \
|
||||
--robot.right_arm_config.side=right \
|
||||
--robot.cameras='{left_wrist: {type: opencv, index_or_path: "/dev/video0", width: 1280, height: 720, fps: 30}, right_wrist: {type: opencv, index_or_path: "/dev/video4", width: 1280, height: 720, fps: 30}, base: {type: opencv, index_or_path: "/dev/video2", width: 640, height: 480, fps: 30}}' \
|
||||
--teleop.type=openarm_mini \
|
||||
--teleop.port_left=/dev/ttyACM0 \
|
||||
--teleop.port_right=/dev/ttyACM1 \
|
||||
--policy.path=outputs/pretrain/checkpoints/last/pretrained_model \
|
||||
--dataset.repo_id=your-username/hil-rtc-dataset \
|
||||
--dataset.single_task="Fold the T-shirt properly" \
|
||||
--dataset.fps=30 \
|
||||
--dataset.episode_time_s=1000 \
|
||||
--dataset.num_episodes=50 \
|
||||
--interpolation_multiplier=3
|
||||
```
|
||||
|
||||
**Controls (Conceptual):**
|
||||
|
||||
The interaction model is:
|
||||
|
||||
- **Pause input**: pause autonomous policy execution
|
||||
- **Takeover input**: transfer control to the human operator and record intervention data
|
||||
- **Return-to-policy input**: hand control back to the policy and continue the same episode
|
||||
- **Episode control inputs**: save/re-record/stop/reset as needed
|
||||
|
||||
Exact key/pedal bindings can differ across scripts and hardware integrations. Use each script's printed controls as the source of truth for the concrete mapping on your setup.
|
||||
|
||||
**The HIL Protocol:**
|
||||
|
||||
1. Watch the policy run autonomously (teleop is idle/free)
|
||||
2. When you see imminent failure, trigger the **pause input**
|
||||
- Policy stops
|
||||
- Teleoperator moves to match robot position (torque enabled)
|
||||
- No frames recorded during pause
|
||||
3. Trigger the **takeover input** to take control
|
||||
- Teleoperator torque disabled, free to move
|
||||
- **Recovery**: Teleoperate the robot back to a good state
|
||||
- **Correction**: Correct the behavior
|
||||
- All movements are recorded
|
||||
4. Trigger the **return-to-policy input**
|
||||
- Policy resumes autonomous execution from the current state
|
||||
- You can intervene again at any time (repeat steps 2–4)
|
||||
5. End and save the episode when the task is complete (or episode time limit is reached)
|
||||
6. **Reset**: The teleop moves to the robot position; you can then move the robot back to the starting position
|
||||
7. Start the next episode
|
||||
|
||||
**Foot Pedal Setup (Linux):**
|
||||
|
||||
If using a USB foot pedal (PCsensor FootSwitch), ensure access:
|
||||
|
||||
```bash
|
||||
sudo setfacl -m u:$USER:rw /dev/input/by-id/usb-PCsensor_FootSwitch-event-kbd
|
||||
```
|
||||
|
||||
### Step 3: Fine-tune the Policy
|
||||
|
||||
Fine-tune on the **combined** dataset (`demo-dataset` + `hil-dataset` merged together):
|
||||
|
||||
```bash
|
||||
python src/lerobot/scripts/lerobot_train.py \
|
||||
--dataset.repo_id=your-username/hil-dataset \
|
||||
--policy.type=pi0 \
|
||||
--policy.pretrained_path=outputs/pretrain/checkpoints/last/pretrained_model \
|
||||
--output_dir=outputs/hil_finetune \
|
||||
--steps=20000
|
||||
```
|
||||
|
||||
Then deploy the fine-tuned policy and repeat from Step 2 to target its remaining failure modes.
|
||||
|
||||
---
|
||||
|
||||
## Tips for Effective HIL Collection
|
||||
|
||||
### When to Intervene
|
||||
|
||||
Intervene when you see:
|
||||
|
||||
- Robot about to make an irreversible mistake
|
||||
- Robot hesitating or showing uncertain behavior
|
||||
- Robot deviating from the expected trajectory
|
||||
|
||||
### Recovery: Teleoperating Back to a Good State
|
||||
|
||||
During recovery, teleoperate the robot back to a state where:
|
||||
|
||||
- The robot is in a familiar, in-distribution configuration
|
||||
- The current subtask can still be completed
|
||||
- The recovery trajectory itself is informative training data
|
||||
|
||||
### Quality of Corrections
|
||||
|
||||
During correction:
|
||||
|
||||
- Provide **confident, clean** trajectories
|
||||
- Complete the current subtask fully
|
||||
- Don't overcorrect or add unnecessary movements
|
||||
|
||||
---
|
||||
|
||||
## Related Work
|
||||
|
||||
This HIL data collection approach builds on ideas from interactive imitation learning:
|
||||
|
||||
- **DAgger** (Ross et al., 2011) introduced the core idea: instead of only training on expert demonstrations, query the expert for corrections on states the _learner_ visits. This breaks the compounding-error cycle of standard behavioral cloning by iteratively collecting on-policy data.
|
||||
|
||||
- **HG-DAgger** (Kelly et al., 2019) made this practical for robotics: a human expert monitors the robot and only intervenes when needed, rather than labeling every state. The gating between autonomous and human control is exactly the pause → takeover → return-to-policy loop used in the scripts here.
|
||||
|
||||
- **RaC** (Hu et al., 2025) scales this loop to long-horizon tasks by explicitly decomposing interventions into **recovery** (teleoperating back to a good state) and **correction** (demonstrating the right behavior from there). This decomposition is the protocol followed by the HIL scripts in `examples/hil`.
|
||||
|
||||
- **π0.6/RECAP** (Physical Intelligence, 2025) applies the same iterative collect-and-finetune loop at scale with VLA models, showing that even large pretrained policies benefit substantially from targeted human corrections on their own failure modes. π0.6 is trained using RECAP.
|
||||
|
||||
```bibtex
|
||||
@article{ross2011dagger,
|
||||
title={A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning},
|
||||
author={Ross, Stéphane and Gordon, Geoffrey and Bagnell, Drew},
|
||||
journal={Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics},
|
||||
year={2011}
|
||||
}
|
||||
|
||||
@article{kelly2019hgdagger,
|
||||
title={HG-DAgger: Interactive Imitation Learning with Human Experts},
|
||||
author={Kelly, Michael and Sidrane, Chelsea and Driggs-Campbell, Katherine and Kochenderfer, Mykel J},
|
||||
journal={arXiv preprint arXiv:1810.02890},
|
||||
year={2019}
|
||||
}
|
||||
|
||||
@article{hu2025rac,
|
||||
title={RaC: Robot Learning for Long-Horizon Tasks by Scaling Recovery and Correction},
|
||||
author={Hu, Zheyuan and Wu, Robyn and Enock, Naveen and Li, Jasmine and Kadakia, Riya and Erickson, Zackory and Kumar, Aviral},
|
||||
journal={arXiv preprint arXiv:2509.07953},
|
||||
year={2025}
|
||||
}
|
||||
|
||||
@article{pi2025recap,
|
||||
title={π0.6: a VLA That Learns From Experience},
|
||||
author={Physical Intelligence},
|
||||
year={2025}
|
||||
}
|
||||
```
|
||||
@@ -424,7 +424,7 @@ robot = SO100Follower(robot_config)
|
||||
robot.connect()
|
||||
|
||||
dataset = LeRobotDataset("<hf_username>/<dataset_repo_id>", episodes=[episode_idx])
|
||||
actions = dataset.hf_dataset.select_columns("action")
|
||||
actions = dataset.select_columns("action")
|
||||
|
||||
log_say(f"Replaying episode {episode_idx}")
|
||||
for idx in range(dataset.num_frames):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Installation
|
||||
|
||||
This guide uses `conda` (via miniforge) to manage environments (recommended). If you prefer another environment manager (e.g. `uv`, `venv`), ensure you have Python >=3.12 and `ffmpeg` installed with the `libsvtav1` encoder, then skip ahead to [Environment Setup](#step-2-environment-setup).
|
||||
This guide uses `conda` (via miniforge) to manage environments (recommended). If you prefer another environment manager (e.g. `uv`, `venv`), ensure you have Python >=3.12 and a setup that supports PyTorch >= 2.10, then skip ahead to [Environment Setup](#step-2-environment-setup).
|
||||
|
||||
## Step 1 (`conda` only): Install [`miniforge`](https://conda-forge.org/download/)
|
||||
|
||||
@@ -20,7 +20,7 @@ Create a virtual environment with Python 3.12:
|
||||
conda create -y -n lerobot python=3.12
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="uv">
|
||||
<hfoption id="uv (PyTorch >= 2.10 only)">
|
||||
```bash
|
||||
uv python install 3.12
|
||||
uv venv --python 3.12
|
||||
@@ -32,48 +32,87 @@ uv venv --python 3.12
|
||||
Then activate your virtual environment; you have to do this each time you open a shell to use lerobot:
|
||||
|
||||
<!-- prettier-ignore-start -->
|
||||
|
||||
<hfoptions id="activate_venv">
|
||||
<hfoption id="conda">```bash
|
||||
<hfoption id="conda">
|
||||
```bash
|
||||
conda activate lerobot
|
||||
```</hfoption>
|
||||
<hfoption id="uv">
|
||||
```bash
|
||||
# Linux/macOS
|
||||
source .venv/bin/activate
|
||||
# Windows PowerShell
|
||||
source .venv\Scripts\Activate.ps1
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
<!-- prettier-ignore-end -->
|
||||
|
||||
When using `conda`, install `ffmpeg` in your environment:
|
||||
|
||||
```bash
|
||||
conda install ffmpeg -c conda-forge
|
||||
ffmpeg -version # ffmpeg 8.X is not yet supported !
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> This usually installs `ffmpeg 7.X` for your platform compiled with the `libsvtav1` encoder. If `libsvtav1` is not supported (check supported encoders with `ffmpeg -encoders`), you can:
|
||||
>
|
||||
> - _[On any platform]_ Explicitly install `ffmpeg 7.X` using:
|
||||
>
|
||||
> ```bash
|
||||
> conda install ffmpeg=7.1.1 -c conda-forge
|
||||
> ```
|
||||
>
|
||||
> - _[On Linux only]_ If you want to bring your own ffmpeg: Install [ffmpeg build dependencies](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#GettheDependencies) and [compile ffmpeg from source with libsvtav1](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#libsvtav1), and make sure you use the corresponding ffmpeg binary to your install with `which ffmpeg`.
|
||||
|
||||
> [!NOTE]
|
||||
> When installing LeRobot inside WSL (Windows Subsystem for Linux), make sure to install `evdev` with the following command:
|
||||
> When installing LeRobot inside WSL (Windows Subsystem for Linux), make sure to also install `evdev`:
|
||||
>
|
||||
> ```bash
|
||||
> conda install evdev -c conda-forge
|
||||
> ```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="uv (PyTorch >= 2.10 only)">
|
||||
```bash
|
||||
# Linux/macOS
|
||||
source .venv/bin/activate
|
||||
# Windows PowerShell
|
||||
.venv\Scripts\activate
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> When installing LeRobot inside WSL (Windows Subsystem for Linux), make sure to also install `evdev`:
|
||||
>
|
||||
> ```bash
|
||||
> sudo apt install libevdev-dev
|
||||
> uv pip install evdev
|
||||
> ```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
<!-- prettier-ignore-end -->
|
||||
|
||||
### Install `ffmpeg` (for video decoding)
|
||||
|
||||
LeRobot uses [TorchCodec](https://github.com/meta-pytorch/torchcodec) for video decoding by default, which requires `ffmpeg`.
|
||||
|
||||
> [!NOTE]
|
||||
> **Platform support:** TorchCodec is **not available** on macOS Intel (x86_64), Linux ARM (aarch64, arm64, armv7l), or Windows with PyTorch < 2.8. On these platforms, LeRobot automatically falls back to `pyav` — so you do not need to install `ffmpeg` and can skip to Step 3.
|
||||
|
||||
If your platform supports TorchCodec, install `ffmpeg` using one of the methods below:
|
||||
|
||||
<!-- prettier-ignore-start -->
|
||||
|
||||
<hfoptions id="install_ffmpeg">
|
||||
<hfoption id="conda (any PyTorch version)">
|
||||
|
||||
Install `ffmpeg` in your conda environment. This works with **all PyTorch versions** and is **required for PyTorch < 2.10**:
|
||||
|
||||
```bash
|
||||
conda install ffmpeg -c conda-forge
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> This usually installs `ffmpeg 8.X` with the `libsvtav1` encoder. If you run into issues (e.g. `libsvtav1` missing — check with `ffmpeg -encoders` — or a version mismatch with `torchcodec`), you can explicitly install `ffmpeg 7.1.1` using:
|
||||
>
|
||||
> ```bash
|
||||
> conda install ffmpeg=7.1.1 -c conda-forge
|
||||
> ```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="uv (PyTorch >= 2.10 only)">
|
||||
|
||||
Starting with **PyTorch >= 2.10** (TorchCodec ≥ 0.10), TorchCodec can dynamically link to a system-wide `ffmpeg` installation. This is useful when using `uv` or other non-`conda` environment managers:
|
||||
|
||||
```bash
|
||||
# Ubuntu/Debian
|
||||
sudo apt install ffmpeg
|
||||
|
||||
# macOS (Apple Silicon)
|
||||
brew install ffmpeg
|
||||
```
|
||||
|
||||
> [!IMPORTANT]
|
||||
> If you are using `uv`, you will have to install `ffmpeg` system-wide (outside of the virtual environment) and rely on `torchcodec`'s ability to dynamically link to the system `ffmpeg`.
|
||||
> System-wide `ffmpeg` is **only supported with PyTorch >= 2.10** (TorchCodec ≥ 0.10). For older PyTorch versions, you **must** use `conda install ffmpeg -c conda-forge` instead.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
<!-- prettier-ignore-end -->
|
||||
|
||||
## Step 3: Install LeRobot 🤗
|
||||
|
||||
|
||||
@@ -1,36 +1,61 @@
|
||||
# LIBERO
|
||||
|
||||
**LIBERO** is a benchmark designed to study **lifelong robot learning**. The idea is that robots won’t just be pretrained once in a factory, they’ll need to keep learning and adapting with their human users over time. This ongoing adaptation is called **lifelong learning in decision making (LLDM)**, and it’s a key step toward building robots that become truly personalized helpers.
|
||||
LIBERO is a benchmark designed to study **lifelong robot learning** — the idea that robots need to keep learning and adapting with their users over time, not just be pretrained once. It provides a set of standardized manipulation tasks that focus on **knowledge transfer**: how well a robot can apply what it has already learned to new situations. By evaluating on LIBERO, different algorithms can be compared fairly and researchers can build on each other's work.
|
||||
|
||||
- 📄 [LIBERO paper](https://arxiv.org/abs/2306.03310)
|
||||
- 💻 [Original LIBERO repo](https://github.com/Lifelong-Robot-Learning/LIBERO)
|
||||
|
||||
To make progress on this challenge, LIBERO provides a set of standardized tasks that focus on **knowledge transfer**: how well a robot can apply what it has already learned to new situations. By evaluating on LIBERO, different algorithms can be compared fairly and researchers can build on each other’s work.
|
||||
|
||||
LIBERO includes **five task suites**:
|
||||
|
||||
- **LIBERO-Spatial (`libero_spatial`)** – tasks that require reasoning about spatial relations.
|
||||
- **LIBERO-Object (`libero_object`)** – tasks centered on manipulating different objects.
|
||||
- **LIBERO-Goal (`libero_goal`)** – goal-conditioned tasks where the robot must adapt to changing targets.
|
||||
- **LIBERO-90 (`libero_90`)** – 90 short-horizon tasks from the LIBERO-100 collection.
|
||||
- **LIBERO-Long (`libero_10`)** – 10 long-horizon tasks from the LIBERO-100 collection.
|
||||
|
||||
Together, these suites cover **130 tasks**, ranging from simple object manipulations to complex multi-step scenarios. LIBERO is meant to grow over time, and to serve as a shared benchmark where the community can test and improve lifelong learning algorithms.
|
||||
- Paper: [Benchmarking Knowledge Transfer for Lifelong Robot Learning](https://arxiv.org/abs/2306.03310)
|
||||
- GitHub: [Lifelong-Robot-Learning/LIBERO](https://github.com/Lifelong-Robot-Learning/LIBERO)
|
||||
- Project website: [libero-project.github.io](https://libero-project.github.io)
|
||||
|
||||

|
||||
|
||||
## Evaluating with LIBERO
|
||||
## Available tasks
|
||||
|
||||
At **LeRobot**, we ported [LIBERO](https://github.com/Lifelong-Robot-Learning/LIBERO) into our framework and used it mainly to **evaluate [SmolVLA](https://huggingface.co/docs/lerobot/en/smolvla)**, our lightweight Vision-Language-Action model.
|
||||
LIBERO includes **five task suites** covering **130 tasks**, ranging from simple object manipulations to complex multi-step scenarios:
|
||||
|
||||
LIBERO is now part of our **multi-eval supported simulation**, meaning you can benchmark your policies either on a **single suite of tasks** or across **multiple suites at once** with just a flag.
|
||||
| Suite | CLI name | Tasks | Description |
|
||||
| -------------- | ---------------- | ----- | -------------------------------------------------- |
|
||||
| LIBERO-Spatial | `libero_spatial` | 10 | Tasks requiring reasoning about spatial relations |
|
||||
| LIBERO-Object | `libero_object` | 10 | Tasks centered on manipulating different objects |
|
||||
| LIBERO-Goal | `libero_goal` | 10 | Goal-conditioned tasks with changing targets |
|
||||
| LIBERO-90 | `libero_90` | 90 | Short-horizon tasks from the LIBERO-100 collection |
|
||||
| LIBERO-Long | `libero_10` | 10 | Long-horizon tasks from the LIBERO-100 collection |
|
||||
|
||||
To Install LIBERO, after following LeRobot official instructions, just do:
|
||||
`pip install -e ".[libero]"`
|
||||
## Installation
|
||||
|
||||
After following the LeRobot installation instructions:
|
||||
|
||||
```bash
|
||||
pip install -e ".[libero]"
|
||||
```
|
||||
|
||||
<Tip>
|
||||
LIBERO requires Linux (`sys_platform == 'linux'`). LeRobot uses MuJoCo for simulation — set the rendering backend before training or evaluation:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Default evaluation (recommended)
|
||||
|
||||
Evaluate across the four standard suites (10 episodes per task):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=libero \
|
||||
--env.task=libero_spatial,libero_object,libero_goal,libero_10 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--env.max_parallel_tasks=1
|
||||
```
|
||||
|
||||
### Single-suite evaluation
|
||||
|
||||
Evaluate a policy on one LIBERO suite:
|
||||
Evaluate on one LIBERO suite:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
@@ -42,15 +67,13 @@ lerobot-eval \
|
||||
```
|
||||
|
||||
- `--env.task` picks the suite (`libero_object`, `libero_spatial`, etc.).
|
||||
- `--env.task_ids` picks task ids to run (`[0]`, `[1,2,3]`, etc.). Omit this flag (or set it to `null`) to run all tasks in the suite.
|
||||
- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite.
|
||||
- `--eval.batch_size` controls how many environments run in parallel.
|
||||
- `--eval.n_episodes` sets how many episodes to run in total.
|
||||
|
||||
---
|
||||
- `--eval.n_episodes` sets how many episodes to run per task.
|
||||
|
||||
### Multi-suite evaluation
|
||||
|
||||
Benchmark a policy across multiple suites at once:
|
||||
Benchmark a policy across multiple suites at once by passing a comma-separated list:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
@@ -61,50 +84,49 @@ lerobot-eval \
|
||||
--eval.n_episodes=2
|
||||
```
|
||||
|
||||
- Pass a comma-separated list to `--env.task` for multi-suite evaluation.
|
||||
### Control mode
|
||||
|
||||
### Control Mode
|
||||
LIBERO supports two control modes — `relative` (default) and `absolute`. Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy:
|
||||
|
||||
LIBERO now supports two control modes: relative and absolute. This matters because different VLA checkpoints are trained with different mode of action to output hence control parameterizations.
|
||||
You can switch them with: `env.control_mode = "relative"` and `env.control_mode = "absolute"`
|
||||
```bash
|
||||
--env.control_mode=relative # or "absolute"
|
||||
```
|
||||
|
||||
### Policy inputs and outputs
|
||||
|
||||
When using LIBERO through LeRobot, policies interact with the environment via **observations** and **actions**:
|
||||
**Observations:**
|
||||
|
||||
- **Observations**
|
||||
- `observation.state` – proprioceptive features (agent state).
|
||||
- `observation.images.image` – main camera view (`agentview_image`).
|
||||
- `observation.images.image2` – wrist camera view (`robot0_eye_in_hand_image`).
|
||||
- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos)
|
||||
- `observation.images.image` — main camera view (`agentview_image`), HWC uint8
|
||||
- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8
|
||||
|
||||
⚠️ **Note:** LeRobot enforces the `.images.*` prefix for any multi-modal visual features. Always ensure that your policy config `input_features` use the same naming keys, and that your dataset metadata keys follow this convention during evaluation.
|
||||
If your data contains different keys, you must rename the observations to match what the policy expects, since naming keys are encoded inside the normalization statistics layer.
|
||||
This will be fixed with the upcoming Pipeline PR.
|
||||
<Tip warning={true}>
|
||||
LeRobot enforces the `.images.*` prefix for visual features. Ensure your
|
||||
policy config `input_features` use the same naming keys, and that your dataset
|
||||
metadata keys follow this convention. If your data contains different keys,
|
||||
you must rename the observations to match what the policy expects, since
|
||||
naming keys are encoded inside the normalization statistics layer.
|
||||
</Tip>
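For example, a minimal renaming sketch — the source key names below are assumptions, substitute the keys your dataset actually uses:

```python
KEY_MAP = {
    "observation.images.agentview": "observation.images.image",
    "observation.images.wrist": "observation.images.image2",
}


def rename_observation_keys(observation: dict) -> dict:
    # Map dataset-specific keys to the names the policy's input_features expect.
    return {KEY_MAP.get(key, key): value for key, value in observation.items()}
```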
|
||||
|
||||
- **Actions**
|
||||
- Continuous control values in a `Box(-1, 1, shape=(7,))` space.
|
||||
**Actions:**
|
||||
|
||||
We also provide a notebook for quick testing:
|
||||
Training with LIBERO
|
||||
- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper
|
||||
|
||||
## Training with LIBERO
|
||||
### Recommended evaluation episodes
|
||||
|
||||
When training on LIBERO tasks, make sure your dataset parquet and metadata keys follow the LeRobot convention.
|
||||
For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results.
|
||||
|
||||
The environment expects:
|
||||
## Training
|
||||
|
||||
- `observation.state` → 8-dim agent state
|
||||
- `observation.images.image` → main camera (`agentview_image`)
|
||||
- `observation.images.image2` → wrist camera (`robot0_eye_in_hand_image`)
|
||||
### Dataset
|
||||
|
||||
⚠️ Cleaning the dataset upfront is **cleaner and more efficient** than remapping keys inside the code.
|
||||
To avoid potential mismatches and key errors, we provide a **preprocessed LIBERO dataset** that is fully compatible with the current LeRobot codebase and requires no additional manipulation:
|
||||
👉 [HuggingFaceVLA/libero](https://huggingface.co/datasets/HuggingFaceVLA/libero)
|
||||
We provide a preprocessed LIBERO dataset fully compatible with LeRobot:
|
||||
|
||||
For reference, here is the **original dataset** published by Physical Intelligence:
|
||||
👉 [physical-intelligence/libero](https://huggingface.co/datasets/physical-intelligence/libero)
|
||||
- [HuggingFaceVLA/libero](https://huggingface.co/datasets/HuggingFaceVLA/libero)
|
||||
|
||||
---
|
||||
For reference, the original dataset published by Physical Intelligence:
|
||||
|
||||
- [physical-intelligence/libero](https://huggingface.co/datasets/physical-intelligence/libero)
|
||||
|
||||
### Example training command
|
||||
|
||||
@@ -121,52 +143,39 @@ lerobot-train \
|
||||
--batch_size=4 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval_freq=1000 \
|
||||
--eval_freq=1000
|
||||
```
|
||||
|
||||
---
|
||||
## Reproducing published results
|
||||
|
||||
### Note on rendering
|
||||
We reproduce the results of Pi0.5 on the LIBERO benchmark. We take the Physical Intelligence LIBERO base model (`pi05_libero`) and finetune for an additional 6k steps in bfloat16, with batch size of 256 on 8 H100 GPUs using the [HuggingFace LIBERO dataset](https://huggingface.co/datasets/HuggingFaceVLA/libero).
|
||||
|
||||
LeRobot uses MuJoCo for simulation. You need to set the rendering backend before training or evaluation:
|
||||
The finetuned model: [lerobot/pi05_libero_finetuned](https://huggingface.co/lerobot/pi05_libero_finetuned)
|
||||
|
||||
- `export MUJOCO_GL=egl` → for headless servers (e.g. HPC, cloud)
|
||||
|
||||
## Reproducing π₀.₅ results
|
||||
|
||||
We reproduce the results of π₀.₅ on the LIBERO benchmark using the LeRobot implementation. We take the Physical Intelligence LIBERO base model (`pi05_libero`) and finetune for an additional 6k steps in bfloat16, with batch size of 256 on 8 H100 GPUs using the [HuggingFace LIBERO dataset](https://huggingface.co/datasets/HuggingFaceVLA/libero).
|
||||
|
||||
The finetuned model can be found here:
|
||||
|
||||
- **π₀.₅ LIBERO**: [lerobot/pi05_libero_finetuned](https://huggingface.co/lerobot/pi05_libero_finetuned)
|
||||
|
||||
We then evaluate the finetuned model using the LeRobot LIBERO implementation, by running the following command:
|
||||
### Evaluation command
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--output_dir=/logs/ \
|
||||
--output_dir=./eval_logs/ \
|
||||
--env.type=libero \
|
||||
--env.task=libero_spatial,libero_object,libero_goal,libero_10 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--policy.path=pi05_libero_finetuned \
|
||||
--policy.n_action_steps=10 \
|
||||
--output_dir=./eval_logs/ \
|
||||
--env.max_parallel_tasks=1
|
||||
```
|
||||
|
||||
**Note:** We set `n_action_steps=10`, similar to the original OpenPI implementation.
|
||||
We set `n_action_steps=10`, matching the original OpenPI implementation.
|
||||
|
||||
### Results
|
||||
|
||||
We obtain the following results on the LIBERO benchmark:
|
||||
| Model | LIBERO Spatial | LIBERO Object | LIBERO Goal | LIBERO 10 | Average |
|
||||
| ------------------- | -------------- | ------------- | ----------- | --------- | -------- |
|
||||
| **Pi0.5 (LeRobot)** | 97.0 | 99.0 | 98.0 | 96.0 | **97.5** |
|
||||
|
||||
| Model | LIBERO Spatial | LIBERO Object | LIBERO Goal | LIBERO 10 | Average |
|
||||
| -------- | -------------- | ------------- | ----------- | --------- | -------- |
|
||||
| **π₀.₅** | 97.0 | 99.0 | 98.0 | 96.0 | **97.5** |
|
||||
These results are consistent with the [original results](https://github.com/Physical-Intelligence/openpi/tree/main/examples/libero#results) reported by Physical Intelligence:
|
||||
|
||||
These results are consistent with the original [results](https://github.com/Physical-Intelligence/openpi/tree/main/examples/libero#results) reported by Physical Intelligence:
|
||||
|
||||
| Model | LIBERO Spatial | LIBERO Object | LIBERO Goal | LIBERO 10 | Average |
|
||||
| -------- | -------------- | ------------- | ----------- | --------- | --------- |
|
||||
| **π₀.₅** | 98.8 | 98.2 | 98.0 | 92.4 | **96.85** |
|
||||
| Model | LIBERO Spatial | LIBERO Object | LIBERO Goal | LIBERO 10 | Average |
|
||||
| ------------------ | -------------- | ------------- | ----------- | --------- | --------- |
|
||||
| **Pi0.5 (OpenPI)** | 98.8 | 98.2 | 98.0 | 92.4 | **96.85** |
|
||||
|
||||
@@ -1,32 +1,111 @@
|
||||
# Meta-World
|
||||
|
||||
Meta-World is a well-designed, open-source simulation benchmark for multi-task and meta reinforcement learning in continuous-control robotic manipulation. It gives researchers a shared, realistic playground to test whether algorithms can _learn many different tasks_ and _generalize quickly to new ones_ — two central challenges for real-world robotics.
|
||||
Meta-World is an open-source simulation benchmark for **multi-task and meta reinforcement learning** in continuous-control robotic manipulation. It bundles 50 diverse manipulation tasks using everyday objects and a common tabletop Sawyer arm, providing a standardized playground to test whether algorithms can learn many different tasks and generalize quickly to new ones.
|
||||
|
||||
- 📄 [MetaWorld paper](https://arxiv.org/pdf/1910.10897)
|
||||
- 💻 [Original MetaWorld repo](https://github.com/Farama-Foundation/Metaworld)
|
||||
- Paper: [Meta-World: A Benchmark and Evaluation for Multi-Task and Meta Reinforcement Learning](https://arxiv.org/abs/1910.10897)
|
||||
- GitHub: [Farama-Foundation/Metaworld](https://github.com/Farama-Foundation/Metaworld)
|
||||
- Project website: [metaworld.farama.org](https://metaworld.farama.org)
|
||||
|
||||

|
||||
|
||||
## Why Meta-World matters
|
||||
## Available tasks
|
||||
|
||||
- **Diverse, realistic tasks.** Meta-World bundles a large suite of simulated manipulation tasks (50 in the MT50 suite) using everyday objects and a common tabletop Sawyer arm. This diversity exposes algorithms to a wide variety of dynamics, contacts and goal specifications while keeping a consistent control and observation structure.
|
||||
- **Focus on generalization and multi-task learning.** By evaluating across task distributions that share structure but differ in goals and objects, Meta-World reveals whether an agent truly learns transferable skills rather than overfitting to a narrow task.
|
||||
- **Standardized evaluation protocol.** It provides clear evaluation modes and difficulty splits, so different methods can be compared fairly across easy, medium, hard and very-hard regimes.
|
||||
- **Empirical insight.** Past evaluations on Meta-World show impressive progress on some fronts, but also highlight that current multi-task and meta-RL methods still struggle with large, diverse task sets. That gap points to important research directions.
|
||||
Meta-World provides 50 tasks organized into difficulty groups. In LeRobot, you can evaluate on individual tasks, difficulty groups, or the full MT50 suite:
|
||||
|
||||
## What it enables in LeRobot
|
||||
| Group | CLI name | Tasks | Description |
|
||||
| ---------- | -------------------- | ----- | ------------------------------------------------------ |
|
||||
| Easy | `easy` | 28 | Tasks with simple dynamics and single-step goals |
|
||||
| Medium | `medium` | 11 | Tasks requiring multi-step reasoning |
|
||||
| Hard | `hard` | 6 | Tasks with complex contacts and precise manipulation |
|
||||
| Very Hard | `very_hard` | 5 | The most challenging tasks in the suite |
|
||||
| MT50 (all) | Comma-separated list | 50 | All 50 tasks — the most challenging multi-task setting |
|
||||
|
||||
In LeRobot, you can evaluate any policy or vision-language-action (VLA) model on Meta-World tasks and get a clear success-rate measure. The integration is designed to be straightforward:
|
||||
You can also pass individual task names directly (e.g., `assembly-v3`, `dial-turn-v3`).
|
||||
|
||||
- We provide a LeRobot-ready dataset for Meta-World (MT50) on the HF Hub: `https://huggingface.co/datasets/lerobot/metaworld_mt50`.
|
||||
- This dataset is formatted for the MT50 evaluation that uses all 50 tasks (the most challenging multi-task setting).
|
||||
- MT50 gives the policy a one-hot task vector and uses fixed object/goal positions for consistency.
|
||||
We provide a LeRobot-ready dataset for Meta-World MT50 on the HF Hub: [lerobot/metaworld_mt50](https://huggingface.co/datasets/lerobot/metaworld_mt50). This dataset is formatted for the MT50 evaluation that uses all 50 tasks with fixed object/goal positions and one-hot task vectors for consistency.
|
||||
|
||||
- Task descriptions and the exact keys required for evaluation are available in the repo/dataset — use these to ensure your policy outputs the right success signals.
|
||||
## Installation
|
||||
|
||||
## Quick start, train a SmolVLA policy on Meta-World
|
||||
After following the LeRobot installation instructions:
|
||||
|
||||
Example command to train a SmolVLA policy on a subset of tasks:
|
||||
```bash
|
||||
pip install -e ".[metaworld]"
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
If you encounter an `AssertionError: ['human', 'rgb_array', 'depth_array']` when running Meta-World environments, this is a mismatch between Meta-World and your Gymnasium version. Fix it with:
|
||||
|
||||
```bash
|
||||
pip install "gymnasium==1.1.0"
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Default evaluation (recommended)
|
||||
|
||||
Evaluate on the medium difficulty split (a good balance of coverage and compute):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=metaworld \
|
||||
--env.task=medium \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
### Single-task evaluation
|
||||
|
||||
Evaluate on a specific task:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=metaworld \
|
||||
--env.task=assembly-v3 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
### Multi-task evaluation
|
||||
|
||||
Evaluate across multiple tasks or difficulty groups:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=metaworld \
|
||||
--env.task=assembly-v3,dial-turn-v3,handle-press-side-v3 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
- `--env.task` accepts explicit task lists (comma-separated) or difficulty groups (e.g., `easy`, `medium`, `hard`, `very_hard`).
|
||||
- `--eval.batch_size` controls how many environments run in parallel.
|
||||
- `--eval.n_episodes` sets how many episodes to run per task.
|
||||
|
||||
### Policy inputs and outputs
|
||||
|
||||
**Observations:**
|
||||
|
||||
- `observation.image` — single camera view (`corner2`), 480x480 HWC uint8
|
||||
- `observation.state` — 4-dim proprioceptive state (end-effector position + gripper)
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(4,))` — 3D end-effector delta + 1D gripper
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
For reproducible benchmarking, use **10 episodes per task**. For the full MT50 suite this gives 500 total episodes. If you care about generalization, run on the full MT50 — it is intentionally challenging and reveals strengths/weaknesses better than a few narrow tasks.
|
||||
|
||||
## Training
|
||||
|
||||
### Example training command
|
||||
|
||||
Train a SmolVLA policy on a subset of Meta-World tasks:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
@@ -44,37 +123,8 @@ lerobot-train \
|
||||
--eval_freq=1000
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- `--env.task` accepts explicit task lists (comma separated) or difficulty groups (e.g., `env.task="hard"`).
|
||||
- Adjust `batch_size`, `steps`, and `eval_freq` to match your compute budget.
|
||||
- **Gymnasium Assertion Error**: if you encounter an error like
|
||||
`AssertionError: ['human', 'rgb_array', 'depth_array']` when running MetaWorld environments, this comes from a mismatch between MetaWorld and your Gymnasium version.
|
||||
We recommend using:
|
||||
|
||||
```bash
|
||||
pip install "gymnasium==1.1.0"
|
||||
```
|
||||
|
||||
to ensure proper compatibility.
|
||||
|
||||
## Quick start — evaluate a trained policy
|
||||
|
||||
To evaluate a trained policy on the Meta-World medium difficulty split:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=metaworld \
|
||||
--env.task=medium \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=2
|
||||
```
|
||||
|
||||
This will run episodes and return per-task success rates using the standard Meta-World evaluation keys.
|
||||
|
||||
## Practical tips
|
||||
|
||||
- If you care about generalization, run on the full MT50 suite — it’s intentionally challenging and reveals strengths/weaknesses better than a few narrow tasks.
|
||||
- Use the one-hot task conditioning for multi-task training (MT10 / MT50 conventions) so policies have explicit task context.
|
||||
- Use the one-hot task conditioning for multi-task training (MT10/MT50 conventions) so policies have explicit task context.
|
||||
- Inspect the dataset task descriptions and the `info["is_success"]` keys when writing post-processing or logging so your success metrics line up with the benchmark.
|
||||
- Adjust `batch_size`, `steps`, and `eval_freq` to match your compute budget.
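As a rough illustration of the one-hot task conditioning mentioned above — the task index and state layout are assumptions, so follow the MT50 conventions of your dataset:

```python
import numpy as np

NUM_TASKS = 50  # MT50


def add_task_conditioning(state: np.ndarray, task_index: int) -> np.ndarray:
    # Append a one-hot task vector to the proprioceptive state.
    one_hot = np.zeros(NUM_TASKS, dtype=np.float32)
    one_hot[task_index] = 1.0
    return np.concatenate([state, one_hot])
```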
|
||||
|
||||
@@ -0,0 +1,388 @@
|
||||
# Multitask DiT Policy
|
||||
|
||||
Multitask Diffusion Transformer (DiT) Policy is an evolution of the original Diffusion Policy architecture, which leverages a large DiT with text and vision conditioning for multitask robot learning. This implementation supports both diffusion and flow matching objectives for action generation, enabling robots to perform diverse manipulation tasks conditioned on language instructions.
|
||||
|
||||
## Model Overview
|
||||
|
||||
The model uses:
|
||||
|
||||
- **CLIP Vision Encoder**: Processes RGB images from multiple camera views
|
||||
- **CLIP Text Encoder**: Encodes language task instructions (frozen weights with learnable projection)
|
||||
- **Diffusion Transformer**: Predicts action sequences conditioned on observations and language
|
||||
- **Two Objectives**: Supports both diffusion (DDPM/DDIM) and flow matching for action generation
|
||||
|
||||
This model is exciting because you can achieve extremely high dexterity, competitive with multi-billion parameter
|
||||
VLAs, with only ~450M parameters and significantly less training.
|
||||
|
||||
## Installation Requirements
|
||||
|
||||
Multitask DiT Policy has additional dependencies. Install it with:
|
||||
|
||||
```bash
|
||||
pip install lerobot[multi_task_dit]
|
||||
```
|
||||
|
||||
This will install all necessary dependencies including the HuggingFace Transformers library for CLIP models.
|
||||
|
||||
## Usage
|
||||
|
||||
To use Multitask DiT in your LeRobot configuration, specify the policy type as:
|
||||
|
||||
```python
|
||||
policy.type=multi_task_dit
|
||||
```
|
||||
|
||||
## Training
|
||||
|
||||
### Basic Training Command
|
||||
|
||||
Here's a complete training command for training Multitask DiT on your dataset:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=YOUR_DATASET \
|
||||
--output_dir=./outputs/multitask_dit_training \
|
||||
--batch_size=32 \
|
||||
--steps=5000 \
|
||||
--save_freq=500 \
|
||||
--log_freq=100 \
|
||||
--policy.type=multi_task_dit \
|
||||
--policy.device=cuda \
|
||||
--policy.repo_id="HF_USER/multitask-dit-your-robot" \
|
||||
--wandb.enable=true
|
||||
```
|
||||
|
||||
### Recommended Hyperparameters and Dataset Details (30Hz Control Frequency)
|
||||
|
||||
For reliable performance, start with these suggested default hyperparameters:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=YOUR_DATASET \
|
||||
--output_dir=./outputs/multitask_dit_training \
|
||||
--batch_size=320 \
|
||||
--steps=30000 \
|
||||
--policy.type=multi_task_dit \
|
||||
--policy.device=cuda \
|
||||
--policy.horizon=32 \
|
||||
--policy.n_action_steps=24 \
|
||||
--policy.objective=diffusion \
|
||||
--policy.noise_scheduler_type=DDPM \
|
||||
--policy.num_train_timesteps=100 \
|
||||
--policy.repo_id="HF_USER/multitask-dit-your-robot" \
|
||||
--wandb.enable=true
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
|
||||
- **Batch Size**: 192-320 - If you have access to a GPU that can support this, you will get the best training dynamics
|
||||
- **Horizon**: 32 - number of action steps to predict, ~1.0 sec at 30Hz
|
||||
- **n_action_steps**: 24 - ~0.8 seconds at 30Hz
|
||||
- **Objective**: `diffusion` - start with diffusion and experiment with flow matching if generation quality is poor
|
||||
- **Training Steps**: >30k steps recommended for a single task
|
||||
|
||||
### Training Configuration Parameters
|
||||
|
||||
#### Objective Selection
|
||||
|
||||
Choose between diffusion and flow matching:
|
||||
|
||||
```bash
|
||||
# Diffusion objective (default)
|
||||
--policy.objective=diffusion \
|
||||
--policy.noise_scheduler_type=DDPM \ # or "DDIM"
|
||||
--policy.num_train_timesteps=100 \
|
||||
--policy.num_inference_steps=10 \ # For faster inference
|
||||
--policy.beta_schedule=squaredcos_cap_v2 \ # Noise schedule type
|
||||
--policy.prediction_type=epsilon \ # "epsilon" (predict noise) or "sample" (predict clean)
|
||||
--policy.clip_sample=true \ # Clip samples during denoising
|
||||
--policy.clip_sample_range=1.0 # Clipping range [-x, x]
|
||||
|
||||
# Flow matching objective
|
||||
--policy.objective=flow_matching \
|
||||
--policy.timestep_sampling_strategy=beta \ # or "uniform" | the beta sampling strategy performance appears much better in practice
|
||||
--policy.num_integration_steps=100 \
|
||||
--policy.integration_method=euler \ # or "rk4"
|
||||
--policy.sigma_min=0.0 # Minimum noise in flow interpolation path
|
||||
```
|
||||
|
||||
#### Transformer Architecture
|
||||
|
||||
Adjust model capacity based on dataset size:
|
||||
|
||||
```bash
|
||||
# Small datasets (< 100 examples)
|
||||
--policy.num_layers=4 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.num_heads=8 # should ideally be hidden_dim // 64
|
||||
|
||||
# Medium datasets (100-5k examples) - default
|
||||
--policy.num_layers=6 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.num_heads=8 # should ideally be hidden_dim // 64
|
||||
|
||||
# Large datasets (> 5k examples)
|
||||
--policy.num_layers=8 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.num_heads=8 # should ideally be hidden_dim // 64
|
||||
```
|
||||
|
||||
**Positional Encoding Options:**
|
||||
|
||||
The model supports two positional encoding methods for action sequences:
|
||||
|
||||
```bash
|
||||
# Rotary Position Embedding (RoPE) - default, recommended
|
||||
--policy.use_rope=true \
|
||||
--policy.rope_base=10000.0 # Base frequency for RoPE
|
||||
|
||||
# Absolute positional encoding
|
||||
--policy.use_positional_encoding=true # Disables RoPE when true
|
||||
```
|
||||
|
||||
**Other Transformer Parameters:**
|
||||
|
||||
```bash
|
||||
--policy.dropout=0.1 # Dropout rate for DiT blocks (0.0-1.0)
|
||||
--policy.timestep_embed_dim=256 # Timestep embedding dimension
|
||||
```
|
||||
|
||||
#### Vision Encoder Configuration
|
||||
|
||||
```bash
|
||||
# Use different CLIP model for more expressivity at the cost of inference time
|
||||
# experiment with larger or smaller models depending on the complexity of your tasks and size of dataset
|
||||
--policy.vision_encoder_name=openai/clip-vit-large-patch14
|
||||
|
||||
# Use separate vision encoder per camera
|
||||
# This may be useful when cameras have significantly different characteristics, but
|
||||
# be wary of increased VRAM footprint.
|
||||
--policy.use_separate_rgb_encoder_per_camera=true
|
||||
|
||||
# Image preprocessing
|
||||
--policy.image_resize_shape=[XXX,YYY] \  # you may need to resize your images for inference speed-ups
|
||||
--policy.image_crop_shape=[224,224] \
|
||||
--policy.image_crop_is_random=true # Random during training, center at inference
|
||||
```
|
||||
|
||||
#### Text Encoder Configuration
|
||||
|
||||
```bash
|
||||
# Use a different CLIP text encoder model
|
||||
# same as vision: experiment with larger or smaller models depending on the
|
||||
# complexity of your tasks and size of dataset
|
||||
--policy.text_encoder_name=openai/clip-vit-large-patch14
|
||||
```
|
||||
|
||||
#### Learning Rate Configuration
|
||||
|
||||
The vision encoder uses a separate learning rate multiplier; 1/10th of the base learning rate is the suggested starting point:
|
||||
|
||||
```bash
|
||||
--policy.optimizer_lr=2e-5 \
|
||||
--policy.vision_encoder_lr_multiplier=0.1 # Vision encoder LR = 0.1 * optimizer_lr
|
||||
```
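
For intuition, here is a minimal PyTorch sketch of what a two-group optimizer with a vision-encoder multiplier of 0.1 looks like. The module names (`vision_encoder`, `transformer`) are stand-ins for illustration, not the actual attributes used by the implementation:

```python
import torch
from torch import nn


# Stand-in modules; the real policy exposes its own submodules.
class TinyPolicy(nn.Module):
    def __init__(self):
        super().__init__()
        self.vision_encoder = nn.Linear(8, 8)
        self.transformer = nn.Linear(8, 8)


policy = TinyPolicy()
base_lr = 2e-5
param_groups = [
    {"params": policy.transformer.parameters(), "lr": base_lr},
    {"params": policy.vision_encoder.parameters(), "lr": base_lr * 0.1},  # vision_encoder_lr_multiplier
]
optimizer = torch.optim.AdamW(param_groups)
```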
|
||||
|
||||
### Training Tuning Guidelines
|
||||
|
||||
#### 1. Flow Matching with Beta Sampling
|
||||
|
||||
The original diffusion implementation here is based on the work described in [TRI's LBM paper](https://arxiv.org/abs/2507.05331).
|
||||
|
||||
Additionally, we have implemented a flow-matching objective, which is described at a high level in this [Boston Dynamics blog post](https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/).
|
||||
|
||||
Consider testing the flow-matching objective and evaluating performance differences for your task:
|
||||
|
||||
```bash
|
||||
--policy.objective=flow_matching \
|
||||
--policy.timestep_sampling_strategy=beta \
|
||||
--policy.timestep_sampling_alpha=1.5 \
|
||||
--policy.timestep_sampling_beta=1.0 \
|
||||
--policy.timestep_sampling_s=0.999
|
||||
```
|
||||
|
||||
This hasn't been shown to be a silver bullet across every use case, but it occasionally results in smoother and more consistent actions.
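
For intuition, a Beta(1.5, 1.0) draw concentrates timesteps toward 1.0 compared with a uniform draw, which is the behavior the `timestep_sampling_*` flags control. The sketch below is purely illustrative; the exact mapping of `alpha`, `beta`, and `s` to the training timestep is defined by the implementation, not by this snippet:

```python
import numpy as np

rng = np.random.default_rng(0)
alpha, beta, s = 1.5, 1.0, 0.999

uniform_t = rng.uniform(0.0, 1.0, size=100_000)
beta_t = s * rng.beta(alpha, beta, size=100_000)  # illustrative scaling by s

# Beta(1.5, 1.0) puts more probability mass near 1.0 than the uniform draw.
print(f"uniform mean: {uniform_t.mean():.3f}  beta mean: {beta_t.mean():.3f}")
```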
|
||||
|
||||
#### 2. Number of Transformer Layers
|
||||
|
||||
Match model capacity to your dataset size:
|
||||
|
||||
- **Small datasets** (< 100 examples): Reduce to 4 layers
|
||||
- **Large datasets** (> 5k examples): Increase to 8 layers
|
||||
|
||||
#### 3. `horizon` Tuning
|
||||
|
||||
The model can be sensitive to the horizon you choose. Start with around a 1 second horizon based on your control frequency:
|
||||
|
||||
- **30 Hz frequency**: `horizon=30`
|
||||
- **10 Hz frequency**: `horizon=10`
|
||||
|
||||
Then experiment with increasing from there. The horizon determines how far into the future the model predicts actions.
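
As a quick rule of thumb, both the horizon and `n_action_steps` (covered in the next subsection) can be derived from your control frequency. A minimal sketch, using the ~1.0 s and ~0.8 s targets suggested above:

```python
control_hz = 30  # your robot's control frequency

horizon = round(1.0 * control_hz)         # ~1.0 s of predicted actions -> 30
n_action_steps = round(0.8 * control_hz)  # ~0.8 s executed per chunk   -> 24

print(f"horizon={horizon}, n_action_steps={n_action_steps}")
```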
|
||||
|
||||
#### 4. `n_action_steps` Sensitivity
|
||||
|
||||
The model can also be very sensitive to `n_action_steps`. Start with roughly 0.8 seconds' worth of actions based on your control frequency and tune from there:
|
||||
|
||||
- **Lower values**: More reactive but potentially less stable for long-horizon tasks
|
||||
- **Higher values**: Better for long-horizon execution, but recovery from open-loop failures is limited
|
||||
|
||||
### Inference Tuning
|
||||
|
||||
For faster inference, use DDIM with fewer sampling steps:
|
||||
|
||||
```bash
|
||||
--policy.noise_scheduler_type=DDIM \
|
||||
--policy.num_inference_steps=10
|
||||
```
|
||||
|
||||
### Resuming Training
|
||||
|
||||
To resume training from a checkpoint:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
    --config_path=./outputs/multitask_dit_training/checkpoints/last/pretrained_model/train_config.json \
|
||||
--resume=true
|
||||
```
|
||||
|
||||
The checkpoint directory should contain `model.safetensors` and `config.json` files (saved automatically during training). When resuming, the configuration is loaded from the checkpoint, so you don't need to specify other parameters.
|
||||
|
||||
## Common Failure Modes and Debugging
|
||||
|
||||
Training these models can be finicky. Here are common failure modes and debugging approaches:
|
||||
|
||||
### Idling / No Motion
|
||||
|
||||
The model may "collapse" during inference, resulting in static or no motion. This can occur when:
|
||||
|
||||
1. **Insufficient training data**: If you only have 20-50 examples, try to roughly double your dataset size. Once you have above 300 examples, if you're still seeing this, the task may be too complex.
|
||||
|
||||
2. **Multiple similar tasks**: When your dataset contains multiple similar tasks (e.g., picking up 2 different objects), the model may rely too heavily on language conditioning which might not be rich enough.
|
||||
|
||||
**Debugging tips:**
|
||||
|
||||
- Increase dataset size (double until you get to over 300 examples)
|
||||
- Train for longer, up to 100k steps, even when the loss flatlines
|
||||
- Check that the model is receiving the correct language instructions, or increase the diversity of instructions
|
||||
|
||||
### Executing the Wrong Task
|
||||
|
||||
Sometimes the robot will completely ignore your instruction and perform some other task. This generally only happens if you have trained on multiple tasks.
|
||||
|
||||
**Potential causes:**
|
||||
|
||||
- Language instruction ambiguity
|
||||
- Insufficient task-specific training data
|
||||
- Model confusion between similar tasks in the multitask dataset
|
||||
|
||||
**Debugging tips:**
|
||||
|
||||
- Verify language instruction specificity, especially if descriptions are similar between multiple tasks
|
||||
- Check task distribution in your training dataset and add weighting to the failing/ignored task
|
||||
- Consider task-specific fine-tuning
|
||||
|
||||
### Training Instability
|
||||
|
||||
If training loss is unstable or diverging:
|
||||
|
||||
- Try adjusting learning rate between `1e-5` and `3e-4`
|
||||
- Increase batch size if possible
|
||||
- Check that your dataset normalization is correct (see the sketch after this list)
|
||||
- Verify image preprocessing is working correctly
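
A quick way to sanity-check normalization is to print the per-feature statistics the training pipeline normalizes with. This is a hedged sketch: `your_dataset` is a placeholder repo id, and the exact keys present in `dataset.meta.stats` depend on your dataset:

```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset("your_dataset")

# Look for zero std or wildly out-of-range min/max in the action/state features.
for key, stats in dataset.meta.stats.items():
    print(key, {name: value for name, value in stats.items() if name in ("mean", "std", "min", "max")})
```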
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### GPU Requirements
|
||||
|
||||
- **Inference**: At least an RTX 5070 Ti (or equivalent GPU) is recommended for reasonable inference speed
|
||||
- **Training**: A GPU with enough VRAM to fit batch sizes of >64 is ideal; the exact requirement varies with the number of image observations, etc.
|
||||
|
||||
### Batch Size Recommendations
|
||||
|
||||
- **Minimum**: 64 (less than this may result in unstable training)
|
||||
- **Recommended**: 256-320 (best performance, requires larger GPU)
|
||||
|
||||
## Example: Training on Custom Dataset
|
||||
|
||||
Here's a complete example of training on a custom dataset:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=YOUR_DATASET \
|
||||
    --output_dir=./outputs/multitask_dit_training \
|
||||
--batch_size=320 \
|
||||
--steps=30000 \
|
||||
--save_freq=1000 \
|
||||
--log_freq=100 \
|
||||
--eval_freq=1000 \
|
||||
--policy.type=multi_task_dit \
|
||||
--policy.device=cuda \
|
||||
--policy.horizon=32 \
|
||||
--policy.n_action_steps=24 \
|
||||
--policy.objective=diffusion \
|
||||
--policy.noise_scheduler_type=DDPM \
|
||||
--policy.num_layers=6 \
|
||||
--policy.hidden_dim=512 \
|
||||
--policy.vision_encoder_name=openai/clip-vit-base-patch16 \
|
||||
--policy.image_resize_shape=[320,240] \
|
||||
--policy.image_crop_shape=[224,224] \
|
||||
--policy.repo_id="HF_USER/multitask-dit-your-robot" \
|
||||
--wandb.enable=true \
|
||||
--wandb.project=multitask_dit
|
||||
```
|
||||
|
||||
## Libero Results
|
||||
|
||||
```bash
|
||||
python -m lerobot.scripts.lerobot_train \
|
||||
--dataset.repo_id=HuggingFaceVLA/libero \
|
||||
--policy.type=multi_task_dit \
|
||||
--policy.push_to_hub=false \
|
||||
--output_dir="./outputs/multitask_dit_libero" \
|
||||
--job_name="multitask-dit-libero" \
|
||||
--wandb.enable=true \
|
||||
--wandb.project=multitask_dit_libero \
|
||||
--dataset.image_transforms.enable=true \
|
||||
--dataset.image_transforms.max_num_transforms=4 \
|
||||
--dataset.image_transforms.tfs='{"brightness":{"type":"ColorJitter","kwargs":{"brightness":[0.75,1.25]}},"contrast":{"type":"ColorJitter","kwargs":{"contrast":[0.6,1.4]}},"saturation":{"type":"ColorJitter","kwargs":{"saturation":[0.8,1.2]}},"hue":{"type":"ColorJitter","kwargs":{"hue":[-0.05,0.05]}},"sharpness":{"type":"SharpnessJitter","kwargs":{"sharpness":[0.6,1.4]}},"rotation":{"type":"RandomRotation","kwargs":{"degrees":[-5,5]}},"translation":{"type":"RandomAffine","kwargs":{"degrees":0,"translate":[0.1,0.1]}}}' \
|
||||
--dataset.video_backend=torchcodec \
|
||||
--policy.use_amp=true \
|
||||
--policy.horizon=48 \
|
||||
--policy.n_obs_steps=2 \
|
||||
--policy.use_rope=true \
|
||||
--policy.use_positional_encoding=false \
|
||||
--policy.hidden_dim=768 \
|
||||
--policy.num_layers=8 \
|
||||
--policy.num_heads=12 \
|
||||
--policy.dropout=0.1 \
|
||||
--policy.timestep_embed_dim=256 \
|
||||
--policy.objective=diffusion \
|
||||
--policy.optimizer_lr=3e-4 \
|
||||
--policy.optimizer_weight_decay=0 \
|
||||
--policy.scheduler_warmup_steps=0 \
|
||||
--policy.vision_encoder_name=openai/clip-vit-base-patch16 \
|
||||
--policy.image_resize_shape=[256,256] \
|
||||
--policy.image_crop_is_random=true \
|
||||
--policy.text_encoder_name=openai/clip-vit-base-patch16 \
|
||||
--policy.vision_encoder_lr_multiplier=0.1 \
|
||||
--policy.device=cuda \
|
||||
--num_workers=8 \
|
||||
--save_freq=4000 \
|
||||
--log_freq=100 \
|
||||
--steps=100000 \
|
||||
--batch_size=320
|
||||
```
|
||||
|
||||
Results:
|
||||
|
||||
| LIBERO Spatial | LIBERO Object | LIBERO Goal | LIBERO 10 | Average |
|
||||
| -------------- | ------------- | ----------- | --------- | ------- |
|
||||
| 87.0 | 98.2 | 93.8 | 83.2 | 90.6 |
|
||||
|
||||
## References
|
||||
|
||||
For more details on the technical implementation and architecture, see:
|
||||
|
||||
- [A Careful Examination of Large Behavior Models for Multitask Dexterous Manipulation](https://arxiv.org/abs/2507.05331)
|
||||
- [Large Behavior Models and Atlas Find New Footing](https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/)
|
||||
- [Dissecting and Open-Sourcing Multitask Diffusion Transformer Policy](https://brysonkjones.substack.com/p/dissecting-and-open-sourcing-multitask-diffusion-transformer-policy)
|
||||
@@ -91,6 +91,46 @@ lerobot-train \
|
||||
|
||||
**💡 Tip**: Setting `train_expert_only=true` freezes the VLM and trains only the action expert and projections, allowing finetuning with reduced memory usage.
|
||||
|
||||
## Relative Actions
|
||||
|
||||
By default, π₀ predicts absolute actions. You can enable **relative actions** so the model predicts offsets relative to the current robot state. This can improve training stability for certain setups.
|
||||
|
||||
To use relative actions, first recompute your dataset stats in relative space via the CLI:
|
||||
|
||||
```bash
|
||||
lerobot-edit-dataset \
|
||||
--repo_id your_dataset \
|
||||
--operation.type recompute_stats \
|
||||
--operation.relative_action true \
|
||||
--operation.chunk_size 50 \
|
||||
--operation.relative_exclude_joints "['gripper']" \
|
||||
--push_to_hub true
|
||||
```
|
||||
|
||||
Or equivalently in Python:
|
||||
|
||||
```python
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.dataset_tools import recompute_stats
|
||||
|
||||
dataset = LeRobotDataset("your_dataset")
|
||||
recompute_stats(dataset, relative_action=True, chunk_size=50, relative_exclude_joints=["gripper"])
|
||||
dataset.push_to_hub()
|
||||
```
|
||||
|
||||
The `chunk_size` should match your policy's `chunk_size` (default 50 for π₀). `relative_exclude_joints` lists joint names that should remain in absolute space (e.g. gripper commands). Use `--push_to_hub true` to upload the updated stats to the Hub.
|
||||
|
||||
Then train with relative actions enabled:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=your_dataset \
|
||||
--policy.type=pi0 \
|
||||
--policy.use_relative_actions=true \
|
||||
--policy.relative_exclude_joints='["gripper"]' \
|
||||
...
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
This model follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).
|
||||
|
||||
@@ -97,6 +97,46 @@ python src/lerobot/datasets/v30/augment_dataset_quantile_stats.py \
|
||||
|
||||
Or train pi05 with this normalization mapping: `--policy.normalization_mapping='{"ACTION": "MEAN_STD", "STATE": "MEAN_STD", "VISUAL": "IDENTITY"}'`
|
||||
|
||||
## Relative Actions
|
||||
|
||||
By default, π₀.₅ predicts absolute actions. You can enable **relative actions** so the model predicts offsets relative to the current robot state. This can improve training stability for certain setups.
|
||||
|
||||
To use relative actions, first recompute your dataset stats in relative space via the CLI:
|
||||
|
||||
```bash
|
||||
lerobot-edit-dataset \
|
||||
--repo_id your_dataset \
|
||||
--operation.type recompute_stats \
|
||||
--operation.relative_action true \
|
||||
--operation.chunk_size 50 \
|
||||
--operation.relative_exclude_joints "['gripper']" \
|
||||
--push_to_hub true
|
||||
```
|
||||
|
||||
Or equivalently in Python:
|
||||
|
||||
```python
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.dataset_tools import recompute_stats
|
||||
|
||||
dataset = LeRobotDataset("your_dataset")
|
||||
recompute_stats(dataset, relative_action=True, chunk_size=50, relative_exclude_joints=["gripper"])
|
||||
dataset.push_to_hub()
|
||||
```
|
||||
|
||||
The `chunk_size` should match your policy's `chunk_size` (default 50 for π₀.₅). `relative_exclude_joints` lists joint names that should remain in absolute space (e.g. gripper commands). Use `--push_to_hub true` to upload the updated stats to the Hub.
|
||||
|
||||
Then train with relative actions enabled:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=your_dataset \
|
||||
--policy.type=pi05 \
|
||||
--policy.use_relative_actions=true \
|
||||
--policy.relative_exclude_joints='["gripper"]' \
|
||||
...
|
||||
```
|
||||
|
||||
## Performance Results
|
||||
|
||||
### Libero Benchmark Results
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
# Multitask DiT Policy
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this work, please cite the following:
|
||||
|
||||
```bibtex
|
||||
@misc{jones2025multitaskditpolicy,
|
||||
author = {Bryson Jones},
|
||||
title = {Dissecting and Open-Sourcing Multitask Diffusion Transformer Policy},
|
||||
year = {2025},
|
||||
url = {https://brysonkjones.substack.com/p/dissecting-and-open-sourcing-multitask-diffusion-transformer-policy},
|
||||
note = {Blog post}
|
||||
}
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@misc{trilbmteam2025carefulexaminationlargebehaviormodels,
|
||||
author = {TRI LBM Team},
|
||||
title = {A Careful Examination of Large Behavior Models for Multitask Dexterous Manipulation},
|
||||
year = {2025},
|
||||
eprint = {arXiv:2507.05331},
|
||||
archivePrefix = {arXiv},
|
||||
primaryClass = {cs.RO},
|
||||
url = {https://arxiv.org/abs/2507.05331}
|
||||
}
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@misc{bostondynamics2025largebehaviormodelsatlas,
|
||||
author = {Boston Dynamics and TRI Research Team},
|
||||
title = {Large Behavior Models and Atlas Find New Footing},
|
||||
year = {2025},
|
||||
url = {https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/},
|
||||
note = {Blog post}
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,91 @@
|
||||
# π₀.₅ (pi05)
|
||||
|
||||
This repository contains the Hugging Face port of **π₀.₅**, adapted from [OpenPI](https://github.com/Physical-Intelligence/openpi) by Physical Intelligence.
|
||||
It is designed as a **Vision-Language-Action model with open-world generalization**.
|
||||
|
||||
---
|
||||
|
||||
## Model Overview
|
||||
|
||||
| Feature | π₀ | π₀.₅ |
|
||||
| -------------------- | ------------------------------------------------------ | ----------------------------------------- |
|
||||
| Time Conditioning | Concatenates time with actions via `action_time_mlp_*` | Uses `time_mlp_*` for AdaRMS conditioning |
|
||||
| AdaRMS | Not used | Used in action expert |
|
||||
| Tokenizer Length | 48 tokens | 200 tokens |
|
||||
| Discrete State Input | False (Uses `state_proj` layer) | True |
|
||||
| Parameter Count | Higher (includes state embedding) | Lower (no state embedding) |
|
||||
|
||||
---
|
||||
|
||||
## Relative Actions
|
||||
|
||||
π₀.₅ supports training with **relative actions**, where the model learns relative offsets
|
||||
from the current robot state instead of absolute joint positions. This mirrors the
|
||||
relative-action transform in OpenPI (`DeltaActions`) and can improve performance.
|
||||
|
||||
### How it works
|
||||
|
||||
1. **During preprocessing**, absolute actions are converted to relative offsets:
|
||||
`relative = action - state` (for selected joints).
|
||||
2. The relative actions are normalized using statistics computed from the relative distribution.
|
||||
3. **During postprocessing**, predicted relative actions are converted back to absolute:
|
||||
`absolute = relative + state`.
|
||||
|
||||
Joints listed in `relative_exclude_joints` (e.g., gripper) are kept absolute.
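
The transform itself is simple enough to sketch directly. This is illustrative NumPy rather than the library code, and the joint names are made up for the example:

```python
import numpy as np

joint_names = ["shoulder_pan", "shoulder_lift", "elbow", "wrist", "gripper"]
exclude = ["gripper"]  # matched by substring; these joints stay absolute
keep_absolute = np.array([any(e in name for e in exclude) for name in joint_names])

state = np.array([0.10, -0.20, 0.30, 0.00, 1.00])   # current robot state
action = np.array([0.15, -0.10, 0.35, 0.05, 0.00])  # absolute target from the dataset

# Preprocessing: relative = action - state for the selected joints.
relative = np.where(keep_absolute, action, action - state)

# Postprocessing: absolute = relative + state for the same joints.
recovered = np.where(keep_absolute, relative, relative + state)
assert np.allclose(recovered, action)
```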
|
||||
|
||||
### Configuration
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| ------------------------- | ----------- | ------------- | ---------------------------------------------------------------- |
|
||||
| `use_relative_actions` | `bool` | `False` | Enable relative-action training |
|
||||
| `relative_exclude_joints` | `list[str]` | `["gripper"]` | Joint names to keep absolute (matched by substring) |
|
||||
| `action_feature_names` | `list[str]` | `None` | Auto-populated from dataset metadata at runtime by `make_policy` |
|
||||
|
||||
### Training example
|
||||
|
||||
```bash
|
||||
python -m lerobot.scripts.lerobot_train \
|
||||
--policy.type=pi05 \
|
||||
--dataset.repo_id=your_org/your_dataset \
|
||||
--policy.use_relative_actions=true \
|
||||
--policy.relative_exclude_joints='["gripper"]'
|
||||
```
|
||||
|
||||
When `use_relative_actions=true`, the training script automatically:
|
||||
|
||||
- Computes relative action statistics from the dataset (sampled chunk-level relative actions)
|
||||
- Replaces the standard action stats with relative stats for normalization
|
||||
- Broadcasts these stats across all ranks in distributed training
|
||||
|
||||
---
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this work, please cite both **OpenPI** and the π₀.₅ paper:
|
||||
|
||||
```bibtex
|
||||
@misc{openpi2024,
|
||||
author = {Physical Intelligence Lab},
|
||||
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
|
||||
license = {Apache-2.0}
|
||||
}
|
||||
|
||||
@misc{intelligence2025pi05visionlanguageactionmodelopenworld,
|
||||
title = {π₀.₅: a Vision-Language-Action Model with Open-World Generalization},
|
||||
author = {Physical Intelligence and Kevin Black and Noah Brown and James Darpinian and Karan Dhabalia and Danny Driess and Adnan Esmail and Michael Equi and Chelsea Finn and Niccolo Fusai and Manuel Y. Galliker and Dibya Ghosh and Lachy Groom and Karol Hausman and Brian Ichter and Szymon Jakubczak and Tim Jones and Liyiming Ke and Devin LeBlanc and Sergey Levine and Adrian Li-Bell and Mohith Mothukuri and Suraj Nair and Karl Pertsch and Allen Z. Ren and Lucy Xiaoyang Shi and Laura Smith and Jost Tobias Springenberg and Kyle Stachowicz and James Tanner and Quan Vuong and Homer Walke and Anna Walling and Haohuan Wang and Lili Yu and Ury Zhilinsky},
|
||||
year = {2025},
|
||||
eprint = {2504.16054},
|
||||
archivePrefix= {arXiv},
|
||||
primaryClass = {cs.LG},
|
||||
url = {https://arxiv.org/abs/2504.16054},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
This port follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).
|
||||
@@ -0,0 +1,108 @@
|
||||
# π₀ (pi0)
|
||||
|
||||
This repository contains the Hugging Face port of **π₀**, adapted from [OpenPI](https://github.com/Physical-Intelligence/openpi) by Physical Intelligence.
|
||||
It is designed as a **Vision-Language-Action model for general robot control**.
|
||||
|
||||
---
|
||||
|
||||
## Model Overview
|
||||
|
||||
| Feature | π₀ | π₀.₅ |
|
||||
| -------------------- | ------------------------------------------------------ | ----------------------------------------- |
|
||||
| Time Conditioning | Concatenates time with actions via `action_time_mlp_*` | Uses `time_mlp_*` for AdaRMS conditioning |
|
||||
| AdaRMS | Not used | Used in action expert |
|
||||
| Tokenizer Length | 48 tokens | 200 tokens |
|
||||
| Discrete State Input | False (Uses `state_proj` layer) | True |
|
||||
| Parameter Count | Higher (includes state embedding) | Lower (no state embedding) |
|
||||
|
||||
---
|
||||
|
||||
## Relative Actions
|
||||
|
||||
π₀ supports training with **relative actions**, where the model learns relative offsets
|
||||
from the current robot state instead of absolute joint positions. This mirrors the
|
||||
relative-action transform in OpenPI (`DeltaActions`) and can improve performance.
|
||||
|
||||
### How it works
|
||||
|
||||
1. **During preprocessing**, absolute actions are converted to relative offsets:
|
||||
`relative = action - state` (for selected joints).
|
||||
2. The relative actions are normalized using statistics computed from the relative distribution.
|
||||
3. **During postprocessing**, predicted relative actions are converted back to absolute:
|
||||
`absolute = relative + state`.
|
||||
|
||||
Joints listed in `relative_exclude_joints` (e.g., gripper) are kept absolute.
|
||||
|
||||
### Configuration
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| ------------------------- | ----------- | ------------- | ---------------------------------------------------------------- |
|
||||
| `use_relative_actions` | `bool` | `False` | Enable relative-action training |
|
||||
| `relative_exclude_joints` | `list[str]` | `["gripper"]` | Joint names to keep absolute (matched by substring) |
|
||||
| `action_feature_names` | `list[str]` | `None` | Auto-populated from dataset metadata at runtime by `make_policy` |
|
||||
|
||||
### Training example
|
||||
|
||||
```bash
|
||||
python -m lerobot.scripts.lerobot_train \
|
||||
--policy.type=pi0 \
|
||||
--dataset.repo_id=your_org/your_dataset \
|
||||
--policy.use_relative_actions=true \
|
||||
--policy.relative_exclude_joints='["gripper"]'
|
||||
```
|
||||
|
||||
When `use_relative_actions=true`, the training script automatically:
|
||||
|
||||
- Computes relative action statistics from the dataset (sampled chunk-level relative actions)
|
||||
- Replaces the standard action stats with relative stats for normalization
|
||||
- Broadcasts these stats across all ranks in distributed training
|
||||
|
||||
### Recomputing stats for an existing dataset
|
||||
|
||||
If you want to precompute relative action stats offline, use `recompute_stats` from
|
||||
`lerobot.datasets.dataset_tools`:
|
||||
|
||||
```python
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.dataset_tools import recompute_stats
|
||||
|
||||
dataset = LeRobotDataset("your_org/your_dataset")
|
||||
dataset = recompute_stats(
|
||||
dataset,
|
||||
relative_action=True,
|
||||
relative_exclude_joints=["gripper"],
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this work, please cite both **OpenPI** and the π₀ paper:
|
||||
|
||||
```bibtex
|
||||
@misc{openpi2024,
|
||||
author = {Physical Intelligence Lab},
|
||||
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
|
||||
license = {Apache-2.0}
|
||||
}
|
||||
|
||||
@misc{black2024pi0visionlanguageactionflowmodel,
|
||||
title = {π₀: A Vision-Language-Action Flow Model for General Robot Control},
|
||||
author = {Kevin Black and Noah Brown and Danny Driess and Adnan Esmail and Michael Equi and Chelsea Finn and Niccolo Fusai and Lachy Groom and Karol Hausman and Brian Ichter and Szymon Jakubczak and Tim Jones and Liyiming Ke and Sergey Levine and Adrian Li-Bell and Mohith Mothukuri and Suraj Nair and Karl Pertsch and Lucy Xiaoyang Shi and James Tanner and Quan Vuong and Anna Walling and Haohuan Wang and Ury Zhilinsky},
|
||||
year = {2024},
|
||||
eprint = {2410.24164},
|
||||
archivePrefix= {arXiv},
|
||||
primaryClass = {cs.LG},
|
||||
url = {https://arxiv.org/abs/2410.24164},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
This port follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).
|
||||
@@ -0,0 +1,38 @@
|
||||
# Real-Time Chunking (RTC)
|
||||
|
||||
This module contains the LeRobot implementation of **Real-Time Chunking (RTC)**, an inference-time technique for flow-matching based policies.
|
||||
|
||||
**Note**: RTC is not a policy itself, but rather an inference enhancement that works with flow-matching based policies including [π₀](../pi0/), [π₀.₅](../pi05/), and [SmolVLA](../smolvla/).
|
||||
|
||||
---
|
||||
|
||||
## Citation
|
||||
|
||||
If you use Real-Time Chunking in your work, please cite:
|
||||
|
||||
```bibtex
|
||||
@misc{openpi2024,
|
||||
author = {Physical Intelligence Lab},
|
||||
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
|
||||
license = {Apache-2.0}
|
||||
}
|
||||
|
||||
@misc{black2025realtimeexecutionactionchunking,
|
||||
title={Real-Time Execution of Action Chunking Flow Policies},
|
||||
author={Kevin Black and Manuel Y. Galliker and Sergey Levine},
|
||||
year={2025},
|
||||
eprint={2506.07339},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.RO},
|
||||
url={https://arxiv.org/abs/2506.07339},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
This implementation follows the **Apache 2.0 License**, consistent with the LeRobot project.
|
||||
@@ -0,0 +1,14 @@
|
||||
## Paper
|
||||
|
||||
https://arxiv.org/abs/2509.25358
|
||||
|
||||
## Citation
|
||||
|
||||
```bibtex
|
||||
@article{chen2025sarm,
|
||||
title={SARM: Stage-Aware Reward Modeling for Long Horizon Robot Manipulation},
|
||||
author={Chen, Qianzhong and Yu, Justin and Schwager, Mac and Abbeel, Pieter and Shentu, Yide and Wu, Philipp},
|
||||
journal={arXiv preprint arXiv:2509.25358},
|
||||
year={2025}
|
||||
}
|
||||
```
|
||||
@@ -236,10 +236,10 @@ It is advisable to install one 3-pin cable in the motor after placing them befor
|
||||
|
||||
### Joint 1
|
||||
|
||||
- Install both motor horns. Secure the top horn with a M3x6mm screw. No screws are required for the bottom horn.
|
||||
- Place the first motor into the base.
|
||||
- Fasten the motor with 4 M2x6mm screws (smallest screws). Two from the top and two from the bottom.
|
||||
- Slide over the first motor holder and fasten it using two M2x6mm screws (one on each side).
|
||||
- Install both motor horns, securing the top horn with a M3x6mm screw.
|
||||
- Attach the shoulder part.
|
||||
- Tighten the shoulder part with 4 M3x6mm screws on top and 4 M3x6mm screws on the bottom
|
||||
- Add the shoulder motor holder.
|
||||
@@ -255,9 +255,9 @@ It is advisable to install one 3-pin cable in the motor after placing them befor
|
||||
|
||||
### Joint 2
|
||||
|
||||
- Install both motor horns. Secure the top horn with a M3x6mm screw. No screws are required for the bottom horn.
|
||||
- Slide the second motor in from the top.
|
||||
- Fasten the second motor with 4 M2x6mm screws.
|
||||
- Attach both motor horns to motor 2, again use the M3x6mm horn screw.
|
||||
- Attach the upper arm with 4 M3x6mm screws on each side.
|
||||
|
||||
<div class="video-container">
|
||||
@@ -271,8 +271,8 @@ It is advisable to install one 3-pin cable in the motor after placing them befor
|
||||
|
||||
### Joint 3
|
||||
|
||||
- Insert motor 3 and fasten using 4 M2x6mm screws
|
||||
- Attach both motor horns to motor 3 and secure one again with a M3x6mm horn screw.
|
||||
- Install both motor horns. Secure the top horn with a M3x6mm screw. No screws are required for the bottom horn.
|
||||
- Insert motor 3 and fasten using 4 M2x6mm screws.
|
||||
- Connect the forearm to motor 3 using 4 M3x6mm screws on each side.
|
||||
|
||||
<div class="video-container">
|
||||
@@ -286,9 +286,10 @@ It is advisable to install one 3-pin cable in the motor after placing them befor
|
||||
|
||||
### Joint 4
|
||||
|
||||
- Install both motor horns. Secure the top horn with a M3x6mm screw. No screws are required for the bottom horn.
|
||||
- Slide over motor holder 4.
|
||||
- Slide in motor 4.
|
||||
- Fasten motor 4 with 4 M2x6mm screws and attach its motor horns, use a M3x6mm horn screw.
|
||||
- Fasten motor 4 with 4 M2x6mm screws.
|
||||
|
||||
<div class="video-container">
|
||||
<video controls width="600">
|
||||
@@ -321,7 +322,7 @@ It is advisable to install one 3-pin cable in the motor after placing them befor
|
||||
|
||||
- Attach the gripper to motor 5, attach it to the motor horn on the wrist using 4 M3x6mm screws.
|
||||
- Insert the gripper motor and secure it with 2 M2x6mm screws on each side.
|
||||
- Attach the motor horns and again use a M3x6mm horn screw.
|
||||
- Install both motor horns on the gripper motor. Secure the top horn with a M3x6mm screw; no screws are required for the bottom horn.
|
||||
- Install the gripper claw and secure it with 4 M3x6mm screws on both sides.
|
||||
|
||||
<div class="video-container">
|
||||
|
||||
@@ -78,7 +78,7 @@ def replay(cfg: ReplayConfig):
|
||||
|
||||
robot = make_robot_from_config(cfg.robot)
|
||||
dataset = LeRobotDataset(cfg.dataset.repo_id, root=cfg.dataset.root, episodes=[cfg.dataset.episode])
|
||||
actions = dataset.hf_dataset.select_columns(ACTION)
|
||||
actions = dataset.select_columns(ACTION)
|
||||
robot.connect()
|
||||
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,680 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Create MP4 (or GIF) videos with sarm_progress overlay for specified episodes.
|
||||
|
||||
Downloads datasets from HuggingFace, seeks directly into the episode segment
|
||||
of the source video, draws a progress line on each frame, and writes the result.
|
||||
|
||||
Usage:
|
||||
python examples/dataset/create_progress_videos.py \
|
||||
--repo-id lerobot-data-collection/level2_final_quality3 \
|
||||
--episode 1100
|
||||
|
||||
python examples/dataset/create_progress_videos.py \
|
||||
--repo-id lerobot-data-collection/level2_final_quality3 \
|
||||
--episode 1100 \
|
||||
--camera-key observation.images.top \
|
||||
--output-dir ./my_videos \
|
||||
--gif
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
GRAPH_Y_TOP_FRAC = 0.01
|
||||
GRAPH_Y_BOT_FRAC = 0.99
|
||||
LINE_THICKNESS = 3
|
||||
SHADOW_THICKNESS = 6
|
||||
REF_ALPHA = 0.45
|
||||
FILL_ALPHA = 0.55
|
||||
SCORE_FONT_SCALE = 0.8
|
||||
TASK_FONT_SCALE = 0.55
|
||||
|
||||
|
||||
def download_episode_metadata(repo_id: str, episode: int) -> Path:
|
||||
"""Download only the metadata and sarm_progress files for a dataset.
|
||||
|
||||
Args:
|
||||
repo_id: HuggingFace dataset repository ID.
|
||||
episode: Episode index (used for logging only; all meta is fetched).
|
||||
|
||||
Returns:
|
||||
Local cache path for the downloaded snapshot.
|
||||
"""
|
||||
logging.info("[1/4] Downloading metadata for %s (episode %d) ...", repo_id, episode)
|
||||
local_path = Path(
|
||||
snapshot_download(
|
||||
repo_id=repo_id,
|
||||
repo_type="dataset",
|
||||
allow_patterns=["meta/**", "sarm_progress.parquet"],
|
||||
ignore_patterns=["*.mp4"],
|
||||
)
|
||||
)
|
||||
return local_path
|
||||
|
||||
|
||||
def load_episode_meta(local_path: Path, episode: int, camera_key: str | None) -> dict:
|
||||
"""Read info.json and episode parquet to resolve fps, video path, and timestamps.
|
||||
|
||||
Args:
|
||||
local_path: Local cache directory containing meta/.
|
||||
episode: Episode index to look up.
|
||||
camera_key: Camera observation key (e.g. "observation.images.base").
|
||||
If None, the first available video key is used.
|
||||
|
||||
Returns:
|
||||
Dict with keys: fps, camera, video_rel, chunk_index, file_index,
|
||||
from_ts, to_ts, task_name.
|
||||
"""
|
||||
info = json.loads((local_path / "meta" / "info.json").read_text())
|
||||
fps = info["fps"]
|
||||
features = info["features"]
|
||||
|
||||
video_keys = [k for k, v in features.items() if v.get("dtype") == "video"]
|
||||
if not video_keys:
|
||||
raise RuntimeError("No video keys found in dataset features")
|
||||
|
||||
if camera_key is not None:
|
||||
if camera_key not in video_keys:
|
||||
raise RuntimeError(f"camera_key='{camera_key}' not found. Available: {video_keys}")
|
||||
selected_camera = camera_key
|
||||
else:
|
||||
selected_camera = video_keys[0]
|
||||
logging.info(" fps=%d camera='%s' all_cams=%s", fps, selected_camera, video_keys)
|
||||
|
||||
episode_rows = []
|
||||
for parquet_file in sorted((local_path / "meta" / "episodes").glob("**/*.parquet")):
|
||||
episode_rows.append(pd.read_parquet(parquet_file))
|
||||
episode_df = pd.concat(episode_rows, ignore_index=True)
|
||||
row = episode_df[episode_df["episode_index"] == episode]
|
||||
if row.empty:
|
||||
raise RuntimeError(f"Episode {episode} not found in episode metadata")
|
||||
row = row.iloc[0]
|
||||
|
||||
chunk_col = f"videos/{selected_camera}/chunk_index"
|
||||
file_col = f"videos/{selected_camera}/file_index"
|
||||
ts_from_col = f"videos/{selected_camera}/from_timestamp"
|
||||
ts_to_col = f"videos/{selected_camera}/to_timestamp"
|
||||
|
||||
if chunk_col not in row.index:
|
||||
chunk_col = f"{selected_camera}/chunk_index"
|
||||
file_col = f"{selected_camera}/file_index"
|
||||
ts_from_col = f"{selected_camera}/from_timestamp"
|
||||
ts_to_col = f"{selected_camera}/to_timestamp"
|
||||
if chunk_col not in row.index:
|
||||
raise RuntimeError(
|
||||
f"Cannot find video metadata columns for {selected_camera}.\nAvailable: {list(row.index)}"
|
||||
)
|
||||
|
||||
chunk_index = int(row[chunk_col])
|
||||
file_index = int(row[file_col])
|
||||
from_timestamp = float(row[ts_from_col])
|
||||
to_timestamp = float(row[ts_to_col])
|
||||
|
||||
video_template = info.get(
|
||||
"video_path", "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4"
|
||||
)
|
||||
video_rel = video_template.format(
|
||||
video_key=selected_camera,
|
||||
chunk_index=chunk_index,
|
||||
file_index=file_index,
|
||||
)
|
||||
|
||||
task_name = _resolve_task_name(row, local_path)
|
||||
|
||||
return {
|
||||
"fps": fps,
|
||||
"camera": selected_camera,
|
||||
"video_rel": video_rel,
|
||||
"chunk_index": chunk_index,
|
||||
"file_index": file_index,
|
||||
"from_ts": from_timestamp,
|
||||
"to_ts": to_timestamp,
|
||||
"task_name": task_name,
|
||||
}
|
||||
|
||||
|
||||
def _resolve_task_name(row: pd.Series, local_path: Path) -> str:
|
||||
"""Best-effort extraction of the task name for an episode row.
|
||||
|
||||
Args:
|
||||
row: Single-episode row from the episodes parquet.
|
||||
local_path: Dataset cache root.
|
||||
|
||||
Returns:
|
||||
Task name string, or empty string if unavailable.
|
||||
"""
|
||||
try:
|
||||
if "tasks" in row.index and row["tasks"] is not None:
|
||||
tasks_val = row["tasks"]
|
||||
if isinstance(tasks_val, (list, tuple, np.ndarray)) and len(tasks_val) > 0:
|
||||
return str(tasks_val[0])
|
||||
return str(tasks_val).strip("[]'")
|
||||
|
||||
tasks_parquet = local_path / "meta" / "tasks.parquet"
|
||||
if tasks_parquet.exists():
|
||||
tasks_df = pd.read_parquet(tasks_parquet)
|
||||
task_idx = int(row.get("task_index", 0)) if "task_index" in row.index else 0
|
||||
match = tasks_df[tasks_df["task_index"] == task_idx]
|
||||
if not match.empty:
|
||||
return str(match.index[0])
|
||||
except Exception as exc:
|
||||
logging.warning("Could not load task name: %s", exc)
|
||||
return ""
|
||||
|
||||
|
||||
def download_video_file(repo_id: str, local_path: Path, video_rel: str) -> Path:
|
||||
"""Download the specific video file if not already cached.
|
||||
|
||||
Args:
|
||||
repo_id: HuggingFace dataset repository ID.
|
||||
local_path: Local cache directory.
|
||||
video_rel: Relative path to the video file within the dataset.
|
||||
|
||||
Returns:
|
||||
Absolute path to the downloaded video file.
|
||||
"""
|
||||
video_path = local_path / video_rel
|
||||
if video_path.exists():
|
||||
logging.info(" Video already cached: %s", video_path)
|
||||
return video_path
|
||||
logging.info("[2/4] Downloading video file %s ...", video_rel)
|
||||
snapshot_download(
|
||||
repo_id=repo_id,
|
||||
repo_type="dataset",
|
||||
local_dir=str(local_path),
|
||||
allow_patterns=[video_rel],
|
||||
)
|
||||
if not video_path.exists():
|
||||
raise RuntimeError(f"Video not found after download: {video_path}")
|
||||
return video_path
|
||||
|
||||
|
||||
def load_progress_data(local_path: Path, episode: int) -> np.ndarray | None:
|
||||
"""Load sarm_progress values for an episode.
|
||||
|
||||
Args:
|
||||
local_path: Dataset cache root.
|
||||
episode: Episode index.
|
||||
|
||||
Returns:
|
||||
Sorted (N, 2) array of (frame_index, progress), or None if unavailable.
|
||||
"""
|
||||
parquet_path = local_path / "sarm_progress.parquet"
|
||||
if not parquet_path.exists():
|
||||
logging.warning("sarm_progress.parquet not found")
|
||||
return None
|
||||
df = pd.read_parquet(parquet_path)
|
||||
logging.info(" sarm_progress.parquet columns: %s", list(df.columns))
|
||||
episode_df = df[df["episode_index"] == episode].copy()
|
||||
if episode_df.empty:
|
||||
logging.warning("No sarm_progress rows for episode %d", episode)
|
||||
return None
|
||||
episode_df = episode_df.sort_values("frame_index")
|
||||
|
||||
if "progress_dense" in episode_df.columns and episode_df["progress_dense"].notna().any():
|
||||
progress_column = "progress_dense"
|
||||
elif "progress_sparse" in episode_df.columns:
|
||||
progress_column = "progress_sparse"
|
||||
else:
|
||||
progress_columns = [c for c in episode_df.columns if "progress" in c.lower()]
|
||||
if not progress_columns:
|
||||
return None
|
||||
progress_column = progress_columns[0]
|
||||
|
||||
logging.info(" Using progress column: '%s'", progress_column)
|
||||
return episode_df[["frame_index", progress_column]].rename(columns={progress_column: "progress"}).values
|
||||
|
||||
|
||||
def _precompute_pixel_coords(
|
||||
progress_data: np.ndarray,
|
||||
num_frames: int,
|
||||
frame_width: int,
|
||||
frame_height: int,
|
||||
) -> np.ndarray:
|
||||
"""Map progress samples to pixel coordinates for overlay drawing.
|
||||
|
||||
Args:
|
||||
progress_data: (N, 2) array of (frame_index, progress).
|
||||
num_frames: Total number of video frames.
|
||||
frame_width: Video width in pixels.
|
||||
frame_height: Video height in pixels.
|
||||
|
||||
Returns:
|
||||
(N, 2) array of (x, y) pixel coordinates.
|
||||
"""
|
||||
frame_indices = progress_data[:, 0].astype(float)
|
||||
progress_values = np.clip(progress_data[:, 1].astype(float), 0.0, 1.0)
|
||||
|
||||
y_top = int(frame_height * GRAPH_Y_TOP_FRAC)
|
||||
y_bot = int(frame_height * GRAPH_Y_BOT_FRAC)
|
||||
graph_height = y_bot - y_top
|
||||
|
||||
x_coords = (frame_indices / (num_frames - 1) * (frame_width - 1)).astype(int)
|
||||
y_coords = (y_bot - progress_values * graph_height).astype(int)
|
||||
|
||||
return np.stack([x_coords, y_coords], axis=1)
|
||||
|
||||
|
||||
def _progress_color(normalized_position: float) -> tuple[int, int, int]:
|
||||
"""Interpolate BGR color from red to green based on position in [0, 1].
|
||||
|
||||
Args:
|
||||
normalized_position: Value in [0, 1] indicating how far along the episode.
|
||||
|
||||
Returns:
|
||||
BGR color tuple.
|
||||
"""
|
||||
red = int(255 * (1.0 - normalized_position))
|
||||
green = int(255 * normalized_position)
|
||||
return (0, green, red)
|
||||
|
||||
|
||||
def _prerender_fill_polygon(
|
||||
pixel_coords: np.ndarray,
|
||||
frame_width: int,
|
||||
frame_height: int,
|
||||
) -> np.ndarray:
|
||||
"""Pre-render the grey fill polygon under the progress curve as a BGRA image.
|
||||
|
||||
Args:
|
||||
pixel_coords: (N, 2) array of (x, y) pixel coordinates.
|
||||
frame_width: Video width in pixels.
|
||||
frame_height: Video height in pixels.
|
||||
|
||||
Returns:
|
||||
BGRA image array of shape (frame_height, frame_width, 4).
|
||||
"""
|
||||
y_bot = int(frame_height * GRAPH_Y_BOT_FRAC)
|
||||
fill_image = np.zeros((frame_height, frame_width, 4), dtype=np.uint8)
|
||||
polygon = np.concatenate(
|
||||
[
|
||||
pixel_coords,
|
||||
[[pixel_coords[-1][0], y_bot], [pixel_coords[0][0], y_bot]],
|
||||
],
|
||||
axis=0,
|
||||
).astype(np.int32)
|
||||
cv2.fillPoly(fill_image, [polygon], color=(128, 128, 128, int(255 * FILL_ALPHA)))
|
||||
return fill_image
|
||||
|
||||
|
||||
def _alpha_composite_region(base: np.ndarray, overlay_bgra: np.ndarray, x_limit: int) -> None:
|
||||
"""Blend BGRA overlay onto BGR base in-place, up to x_limit columns.
|
||||
|
||||
Args:
|
||||
base: BGR frame to draw on (modified in-place).
|
||||
overlay_bgra: BGRA overlay image.
|
||||
x_limit: Only blend columns [0, x_limit).
|
||||
"""
|
||||
if x_limit <= 0:
|
||||
return
|
||||
region_base = base[:, :x_limit]
|
||||
region_overlay = overlay_bgra[:, :x_limit]
|
||||
alpha = region_overlay[:, :, 3:4].astype(np.float32) / 255.0
|
||||
region_base[:] = np.clip(
|
||||
region_overlay[:, :, :3].astype(np.float32) * alpha + region_base.astype(np.float32) * (1.0 - alpha),
|
||||
0,
|
||||
255,
|
||||
).astype(np.uint8)
|
||||
|
||||
|
||||
def _draw_text_outlined(
|
||||
frame: np.ndarray,
|
||||
text: str,
|
||||
position: tuple[int, int],
|
||||
font_scale: float,
|
||||
thickness: int = 1,
|
||||
) -> None:
|
||||
"""Draw white text with a dark outline for readability on any background.
|
||||
|
||||
Args:
|
||||
frame: BGR image to draw on (modified in-place).
|
||||
text: String to render.
|
||||
position: (x, y) bottom-left corner of the text.
|
||||
font_scale: OpenCV font scale.
|
||||
thickness: Text stroke thickness.
|
||||
"""
|
||||
font = cv2.FONT_HERSHEY_SIMPLEX
|
||||
cv2.putText(frame, text, position, font, font_scale, (0, 0, 0), thickness + 2, cv2.LINE_AA)
|
||||
cv2.putText(frame, text, position, font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)
|
||||
|
||||
|
||||
def composite_progress_video(
|
||||
video_path: Path,
|
||||
from_timestamp: float,
|
||||
to_timestamp: float,
|
||||
progress_data: np.ndarray,
|
||||
output_path: Path,
|
||||
fps: float,
|
||||
task_name: str = "",
|
||||
) -> Path:
|
||||
"""Read episode frames by seeking into the source video, draw progress overlay, write output.
|
||||
|
||||
Uses cv2.CAP_PROP_POS_MSEC to seek directly into the source video,
|
||||
eliminating the need for an intermediate clip file.
|
||||
|
||||
Args:
|
||||
video_path: Path to the full source video file.
|
||||
from_timestamp: Start timestamp of the episode in seconds.
|
||||
to_timestamp: End timestamp of the episode in seconds.
|
||||
progress_data: (N, 2) array of (frame_index, progress).
|
||||
output_path: Path to write the output MP4.
|
||||
fps: Frames per second for the output video.
|
||||
task_name: Optional task name to display at the top of the video.
|
||||
|
||||
Returns:
|
||||
Path to the written output file (MP4).
|
||||
"""
|
||||
capture = cv2.VideoCapture(str(video_path))
|
||||
try:
|
||||
capture.set(cv2.CAP_PROP_POS_MSEC, from_timestamp * 1000)
|
||||
|
||||
frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
frame_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
duration_seconds = to_timestamp - from_timestamp
|
||||
num_frames = int(round(duration_seconds * fps))
|
||||
|
||||
logging.info(
|
||||
" Video: %dx%d, %d frames @ %.1f fps (%.2fs)",
|
||||
frame_width,
|
||||
frame_height,
|
||||
num_frames,
|
||||
fps,
|
||||
duration_seconds,
|
||||
)
|
||||
|
||||
pixel_coords = _precompute_pixel_coords(progress_data, num_frames, frame_width, frame_height)
|
||||
y_ref = int(frame_height * GRAPH_Y_TOP_FRAC)
|
||||
|
||||
fill_image = _prerender_fill_polygon(pixel_coords, frame_width, frame_height)
|
||||
|
||||
ref_line_image = np.zeros((frame_height, frame_width, 4), dtype=np.uint8)
|
||||
cv2.line(
|
||||
ref_line_image,
|
||||
(0, y_ref),
|
||||
(frame_width - 1, y_ref),
|
||||
(200, 200, 200, int(255 * REF_ALPHA)),
|
||||
1,
|
||||
cv2.LINE_AA,
|
||||
)
|
||||
|
||||
frame_indices = progress_data[:, 0].astype(int)
|
||||
progress_values = progress_data[:, 1].astype(float)
|
||||
|
||||
logging.info("[3/4] Compositing %d frames ...", num_frames)
|
||||
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
|
||||
writer = cv2.VideoWriter(str(output_path), fourcc, fps, (frame_width, frame_height))
|
||||
|
||||
for frame_idx in range(num_frames):
|
||||
ret, frame = capture.read()
|
||||
if not ret:
|
||||
break
|
||||
|
||||
drawn_count = int(np.searchsorted(frame_indices, frame_idx, side="right"))
|
||||
x_current = (
|
||||
int(pixel_coords[min(drawn_count, len(pixel_coords)) - 1][0]) + 1 if drawn_count > 0 else 0
|
||||
)
|
||||
|
||||
_alpha_composite_region(frame, ref_line_image, frame_width)
|
||||
_alpha_composite_region(frame, fill_image, x_current)
|
||||
|
||||
if drawn_count >= 2:
|
||||
time_position = (drawn_count - 1) / max(len(progress_values) - 1, 1)
|
||||
line_color = _progress_color(time_position)
|
||||
points = pixel_coords[:drawn_count].reshape(-1, 1, 2).astype(np.int32)
|
||||
cv2.polylines(
|
||||
frame,
|
||||
[points],
|
||||
isClosed=False,
|
||||
color=(255, 255, 255),
|
||||
thickness=SHADOW_THICKNESS,
|
||||
lineType=cv2.LINE_AA,
|
||||
)
|
||||
cv2.polylines(
|
||||
frame,
|
||||
[points],
|
||||
isClosed=False,
|
||||
color=line_color,
|
||||
thickness=LINE_THICKNESS,
|
||||
lineType=cv2.LINE_AA,
|
||||
)
|
||||
|
||||
if drawn_count > 0:
|
||||
score = float(progress_values[min(drawn_count, len(progress_values)) - 1])
|
||||
score_text = f"{score:.2f}"
|
||||
(text_width, _), _ = cv2.getTextSize(
|
||||
score_text, cv2.FONT_HERSHEY_SIMPLEX, SCORE_FONT_SCALE, 2
|
||||
)
|
||||
score_x = frame_width - text_width - 12
|
||||
score_y = frame_height - 12
|
||||
time_position = (drawn_count - 1) / max(len(progress_values) - 1, 1)
|
||||
score_color = _progress_color(time_position)
|
||||
cv2.putText(
|
||||
frame,
|
||||
score_text,
|
||||
(score_x, score_y),
|
||||
cv2.FONT_HERSHEY_SIMPLEX,
|
||||
SCORE_FONT_SCALE,
|
||||
(0, 0, 0),
|
||||
4,
|
||||
cv2.LINE_AA,
|
||||
)
|
||||
cv2.putText(
|
||||
frame,
|
||||
score_text,
|
||||
(score_x, score_y),
|
||||
cv2.FONT_HERSHEY_SIMPLEX,
|
||||
SCORE_FONT_SCALE,
|
||||
score_color,
|
||||
2,
|
||||
cv2.LINE_AA,
|
||||
)
|
||||
|
||||
if task_name:
|
||||
(text_width, _), _ = cv2.getTextSize(task_name, cv2.FONT_HERSHEY_SIMPLEX, TASK_FONT_SCALE, 1)
|
||||
task_x = max((frame_width - text_width) // 2, 4)
|
||||
_draw_text_outlined(frame, task_name, (task_x, 22), TASK_FONT_SCALE)
|
||||
|
||||
writer.write(frame)
|
||||
if frame_idx % 100 == 0:
|
||||
logging.info(" Frame %d/%d ...", frame_idx, num_frames)
|
||||
|
||||
writer.release()
|
||||
finally:
|
||||
capture.release()
|
||||
|
||||
logging.info(" MP4 written: %s", output_path)
|
||||
return output_path
|
||||
|
||||
|
||||
def convert_mp4_to_gif(mp4_path: Path) -> Path:
|
||||
"""Convert an MP4 to an optimized GIF using ffmpeg palette generation.
|
||||
|
||||
Args:
|
||||
mp4_path: Path to the source MP4 file.
|
||||
|
||||
Returns:
|
||||
Path to the generated GIF file.
|
||||
"""
|
||||
capture = cv2.VideoCapture(str(mp4_path))
|
||||
frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
capture.release()
|
||||
|
||||
gif_path = mp4_path.with_suffix(".gif")
|
||||
palette_path = mp4_path.parent / "_palette.png"
|
||||
|
||||
logging.info("[4/4] Converting to GIF ...")
|
||||
result_palette = subprocess.run( # nosec B607
|
||||
[
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-i",
|
||||
str(mp4_path),
|
||||
"-vf",
|
||||
f"fps=10,scale={frame_width}:-1:flags=lanczos,palettegen=max_colors=128:stats_mode=diff",
|
||||
"-update",
|
||||
"1",
|
||||
str(palette_path),
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
if result_palette.returncode != 0:
|
||||
logging.warning("palettegen failed:\n%s", result_palette.stderr[-500:])
|
||||
|
||||
result_gif = subprocess.run( # nosec B607
|
||||
[
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-i",
|
||||
str(mp4_path),
|
||||
"-i",
|
||||
str(palette_path),
|
||||
"-filter_complex",
|
||||
f"fps=10,scale={frame_width}:-1:flags=lanczos[v];[v][1:v]paletteuse=dither=bayer:bayer_scale=3",
|
||||
str(gif_path),
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
if result_gif.returncode != 0:
|
||||
logging.warning("GIF encode failed:\n%s", result_gif.stderr[-500:])
|
||||
|
||||
palette_path.unlink(missing_ok=True)
|
||||
logging.info(" GIF written: %s", gif_path)
|
||||
return gif_path
|
||||
|
||||
|
||||
def process_dataset(
|
||||
repo_id: str,
|
||||
episode: int,
|
||||
camera_key: str | None,
|
||||
output_dir: Path,
|
||||
create_gif: bool = False,
|
||||
) -> Path | None:
|
||||
"""Full pipeline: download, extract metadata, composite progress, write output.
|
||||
|
||||
Args:
|
||||
repo_id: HuggingFace dataset repository ID.
|
||||
episode: Episode index.
|
||||
camera_key: Camera key to use, or None for auto-selection.
|
||||
output_dir: Directory to write output files.
|
||||
create_gif: If True, also generate a GIF from the MP4.
|
||||
|
||||
Returns:
|
||||
Path to the final output file, or None on failure.
|
||||
"""
|
||||
safe_name = repo_id.replace("/", "_")
|
||||
logging.info("Processing: %s | episode %d", repo_id, episode)
|
||||
|
||||
local_path = download_episode_metadata(repo_id, episode)
|
||||
logging.info(" Local cache: %s", local_path)
|
||||
|
||||
episode_meta = load_episode_meta(local_path, episode, camera_key)
|
||||
logging.info(" Episode meta: %s", episode_meta)
|
||||
|
||||
video_path = download_video_file(repo_id, local_path, episode_meta["video_rel"])
|
||||
|
||||
progress_data = load_progress_data(local_path, episode)
|
||||
if progress_data is None:
|
||||
logging.error("Could not load sarm_progress data. Skipping overlay.")
|
||||
return None
|
||||
|
||||
logging.info(" Progress frames: %d", len(progress_data))
|
||||
|
||||
output_path = output_dir / f"{safe_name}_ep{episode}_progress.mp4"
|
||||
final_path = composite_progress_video(
|
||||
video_path=video_path,
|
||||
from_timestamp=episode_meta["from_ts"],
|
||||
to_timestamp=episode_meta["to_ts"],
|
||||
progress_data=progress_data,
|
||||
output_path=output_path,
|
||||
fps=episode_meta["fps"],
|
||||
task_name=episode_meta.get("task_name", ""),
|
||||
)
|
||||
|
||||
if create_gif:
|
||||
final_path = convert_mp4_to_gif(final_path)
|
||||
|
||||
logging.info("Done: %s", final_path)
|
||||
return final_path
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Create MP4/GIF videos with sarm_progress overlay for dataset episodes."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--repo-id",
|
||||
type=str,
|
||||
required=True,
|
||||
help="HuggingFace dataset repository ID (e.g. 'lerobot-data-collection/level2_final_quality3').",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--episode",
|
||||
type=int,
|
||||
required=True,
|
||||
help="Episode index to visualize.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--camera-key",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Camera observation key (e.g. 'observation.images.base'). Auto-selects first camera if omitted.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=Path("progress_videos"),
|
||||
help="Directory to write output files (default: ./progress_videos).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gif",
|
||||
action="store_true",
|
||||
help="Also generate a GIF from the MP4 output.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
||||
|
||||
args.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = process_dataset(
|
||||
repo_id=args.repo_id,
|
||||
episode=args.episode,
|
||||
camera_key=args.camera_key,
|
||||
output_dir=args.output_dir,
|
||||
create_gif=args.gif,
|
||||
)
|
||||
|
||||
if result:
|
||||
logging.info("Output: %s", result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -88,9 +88,8 @@ def main():
|
||||
# The previous metadata class is contained in the 'meta' attribute of the dataset:
|
||||
print(dataset.meta)
|
||||
|
||||
# LeRobotDataset actually wraps an underlying Hugging Face dataset
|
||||
# (see https://huggingface.co/docs/datasets for more information).
|
||||
print(dataset.hf_dataset)
|
||||
# You can inspect the dataset using its repr:
|
||||
print(dataset)
|
||||
|
||||
# LeRobot datasets also subclasses PyTorch datasets so you can do everything you know and love from working
|
||||
# with the latter, like iterating through the dataset.
|
||||
|
||||
@@ -0,0 +1,228 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Shared utilities for Human-in-the-Loop data collection scripts."""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from lerobot.processor import (
|
||||
IdentityProcessorStep,
|
||||
RobotAction,
|
||||
RobotObservation,
|
||||
RobotProcessorPipeline,
|
||||
)
|
||||
from lerobot.processor.converters import (
|
||||
observation_to_transition,
|
||||
robot_action_observation_to_transition,
|
||||
transition_to_observation,
|
||||
transition_to_robot_action,
|
||||
)
|
||||
from lerobot.robots import Robot
|
||||
from lerobot.teleoperators import Teleoperator
|
||||
from lerobot.utils.control_utils import is_headless
|
||||
from lerobot.utils.robot_utils import precise_sleep
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class HILDatasetConfig:
|
||||
repo_id: str
|
||||
single_task: str
|
||||
root: str | Path | None = None
|
||||
fps: int = 30
|
||||
episode_time_s: float = 120
|
||||
num_episodes: int = 50
|
||||
video: bool = True
|
||||
push_to_hub: bool = True
|
||||
private: bool = False
|
||||
tags: list[str] | None = None
|
||||
num_image_writer_processes: int = 0
|
||||
num_image_writer_threads_per_camera: int = 4
|
||||
video_encoding_batch_size: int = 1
|
||||
vcodec: str = "auto"
|
||||
streaming_encoding: bool = True
|
||||
encoder_queue_maxsize: int = 30
|
||||
encoder_threads: int | None = None
|
||||
rename_map: dict[str, str] = field(default_factory=dict)
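A minimal instantiation sketch (the repo id and task string below are hypothetical values; everything else falls back to the defaults above):

cfg = HILDatasetConfig(
    repo_id="user/hil_pick_place",          # hypothetical Hub repo id
    single_task="Pick the cube and place it in the bin",
    fps=30,
    num_episodes=20,
    push_to_hub=False,                      # keep the recording local while iterating
)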
|
||||
|
||||
|
||||
def teleop_has_motor_control(teleop: Teleoperator) -> bool:
|
||||
"""Check if teleoperator has motor control capabilities."""
|
||||
return all(hasattr(teleop, attr) for attr in ("enable_torque", "disable_torque", "write_goal_positions"))
|
||||
|
||||
|
||||
def teleop_disable_torque(teleop: Teleoperator) -> None:
|
||||
"""Disable teleop torque if supported."""
|
||||
if hasattr(teleop, "disable_torque"):
|
||||
teleop.disable_torque()
|
||||
|
||||
|
||||
def teleop_enable_torque(teleop: Teleoperator) -> None:
|
||||
"""Enable teleop torque if supported."""
|
||||
if hasattr(teleop, "enable_torque"):
|
||||
teleop.enable_torque()
|
||||
|
||||
|
||||
def teleop_smooth_move_to(teleop: Teleoperator, target_pos: dict, duration_s: float = 2.0, fps: int = 50):
|
||||
"""Smoothly move teleop to target position if motor control is available."""
|
||||
if not teleop_has_motor_control(teleop):
|
||||
logger.warning("Teleop does not support motor control - cannot mirror robot position")
|
||||
return
|
||||
|
||||
teleop_enable_torque(teleop)
|
||||
current = teleop.get_action()
|
||||
steps = max(int(duration_s * fps), 1)
|
||||
|
||||
for step in range(steps + 1):
|
||||
t = step / steps
|
||||
interp = {}
|
||||
for k in current:
|
||||
if k in target_pos:
|
||||
interp[k] = current[k] * (1 - t) + target_pos[k] * t
|
||||
else:
|
||||
interp[k] = current[k]
|
||||
teleop.write_goal_positions(interp)
|
||||
time.sleep(1 / fps)
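A short usage sketch, mirroring how reset_loop below calls it: move the leader arm onto the follower's current joint positions before torque is handed back to the operator (robot and teleop are assumed to be connected instances).

obs = robot.get_observation()
target = {k: v for k, v in obs.items() if k.endswith(".pos")}  # joint positions only
teleop_smooth_move_to(teleop, target, duration_s=2.0, fps=50)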
|
||||
|
||||
|
||||
def init_keyboard_listener():
|
||||
"""Initialize keyboard listener with HIL controls."""
|
||||
events = {
|
||||
"exit_early": False,
|
||||
"rerecord_episode": False,
|
||||
"stop_recording": False,
|
||||
"policy_paused": False,
|
||||
"correction_active": False,
|
||||
"resume_policy": False,
|
||||
"in_reset": False,
|
||||
"start_next_episode": False,
|
||||
}
|
||||
|
||||
if is_headless():
|
||||
logger.warning("Headless environment - keyboard controls unavailable")
|
||||
return None, events
|
||||
|
||||
from pynput import keyboard
|
||||
|
||||
def on_press(key):
|
||||
try:
|
||||
if events["in_reset"]:
|
||||
if key in [keyboard.Key.space, keyboard.Key.right]:
|
||||
logger.info("[HIL] Starting next episode...")
|
||||
events["start_next_episode"] = True
|
||||
elif hasattr(key, "char") and key.char == "c":
|
||||
events["start_next_episode"] = True
|
||||
elif key == keyboard.Key.esc:
|
||||
logger.info("[HIL] ESC - Stop recording, pushing to hub...")
|
||||
events["stop_recording"] = True
|
||||
events["start_next_episode"] = True
|
||||
else:
|
||||
if key == keyboard.Key.space:
|
||||
if not events["policy_paused"] and not events["correction_active"]:
|
||||
logger.info("[HIL] PAUSED - Press 'c' to take control or 'p' to resume policy")
|
||||
events["policy_paused"] = True
|
||||
elif hasattr(key, "char") and key.char == "c":
|
||||
if events["policy_paused"] and not events["correction_active"]:
|
||||
logger.info("[HIL] Taking control...")
|
||||
events["start_next_episode"] = True
|
||||
elif hasattr(key, "char") and key.char == "p":
|
||||
if events["policy_paused"] or events["correction_active"]:
|
||||
logger.info("[HIL] Resuming policy...")
|
||||
events["resume_policy"] = True
|
||||
elif key == keyboard.Key.right:
|
||||
logger.info("[HIL] End episode")
|
||||
events["exit_early"] = True
|
||||
elif key == keyboard.Key.left:
|
||||
logger.info("[HIL] Re-record episode")
|
||||
events["rerecord_episode"] = True
|
||||
events["exit_early"] = True
|
||||
elif key == keyboard.Key.esc:
|
||||
logger.info("[HIL] ESC - Stop recording...")
|
||||
events["stop_recording"] = True
|
||||
events["exit_early"] = True
|
||||
except Exception as e:
|
||||
logger.info(f"Key error: {e}")
|
||||
|
||||
listener = keyboard.Listener(on_press=on_press)
|
||||
listener.start()
|
||||
return listener, events
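A minimal sketch of how the returned events dict is meant to be polled from a recording loop (fps is assumed to be defined by the caller; reset_loop below shows the full pattern):

listener, events = init_keyboard_listener()
while not events["stop_recording"]:
    if events["exit_early"]:
        events["exit_early"] = False
        break  # operator pressed the right arrow to end the episode
    precise_sleep(1 / fps)
if listener is not None:
    listener.stop()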
|
||||
|
||||
|
||||
def make_identity_processors():
|
||||
"""Create identity processors for recording."""
|
||||
teleop_proc = RobotProcessorPipeline[tuple[RobotAction, RobotObservation], RobotAction](
|
||||
steps=[IdentityProcessorStep()],
|
||||
to_transition=robot_action_observation_to_transition,
|
||||
to_output=transition_to_robot_action,
|
||||
)
|
||||
obs_proc = RobotProcessorPipeline[RobotObservation, RobotObservation](
|
||||
steps=[IdentityProcessorStep()],
|
||||
to_transition=observation_to_transition,
|
||||
to_output=transition_to_observation,
|
||||
)
|
||||
return teleop_proc, obs_proc
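A usage sketch under the assumption that these pipelines are called the same way as elsewhere in this changeset, i.e. the action pipeline takes an (action, observation) tuple and the observation pipeline takes the raw observation; the identity step passes both through unchanged.

teleop_proc, obs_proc = make_identity_processors()
action = teleop.get_action()
obs = robot.get_observation()
recorded_action = teleop_proc((action, obs))  # identical to `action`
recorded_obs = obs_proc(obs)                  # identical to `obs`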
|
||||
|
||||
|
||||
def reset_loop(robot: Robot, teleop: Teleoperator, events: dict, fps: int):
|
||||
"""Reset period where human repositions environment."""
|
||||
logger.info("[HIL] RESET")
|
||||
|
||||
events["in_reset"] = True
|
||||
events["start_next_episode"] = False
|
||||
|
||||
obs = robot.get_observation()
|
||||
robot_pos = {k: v for k, v in obs.items() if k.endswith(".pos") and k in robot.observation_features}
|
||||
teleop_smooth_move_to(teleop, robot_pos, duration_s=2.0, fps=50)
|
||||
|
||||
logger.info("Press any key to enable teleoperation")
|
||||
while not events["start_next_episode"] and not events["stop_recording"]:
|
||||
precise_sleep(0.05)
|
||||
|
||||
if events["stop_recording"]:
|
||||
return
|
||||
|
||||
events["start_next_episode"] = False
|
||||
teleop_disable_torque(teleop)
|
||||
logger.info("Teleop enabled - press any key to start episode")
|
||||
|
||||
while not events["start_next_episode"] and not events["stop_recording"]:
|
||||
loop_start = time.perf_counter()
|
||||
action = teleop.get_action()
|
||||
robot.send_action(action)
|
||||
precise_sleep(1 / fps - (time.perf_counter() - loop_start))
|
||||
|
||||
events["in_reset"] = False
|
||||
events["start_next_episode"] = False
|
||||
events["exit_early"] = False
|
||||
events["policy_paused"] = False
|
||||
events["correction_active"] = False
|
||||
events["resume_policy"] = False
|
||||
|
||||
|
||||
def print_controls(rtc: bool = False):
|
||||
"""Print control instructions."""
|
||||
mode = "Human-in-the-Loop Data Collection" + (" (RTC)" if rtc else "")
|
||||
logger.info(
|
||||
"%s\n Controls:\n"
|
||||
" SPACE - Pause policy\n"
|
||||
" c - Take control\n"
|
||||
" p - Resume policy after pause/correction\n"
|
||||
" → - End episode\n"
|
||||
" ESC - Stop and push to hub",
|
||||
mode,
|
||||
)
|
||||
@@ -35,9 +35,7 @@ def main():
|
||||
|
||||
# Fetch the dataset to replay
|
||||
dataset = LeRobotDataset("<hf_username>/<dataset_repo_id>", episodes=[EPISODE_IDX])
|
||||
# Filter dataset to only include frames from the specified episode since episodes are chunked in dataset V3.0
|
||||
episode_frames = dataset.hf_dataset.filter(lambda x: x["episode_index"] == EPISODE_IDX)
|
||||
actions = episode_frames.select_columns(ACTION)
|
||||
actions = dataset.select_columns(ACTION)
|
||||
|
||||
# Connect to the robot
|
||||
robot.connect()
|
||||
@@ -48,7 +46,7 @@ def main():
|
||||
|
||||
print("Starting replay loop...")
|
||||
log_say(f"Replaying episode {EPISODE_IDX}")
|
||||
for idx in range(len(episode_frames)):
|
||||
for idx in range(dataset.num_frames):
|
||||
t0 = time.perf_counter()
|
||||
|
||||
# Get recorded action from dataset
|
||||
|
||||
@@ -67,9 +67,7 @@ def main():
|
||||
|
||||
# Fetch the dataset to replay
|
||||
dataset = LeRobotDataset(HF_REPO_ID, episodes=[EPISODE_IDX])
|
||||
# Filter dataset to only include frames from the specified episode since episodes are chunked in dataset V3.0
|
||||
episode_frames = dataset.hf_dataset.filter(lambda x: x["episode_index"] == EPISODE_IDX)
|
||||
actions = episode_frames.select_columns(ACTION)
|
||||
actions = dataset.select_columns(ACTION)
|
||||
|
||||
# Connect to the robot
|
||||
robot.connect()
|
||||
@@ -80,7 +78,7 @@ def main():
|
||||
|
||||
print("Starting replay loop...")
|
||||
log_say(f"Replaying episode {EPISODE_IDX}")
|
||||
for idx in range(len(episode_frames)):
|
||||
for idx in range(dataset.num_frames):
|
||||
t0 = time.perf_counter()
|
||||
|
||||
# Get recorded action from dataset
|
||||
|
||||
@@ -63,6 +63,31 @@ Usage:
|
||||
--robot.cameras="{ gripper: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}, front: {type: opencv, index_or_path: 1, width: 640, height: 480, fps: 30}}" \
|
||||
--task="Move green small object into the purple platform" \
|
||||
--duration=120
|
||||
|
||||
# Run RTC with bi_openarm_follower (dual-arm OpenArms) and pi0.5 policy
|
||||
python examples/rtc/eval_with_real_robot.py \
|
||||
--policy.path=lerobot-data-collection/folding_final \
|
||||
--robot.type=bi_openarm_follower \
|
||||
--robot.cameras='{left_wrist: {type: opencv, index_or_path: "/dev/video4", width: 1280, height: 720, fps: 30}, base: {type: opencv, index_or_path: "/dev/video2", width: 640, height: 480, fps: 30}, right_wrist: {type: opencv, index_or_path: "/dev/video0", width: 1280, height: 720, fps: 30}}' \
|
||||
--robot.left_arm_config.port=can0 \
|
||||
--robot.left_arm_config.side=left \
|
||||
--robot.left_arm_config.can_interface=socketcan \
|
||||
--robot.left_arm_config.disable_torque_on_disconnect=true \
|
||||
--robot.left_arm_config.max_relative_target=8.0 \
|
||||
--robot.right_arm_config.port=can1 \
|
||||
--robot.right_arm_config.side=right \
|
||||
--robot.right_arm_config.can_interface=socketcan \
|
||||
--robot.right_arm_config.disable_torque_on_disconnect=true \
|
||||
--robot.right_arm_config.max_relative_target=8.0 \
|
||||
--task="Fold the T-shirt properly" \
|
||||
--fps=30 \
|
||||
--duration=2000 \
|
||||
--interpolation_multiplier=3 \
|
||||
--rtc.enabled=true \
|
||||
--rtc.execution_horizon=20 \
|
||||
--rtc.max_guidance_weight=5.0 \
|
||||
--rtc.prefix_attention_schedule=LINEAR \
|
||||
--device=cuda
|
||||
"""
|
||||
|
||||
import logging
|
||||
@@ -84,24 +109,30 @@ from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import RTCAttentionSchedule
|
||||
from lerobot.datasets.feature_utils import build_dataset_frame, hw_to_dataset_features
|
||||
from lerobot.policies.factory import get_policy_class, make_pre_post_processors
|
||||
from lerobot.policies.rtc.action_queue import ActionQueue
|
||||
from lerobot.policies.rtc.configuration_rtc import RTCConfig
|
||||
from lerobot.policies.rtc.latency_tracker import LatencyTracker
|
||||
from lerobot.policies.rtc import ActionInterpolator, ActionQueue, LatencyTracker, RTCConfig
|
||||
from lerobot.processor import (
|
||||
NormalizerProcessorStep,
|
||||
RelativeActionsProcessorStep,
|
||||
TransitionKey,
|
||||
create_transition,
|
||||
)
|
||||
from lerobot.processor.factory import (
|
||||
make_default_robot_action_processor,
|
||||
make_default_robot_observation_processor,
|
||||
)
|
||||
from lerobot.processor.relative_action_processor import to_relative_actions
|
||||
from lerobot.rl.process import ProcessSignalHandler
|
||||
from lerobot.robots import ( # noqa: F401
|
||||
Robot,
|
||||
RobotConfig,
|
||||
bi_openarm_follower,
|
||||
bi_so_follower,
|
||||
koch_follower,
|
||||
so_follower,
|
||||
unitree_g1,
|
||||
)
|
||||
from lerobot.robots.utils import make_robot_from_config
|
||||
from lerobot.utils.constants import OBS_IMAGES
|
||||
from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
|
||||
from lerobot.utils.hub import HubMixin
|
||||
from lerobot.utils.utils import init_logging
|
||||
|
||||
@@ -153,6 +184,7 @@ class RTCDemoConfig(HubMixin):
|
||||
# Demo parameters
|
||||
duration: float = 30.0 # Duration to run the demo (seconds)
|
||||
fps: float = 10.0 # Action execution frequency (Hz)
|
||||
interpolation_multiplier: int = 1 # Control rate multiplier (1=off, 2=2x, 3=3x)
|
||||
|
||||
# Compute device
|
||||
device: str | None = None # Device to run on (cuda, cpu, auto)
|
||||
@@ -212,6 +244,35 @@ def is_image_key(k: str) -> bool:
|
||||
return k.startswith(OBS_IMAGES)
|
||||
|
||||
|
||||
def _reanchor_relative_rtc_prefix(
|
||||
prev_actions_absolute: Tensor,
|
||||
current_state: Tensor,
|
||||
relative_step: RelativeActionsProcessorStep,
|
||||
normalizer_step: NormalizerProcessorStep | None,
|
||||
policy_device: torch.device | str,
|
||||
) -> Tensor:
|
||||
"""Convert absolute leftovers into model-space for relative-action RTC policies.
|
||||
|
||||
When a policy uses relative actions, the RTC prefix (leftover actions from
|
||||
the previous chunk) is stored in absolute space. Before feeding it back to
|
||||
the policy we need to re-express it relative to the *current* robot state
|
||||
and then re-normalize.
|
||||
"""
|
||||
state = current_state.detach().cpu()
|
||||
if state.dim() == 1:
|
||||
state = state.unsqueeze(0)
|
||||
|
||||
action_cpu = prev_actions_absolute.detach().cpu()
|
||||
mask = relative_step._build_mask(action_cpu.shape[-1])
|
||||
relative_actions = to_relative_actions(action_cpu, state, mask)
|
||||
|
||||
transition = create_transition(action=relative_actions)
|
||||
if normalizer_step is not None:
|
||||
transition = normalizer_step(transition)
|
||||
|
||||
return transition[TransitionKey.ACTION].to(policy_device)
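A simplified numeric sketch of the re-anchoring arithmetic with made-up values (the real code routes through to_relative_actions and the normalizer step): the leftover absolute targets are re-expressed against the robot state at the new inference step, except on dimensions the mask keeps absolute.

import torch

prev_abs = torch.tensor([[0.50, 0.20, 1.0]])  # leftover absolute action targets
state_now = torch.tensor([0.45, 0.25, 0.0])   # current robot state
mask = torch.tensor([1.0, 1.0, 0.0])          # last dim (e.g. gripper) stays absolute
relative = prev_abs - state_now * mask        # tensor([[0.05, -0.05, 1.0]])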
|
||||
|
||||
|
||||
def get_actions(
|
||||
policy,
|
||||
robot: RobotWrapper,
|
||||
@@ -237,7 +298,15 @@ def get_actions(
|
||||
fps = cfg.fps
|
||||
time_per_chunk = 1.0 / fps
|
||||
|
||||
dataset_features = hw_to_dataset_features(robot.observation_features(), "observation")
|
||||
# Only keep .pos joints + camera streams if the policy was trained on positions,
|
||||
# not the full pos/vel/torque state the robot exposes.
|
||||
observation_features_hw = {
|
||||
key: value
|
||||
for key, value in robot.observation_features().items()
|
||||
if key.endswith(".pos") or isinstance(value, tuple)
|
||||
}
|
||||
|
||||
dataset_features = hw_to_dataset_features(observation_features_hw, "observation")
|
||||
policy_device = policy.config.device
|
||||
|
||||
# Load preprocessor and postprocessor from pretrained files
|
||||
@@ -255,6 +324,25 @@ def get_actions(
|
||||
|
||||
logger.info("[GET_ACTIONS] Preprocessor/postprocessor loaded successfully with embedded stats")
|
||||
|
||||
relative_step = next(
|
||||
(s for s in preprocessor.steps if isinstance(s, RelativeActionsProcessorStep) and s.enabled),
|
||||
None,
|
||||
)
|
||||
normalizer_step = next(
|
||||
(s for s in preprocessor.steps if isinstance(s, NormalizerProcessorStep)),
|
||||
None,
|
||||
)
|
||||
if relative_step is not None:
|
||||
if relative_step.action_names is None:
|
||||
cfg_names = getattr(cfg.policy, "action_feature_names", None)
|
||||
if cfg_names:
|
||||
relative_step.action_names = list(cfg_names)
|
||||
else:
|
||||
relative_step.action_names = [
|
||||
k for k in robot.robot.action_features if k.endswith(".pos")
|
||||
]
|
||||
logger.info("[GET_ACTIONS] Relative actions enabled: will re-anchor RTC prefix")
|
||||
|
||||
get_actions_threshold = cfg.action_queue_size_to_get_new_actions
|
||||
|
||||
if not cfg.rtc.enabled:
|
||||
@@ -297,6 +385,28 @@ def get_actions(
|
||||
|
||||
preprocessed_obs = preprocessor(obs_with_policy_features)
|
||||
|
||||
# Re-anchor leftover actions for relative-action policies.
|
||||
# We need the *postprocessed* (absolute) leftover, not the original
|
||||
# (normalized/relative) one that get_left_over() returns.
|
||||
if (
|
||||
prev_actions is not None
|
||||
and relative_step is not None
|
||||
and OBS_STATE in obs_with_policy_features
|
||||
):
|
||||
with action_queue.lock:
|
||||
if action_queue.queue is not None:
|
||||
prev_actions_abs = action_queue.queue[action_queue.last_index :].clone()
|
||||
else:
|
||||
prev_actions_abs = None
|
||||
if prev_actions_abs is not None and prev_actions_abs.numel() > 0:
|
||||
prev_actions = _reanchor_relative_rtc_prefix(
|
||||
prev_actions_absolute=prev_actions_abs,
|
||||
current_state=obs_with_policy_features[OBS_STATE],
|
||||
relative_step=relative_step,
|
||||
normalizer_step=normalizer_step,
|
||||
policy_device=policy_device,
|
||||
)
|
||||
|
||||
# Generate actions WITH RTC
|
||||
actions = policy.predict_action_chunk(
|
||||
preprocessed_obs,
|
||||
@@ -352,21 +462,26 @@ def actor_control(
|
||||
try:
|
||||
logger.info("[ACTOR] Starting actor thread")
|
||||
|
||||
action_keys = [k for k in robot.action_features() if k.endswith(".pos")]
|
||||
|
||||
action_count = 0
|
||||
action_interval = 1.0 / cfg.fps
|
||||
interpolator = ActionInterpolator(multiplier=cfg.interpolation_multiplier)
|
||||
action_interval = interpolator.get_control_interval(cfg.fps)
|
||||
|
||||
while not shutdown_event.is_set():
|
||||
start_time = time.perf_counter()
|
||||
|
||||
# Try to get an action from the queue with timeout
|
||||
action = action_queue.get()
|
||||
if interpolator.needs_new_action():
|
||||
new_action = action_queue.get()
|
||||
if new_action is not None:
|
||||
interpolator.add(new_action.cpu())
|
||||
|
||||
action = interpolator.get()
|
||||
if action is not None:
|
||||
action = action.cpu()
|
||||
action_dict = {key: action[i].item() for i, key in enumerate(robot.action_features())}
|
||||
action_dict = {key: action[i].item() for i, key in enumerate(action_keys)}
|
||||
action_processed = robot_action_processor((action_dict, None))
|
||||
robot.send_action(action_processed)
|
||||
|
||||
action_count += 1
|
||||
|
||||
dt_s = time.perf_counter() - start_time
|
||||
|
||||
@@ -68,9 +68,7 @@ def main():
|
||||
|
||||
# Fetch the dataset to replay
|
||||
dataset = LeRobotDataset(HF_REPO_ID, episodes=[EPISODE_IDX])
|
||||
# Filter dataset to only include frames from the specified episode since episodes are chunked in dataset V3.0
|
||||
episode_frames = dataset.hf_dataset.filter(lambda x: x["episode_index"] == EPISODE_IDX)
|
||||
actions = episode_frames.select_columns(ACTION)
|
||||
actions = dataset.select_columns(ACTION)
|
||||
|
||||
# Connect to the robot
|
||||
robot.connect()
|
||||
@@ -81,7 +79,7 @@ def main():
|
||||
|
||||
print("Starting replay loop...")
|
||||
log_say(f"Replaying episode {EPISODE_IDX}")
|
||||
for idx in range(len(episode_frames)):
|
||||
for idx in range(dataset.num_frames):
|
||||
t0 = time.perf_counter()
|
||||
|
||||
# Get recorded action from dataset
|
||||
|
||||
@@ -71,9 +71,9 @@ dependencies = [
|
||||
"cmake>=3.29.0.1,<4.2.0",
|
||||
"packaging>=24.2,<26.0",
|
||||
|
||||
"torch>=2.2.1,<2.11.0",
|
||||
"torchcodec>=0.2.1,<0.11.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')",
|
||||
"torchvision>=0.21.0,<0.26.0",
|
||||
"torch>=2.7,<2.11.0",
|
||||
"torchcodec>=0.3.0,<0.11.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # NOTE: Windows support starts at version 0.7 (needs torch==2.8), ffmpeg>=8 support starts at version 0.8.1 (needs torch==2.9), system-wide ffmpeg support starts at version 0.10 (needs torch==2.10).
|
||||
"torchvision>=0.22.0,<0.26.0",
|
||||
|
||||
"einops>=0.8.0,<0.9.0",
|
||||
"opencv-python-headless>=4.9.0,<4.14.0",
|
||||
@@ -99,7 +99,7 @@ dependencies = [
|
||||
# Common
|
||||
pygame-dep = ["pygame>=2.5.1,<2.7.0"]
|
||||
placo-dep = ["placo>=0.9.6,<0.9.17"]
|
||||
transformers-dep = ["transformers>=5.3.0,<6.0.0"]
|
||||
transformers-dep = ["transformers==5.3.0"] # TODO(Steven): https://github.com/huggingface/lerobot/pull/3249
|
||||
grpcio-dep = ["grpcio==1.73.1", "protobuf>=6.31.1,<6.32.0"]
|
||||
can-dep = ["python-can>=4.2.0,<5.0.0"]
|
||||
peft-dep = ["peft>=0.18.0,<1.0.0"]
|
||||
@@ -145,6 +145,7 @@ wallx = [
|
||||
]
|
||||
pi = ["lerobot[transformers-dep]", "lerobot[scipy-dep]"]
|
||||
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"]
|
||||
multi_task_dit = ["lerobot[transformers-dep]"]
|
||||
groot = [
|
||||
"lerobot[transformers-dep]",
|
||||
"lerobot[peft]",
|
||||
|
||||
@@ -27,7 +27,8 @@ class DatasetConfig:
|
||||
# "dataset_index" into the returned item. The index mapping is made according to the order in which the
|
||||
# datasets are provided.
|
||||
repo_id: str
|
||||
# Root directory where the dataset will be stored (e.g. 'dataset/path'). If None, defaults to $HF_LEROBOT_HOME/repo_id.
|
||||
# Root directory for a concrete local dataset tree (e.g. 'dataset/path'). If None, local datasets are
|
||||
# looked up under $HF_LEROBOT_HOME/repo_id and Hub downloads use a revision-safe cache under $HF_LEROBOT_HOME/hub.
|
||||
root: str | None = None
|
||||
episodes: list[int] | None = None
|
||||
image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.multi_dataset import MultiLeRobotDataset
|
||||
from lerobot.datasets.sampler import EpisodeAwareSampler
|
||||
from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset
|
||||
from lerobot.datasets.transforms import ImageTransforms, ImageTransformsConfig
|
||||
|
||||
__all__ = [
|
||||
"EpisodeAwareSampler",
|
||||
"ImageTransforms",
|
||||
"ImageTransformsConfig",
|
||||
"LeRobotDataset",
|
||||
"LeRobotDatasetMetadata",
|
||||
"MultiLeRobotDataset",
|
||||
"StreamingLeRobotDataset",
|
||||
]
|
||||
@@ -13,9 +13,14 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
|
||||
from lerobot.datasets.io_utils import load_image_as_numpy
|
||||
from lerobot.utils.constants import ACTION, OBS_STATE
|
||||
|
||||
DEFAULT_QUANTILES = [0.01, 0.10, 0.50, 0.90, 0.99]
|
||||
|
||||
@@ -624,3 +629,141 @@ def aggregate_stats(stats_list: list[dict[str, dict]]) -> dict[str, dict[str, np
|
||||
aggregated_stats[key] = aggregate_feature_stats(stats_with_key)
|
||||
|
||||
return aggregated_stats
|
||||
|
||||
|
||||
def _get_valid_chunk_starts(episode_indices: np.ndarray, chunk_size: int) -> np.ndarray:
|
||||
"""Return all start indices where a chunk of ``chunk_size`` stays within one episode."""
|
||||
total = len(episode_indices)
|
||||
if total < chunk_size:
|
||||
return np.array([], dtype=np.int64)
|
||||
max_start = total - chunk_size
|
||||
starts = np.arange(max_start + 1)
|
||||
valid = episode_indices[starts] == episode_indices[starts + chunk_size - 1]
|
||||
return starts[valid]
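A small worked example of the boundary check, assuming two episodes of three and two frames: with chunk_size=2, start index 2 is rejected because that chunk would straddle the episode boundary.

import numpy as np

episode_indices = np.array([0, 0, 0, 1, 1])
_get_valid_chunk_starts(episode_indices, chunk_size=2)  # -> array([0, 1, 3])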
|
||||
|
||||
|
||||
def _compute_relative_chunk_batch(
|
||||
start_indices: np.ndarray,
|
||||
all_actions: np.ndarray,
|
||||
all_states: np.ndarray,
|
||||
chunk_size: int,
|
||||
relative_mask: np.ndarray,
|
||||
) -> np.ndarray:
|
||||
"""Vectorised relative-action computation for a batch of start indices.
|
||||
|
||||
Returns an ``(N * chunk_size, action_dim)`` float32 array.
|
||||
"""
|
||||
if len(start_indices) == 0:
|
||||
return np.empty((0, all_actions.shape[1]), dtype=np.float32)
|
||||
offsets = np.arange(chunk_size)
|
||||
frame_idx = start_indices[:, None] + offsets[None, :]
|
||||
chunks = all_actions[frame_idx].copy()
|
||||
states = all_states[start_indices]
|
||||
mask_dim = len(relative_mask)
|
||||
chunks[:, :, :mask_dim] -= states[:, None, :mask_dim] * relative_mask[None, None, :]
|
||||
return chunks.reshape(-1, all_actions.shape[1])
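A tiny worked example with made-up numbers: two chunks of two frames each, where only the first action dimension is converted to relative (mask = [1, 0]).

import numpy as np

all_actions = np.array([[1.0, 5.0], [2.0, 6.0], [3.0, 7.0], [4.0, 8.0]], dtype=np.float32)
all_states = np.array([[0.5, 0.0], [1.5, 0.0], [2.5, 0.0], [3.5, 0.0]], dtype=np.float32)
starts = np.array([0, 2])
mask = np.array([1.0, 0.0], dtype=np.float32)
_compute_relative_chunk_batch(starts, all_actions, all_states, chunk_size=2, relative_mask=mask)
# -> [[0.5, 5.0], [1.5, 6.0], [0.5, 7.0], [1.5, 8.0]]  (shape (4, 2))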
|
||||
|
||||
|
||||
def compute_relative_action_stats(
|
||||
hf_dataset,
|
||||
features: dict,
|
||||
chunk_size: int,
|
||||
exclude_joints: list[str] | None = None,
|
||||
num_workers: int = 0,
|
||||
) -> dict[str, np.ndarray]:
|
||||
"""Compute normalization statistics for relative actions over the full dataset.
|
||||
|
||||
Iterates *all* valid action chunks (within single episodes), converts them to
|
||||
relative actions (action − current_state), and computes per-dimension
|
||||
statistics suitable for normalization.
|
||||
|
||||
Args:
|
||||
hf_dataset: The underlying HuggingFace dataset with "action",
|
||||
"observation.state", and "episode_index" columns.
|
||||
features: Dataset feature metadata (must contain "action" with "shape"
|
||||
and optionally "names").
|
||||
chunk_size: Number of consecutive frames per action chunk.
|
||||
exclude_joints: Joint names whose dimensions should remain absolute
|
||||
(not converted to relative actions).
|
||||
num_workers: Number of parallel threads for computation. Values ≤1
|
||||
mean single-threaded. NumPy releases the GIL, so threads give
|
||||
real parallelism here.
|
||||
|
||||
Returns:
|
||||
Statistics dict with keys "mean", "std", "min", "max", "q01", …, "q99".
|
||||
|
||||
Raises:
|
||||
ValueError: If the dataset has fewer frames than ``chunk_size``.
|
||||
RuntimeError: If no valid (single-episode) chunks are found.
|
||||
"""
|
||||
from lerobot.processor.relative_action_processor import RelativeActionsProcessorStep
|
||||
|
||||
if exclude_joints is None:
|
||||
exclude_joints = []
|
||||
|
||||
action_dim = features[ACTION]["shape"][0]
|
||||
action_names = features.get(ACTION, {}).get("names")
|
||||
mask_step = RelativeActionsProcessorStep(
|
||||
enabled=True,
|
||||
exclude_joints=exclude_joints,
|
||||
action_names=action_names,
|
||||
)
|
||||
relative_mask = np.array(mask_step._build_mask(action_dim), dtype=np.float32)
|
||||
|
||||
logging.info("Loading action/state data for relative action stats...")
|
||||
all_actions = np.array(hf_dataset[ACTION], dtype=np.float32)
|
||||
all_states = np.array(hf_dataset[OBS_STATE], dtype=np.float32)
|
||||
episode_indices = np.array(hf_dataset["episode_index"])
|
||||
|
||||
valid_starts = _get_valid_chunk_starts(episode_indices, chunk_size)
|
||||
if len(valid_starts) == 0:
|
||||
raise RuntimeError(
|
||||
f"No valid chunks found (total_frames={len(episode_indices)}, chunk_size={chunk_size})"
|
||||
)
|
||||
|
||||
effective_workers = max(num_workers, 1)
|
||||
logging.info(
|
||||
f"Computing relative action stats from {len(valid_starts)} chunks "
|
||||
f"(chunk_size={chunk_size}, workers={effective_workers})"
|
||||
)
|
||||
|
||||
batch_size = 50_000
|
||||
batches = [valid_starts[i : i + batch_size] for i in range(0, len(valid_starts), batch_size)]
|
||||
|
||||
running_stats = RunningQuantileStats()
|
||||
|
||||
if num_workers > 1:
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
with ThreadPoolExecutor(max_workers=num_workers) as pool:
|
||||
futures = [
|
||||
pool.submit(
|
||||
_compute_relative_chunk_batch,
|
||||
batch,
|
||||
all_actions,
|
||||
all_states,
|
||||
chunk_size,
|
||||
relative_mask,
|
||||
)
|
||||
for batch in batches
|
||||
]
|
||||
for future in as_completed(futures):
|
||||
running_stats.update(future.result())
|
||||
else:
|
||||
for batch in batches:
|
||||
running_stats.update(
|
||||
_compute_relative_chunk_batch(batch, all_actions, all_states, chunk_size, relative_mask)
|
||||
)
|
||||
|
||||
stats = running_stats.get_statistics()
|
||||
|
||||
excluded_dims = int(len(relative_mask) - relative_mask.sum())
|
||||
total_frames = len(valid_starts) * chunk_size
|
||||
logging.info(
|
||||
f"Relative action stats ({len(valid_starts)} chunks, {total_frames} frames): "
|
||||
f"relative_dims={int(relative_mask.sum())}/{len(relative_mask)} (excluded={excluded_dims}), "
|
||||
f"mean={np.abs(stats['mean']).mean():.4f}, std={stats['std'].mean():.4f}, "
|
||||
f"q01={stats['q01'].mean():.4f}, q99={stats['q99'].mean():.4f}"
|
||||
)
|
||||
|
||||
return stats
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import contextlib
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
@@ -43,16 +44,24 @@ from lerobot.datasets.utils import (
|
||||
check_version_compatibility,
|
||||
flatten_dict,
|
||||
get_safe_version,
|
||||
has_legacy_hub_download_metadata,
|
||||
is_valid_version,
|
||||
update_chunk_file_indices,
|
||||
)
|
||||
from lerobot.datasets.video_utils import get_video_info
|
||||
from lerobot.utils.constants import HF_LEROBOT_HOME
|
||||
from lerobot.utils.constants import HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE
|
||||
|
||||
CODEBASE_VERSION = "v3.0"
|
||||
|
||||
|
||||
class LeRobotDatasetMetadata:
|
||||
"""Metadata container for a LeRobot dataset.
|
||||
|
||||
Manages the ``info.json``, ``stats.json``, ``tasks.parquet``, and
|
||||
``episodes/`` parquet files that describe a dataset's structure, content,
|
||||
and statistics.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
repo_id: str,
|
||||
@@ -61,33 +70,57 @@ class LeRobotDatasetMetadata:
|
||||
force_cache_sync: bool = False,
|
||||
metadata_buffer_size: int = 10,
|
||||
):
|
||||
"""Load or download metadata for an existing LeRobot dataset.
|
||||
|
||||
Attempts to load metadata from local disk. If files are missing or
|
||||
``force_cache_sync`` is ``True``, downloads the ``meta/`` directory from
|
||||
the Hub.
|
||||
|
||||
Args:
|
||||
repo_id: Repository identifier (e.g. ``'lerobot/aloha_sim'``).
|
||||
root: Local directory for the dataset. When provided, Hub downloads
|
||||
are materialized directly into this directory. When omitted,
|
||||
existing local datasets are still looked up under
|
||||
``$HF_LEROBOT_HOME/{repo_id}``, but Hub downloads use a
|
||||
revision-safe snapshot cache under
|
||||
``$HF_LEROBOT_HOME/hub``.
|
||||
revision: Git revision (branch, tag, or commit hash). Defaults to
|
||||
the current codebase version.
|
||||
force_cache_sync: If ``True``, re-download metadata from the Hub
|
||||
even when local files exist.
|
||||
metadata_buffer_size: Number of episode metadata records to buffer
|
||||
in memory before flushing to parquet.
|
||||
"""
|
||||
self.repo_id = repo_id
|
||||
self.revision = revision if revision else CODEBASE_VERSION
|
||||
self.root = Path(root) if root is not None else HF_LEROBOT_HOME / repo_id
|
||||
self.writer = None
|
||||
self._requested_root = Path(root) if root is not None else None
|
||||
self.root = self._requested_root if self._requested_root is not None else HF_LEROBOT_HOME / repo_id
|
||||
self._pq_writer = None
|
||||
self.latest_episode = None
|
||||
self.metadata_buffer: list[dict] = []
|
||||
self.metadata_buffer_size = metadata_buffer_size
|
||||
self._metadata_buffer: list[dict] = []
|
||||
self._metadata_buffer_size = metadata_buffer_size
|
||||
self._finalized = False
|
||||
|
||||
try:
|
||||
if force_cache_sync:
|
||||
if force_cache_sync or (
|
||||
self._requested_root is None and has_legacy_hub_download_metadata(self.root)
|
||||
):
|
||||
raise FileNotFoundError
|
||||
self.load_metadata()
|
||||
self._load_metadata()
|
||||
except (FileNotFoundError, NotADirectoryError):
|
||||
if is_valid_version(self.revision):
|
||||
self.revision = get_safe_version(self.repo_id, self.revision)
|
||||
|
||||
(self.root / "meta").mkdir(exist_ok=True, parents=True)
|
||||
self.pull_from_repo(allow_patterns="meta/")
|
||||
self.load_metadata()
|
||||
self._pull_from_repo(allow_patterns="meta/")
|
||||
self._load_metadata()
|
||||
|
||||
def _flush_metadata_buffer(self) -> None:
|
||||
"""Write all buffered episode metadata to parquet file."""
|
||||
if not hasattr(self, "metadata_buffer") or len(self.metadata_buffer) == 0:
|
||||
if not hasattr(self, "_metadata_buffer") or len(self._metadata_buffer) == 0:
|
||||
return
|
||||
|
||||
combined_dict = {}
|
||||
for episode_dict in self.metadata_buffer:
|
||||
for episode_dict in self._metadata_buffer:
|
||||
for key, value in episode_dict.items():
|
||||
if key not in combined_dict:
|
||||
combined_dict[key] = []
|
||||
@@ -96,40 +129,50 @@ class LeRobotDatasetMetadata:
|
||||
val = value[0] if isinstance(value, list) else value
|
||||
combined_dict[key].append(val.tolist() if isinstance(val, np.ndarray) else val)
|
||||
|
||||
first_ep = self.metadata_buffer[0]
|
||||
first_ep = self._metadata_buffer[0]
|
||||
chunk_idx = first_ep["meta/episodes/chunk_index"][0]
|
||||
file_idx = first_ep["meta/episodes/file_index"][0]
|
||||
|
||||
table = pa.Table.from_pydict(combined_dict)
|
||||
|
||||
if not self.writer:
|
||||
if not self._pq_writer:
|
||||
path = Path(self.root / DEFAULT_EPISODES_PATH.format(chunk_index=chunk_idx, file_index=file_idx))
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
self.writer = pq.ParquetWriter(
|
||||
self._pq_writer = pq.ParquetWriter(
|
||||
path, schema=table.schema, compression="snappy", use_dictionary=True
|
||||
)
|
||||
|
||||
self.writer.write_table(table)
|
||||
self._pq_writer.write_table(table)
|
||||
|
||||
self.latest_episode = self.metadata_buffer[-1]
|
||||
self.metadata_buffer.clear()
|
||||
self.latest_episode = self._metadata_buffer[-1]
|
||||
self._metadata_buffer.clear()
|
||||
|
||||
def _close_writer(self) -> None:
|
||||
"""Close and cleanup the parquet writer if it exists."""
|
||||
self._flush_metadata_buffer()
|
||||
|
||||
writer = getattr(self, "writer", None)
|
||||
writer = getattr(self, "_pq_writer", None)
|
||||
if writer is not None:
|
||||
writer.close()
|
||||
self.writer = None
|
||||
self._pq_writer = None
|
||||
|
||||
def finalize(self) -> None:
|
||||
"""Flush metadata buffer and close the parquet writer.
|
||||
|
||||
Idempotent — safe to call multiple times.
|
||||
"""
|
||||
if getattr(self, "_finalized", False):
|
||||
return
|
||||
self._close_writer()
|
||||
self._finalized = True
|
||||
|
||||
def __del__(self):
|
||||
"""
|
||||
Trust the user to call .finalize(), but as an added safety check, close the parquet writer when the destructor runs
|
||||
"""
|
||||
self._close_writer()
|
||||
"""Safety net: flush and close parquet writer on garbage collection."""
|
||||
# During interpreter shutdown, referenced objects may already be collected.
|
||||
with contextlib.suppress(Exception):
|
||||
self.finalize()
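A sketch of the intended contract, assuming `meta` is a LeRobotDatasetMetadata instance that has buffered episode metadata (the dataset-editing code later in this diff does the same with dst_meta.finalize()):

meta.finalize()  # flushes the metadata buffer and closes the parquet writer
meta.finalize()  # harmless: finalize() is idempotent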
|
||||
|
||||
def load_metadata(self):
|
||||
def _load_metadata(self):
|
||||
self.info = load_info(self.root)
|
||||
check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
|
||||
self.tasks = load_tasks(self.root)
|
||||
@@ -137,22 +180,38 @@ class LeRobotDatasetMetadata:
|
||||
self.episodes = load_episodes(self.root)
|
||||
self.stats = load_stats(self.root)
|
||||
|
||||
def pull_from_repo(
|
||||
def _pull_from_repo(
|
||||
self,
|
||||
allow_patterns: list[str] | str | None = None,
|
||||
ignore_patterns: list[str] | str | None = None,
|
||||
) -> None:
|
||||
if self._requested_root is None:
|
||||
self.root = Path(
|
||||
snapshot_download(
|
||||
self.repo_id,
|
||||
repo_type="dataset",
|
||||
revision=self.revision,
|
||||
cache_dir=HF_LEROBOT_HUB_CACHE,
|
||||
allow_patterns=allow_patterns,
|
||||
ignore_patterns=ignore_patterns,
|
||||
)
|
||||
)
|
||||
return
|
||||
|
||||
self._requested_root.mkdir(exist_ok=True, parents=True)
|
||||
snapshot_download(
|
||||
self.repo_id,
|
||||
repo_type="dataset",
|
||||
revision=self.revision,
|
||||
local_dir=self.root,
|
||||
local_dir=self._requested_root,
|
||||
allow_patterns=allow_patterns,
|
||||
ignore_patterns=ignore_patterns,
|
||||
)
|
||||
self.root = self._requested_root
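A sketch of the two download modes described in the constructor docstring ('lerobot/pusht' is only an example repo id):

meta_cached = LeRobotDatasetMetadata("lerobot/pusht")
# no root given: Hub files go to the revision-safe cache under $HF_LEROBOT_HOME/hub

meta_local = LeRobotDatasetMetadata("lerobot/pusht", root="./datasets/pusht")
# explicit root: Hub files are materialized directly into ./datasets/pusht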
|
||||
|
||||
@property
|
||||
def url_root(self) -> str:
|
||||
"""Hugging Face Hub URL root for this dataset."""
|
||||
return f"hf://datasets/{self.repo_id}"
|
||||
|
||||
@property
|
||||
@@ -161,6 +220,17 @@ class LeRobotDatasetMetadata:
|
||||
return packaging.version.parse(self.info["codebase_version"])
|
||||
|
||||
def get_data_file_path(self, ep_index: int) -> Path:
|
||||
"""Return the relative parquet file path for the given episode index.
|
||||
|
||||
Args:
|
||||
ep_index: Zero-based episode index.
|
||||
|
||||
Returns:
|
||||
Path to the parquet file containing this episode's data.
|
||||
|
||||
Raises:
|
||||
IndexError: If ``ep_index`` is out of range.
|
||||
"""
|
||||
if self.episodes is None:
|
||||
self.episodes = load_episodes(self.root)
|
||||
if ep_index >= len(self.episodes):
|
||||
@@ -174,6 +244,19 @@ class LeRobotDatasetMetadata:
|
||||
return Path(fpath)
|
||||
|
||||
def get_video_file_path(self, ep_index: int, vid_key: str) -> Path:
|
||||
"""Return the relative video file path for the given episode and video key.
|
||||
|
||||
Args:
|
||||
ep_index: Zero-based episode index.
|
||||
vid_key: Feature key identifying the video stream
|
||||
(e.g. ``'observation.images.laptop'``).
|
||||
|
||||
Returns:
|
||||
Path to the video file containing this episode's frames.
|
||||
|
||||
Raises:
|
||||
IndexError: If ``ep_index`` is out of range.
|
||||
"""
|
||||
if self.episodes is None:
|
||||
self.episodes = load_episodes(self.root)
|
||||
if ep_index >= len(self.episodes):
|
||||
@@ -277,6 +360,17 @@ class LeRobotDatasetMetadata:
|
||||
return None
|
||||
|
||||
def save_episode_tasks(self, tasks: list[str]):
|
||||
"""Register tasks for the current episode and persist to disk.
|
||||
|
||||
New tasks that do not already exist in the dataset are assigned
|
||||
sequential task indices and appended to the tasks parquet file.
|
||||
|
||||
Args:
|
||||
tasks: List of unique task descriptions in natural language.
|
||||
|
||||
Raises:
|
||||
ValueError: If ``tasks`` contains duplicates.
|
||||
"""
|
||||
if len(set(tasks)) != len(tasks):
|
||||
raise ValueError(f"Tasks are not unique: {tasks}")
|
||||
|
||||
@@ -336,8 +430,8 @@ class LeRobotDatasetMetadata:
|
||||
|
||||
latest_path = (
|
||||
self.root / DEFAULT_EPISODES_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
|
||||
if self.writer is None
|
||||
else self.writer.where
|
||||
if self._pq_writer is None
|
||||
else self._pq_writer.where
|
||||
)
|
||||
|
||||
if Path(latest_path).exists():
|
||||
@@ -359,10 +453,10 @@ class LeRobotDatasetMetadata:
|
||||
episode_dict["dataset_to_index"] = [self.latest_episode["dataset_to_index"][0] + num_frames]
|
||||
|
||||
# Add to buffer
|
||||
self.metadata_buffer.append(episode_dict)
|
||||
self._metadata_buffer.append(episode_dict)
|
||||
self.latest_episode = episode_dict
|
||||
|
||||
if len(self.metadata_buffer) >= self.metadata_buffer_size:
|
||||
if len(self._metadata_buffer) >= self._metadata_buffer_size:
|
||||
self._flush_metadata_buffer()
|
||||
|
||||
def save_episode(
|
||||
@@ -373,6 +467,20 @@ class LeRobotDatasetMetadata:
|
||||
episode_stats: dict[str, dict],
|
||||
episode_metadata: dict,
|
||||
) -> None:
|
||||
"""Persist episode metadata, update dataset info, and aggregate stats.
|
||||
|
||||
Writes the episode's metadata to the buffered parquet writer, increments
|
||||
the total episode/frame counters in ``info.json``, and merges the
|
||||
episode's statistics into the running dataset statistics.
|
||||
|
||||
Args:
|
||||
episode_index: Zero-based index of the episode being saved.
|
||||
episode_length: Number of frames in this episode.
|
||||
episode_tasks: List of task descriptions for this episode.
|
||||
episode_stats: Per-feature statistics for this episode.
|
||||
episode_metadata: Additional metadata (chunk/file indices, frame
|
||||
ranges, video timestamps, etc.).
|
||||
"""
|
||||
episode_dict = {
|
||||
"episode_index": episode_index,
|
||||
"tasks": episode_tasks,
|
||||
@@ -479,10 +587,36 @@ class LeRobotDatasetMetadata:
|
||||
data_files_size_in_mb: int | None = None,
|
||||
video_files_size_in_mb: int | None = None,
|
||||
) -> "LeRobotDatasetMetadata":
|
||||
"""Creates metadata for a LeRobotDataset."""
|
||||
"""Create metadata for a new LeRobot dataset from scratch.
|
||||
|
||||
Initializes the ``info.json`` file on disk with the provided feature
|
||||
schema and dataset settings. No episode data is written yet.
|
||||
|
||||
Args:
|
||||
repo_id: Repository identifier (e.g. ``'user/my_dataset'``).
|
||||
fps: Frames per second used during data collection.
|
||||
features: Feature specification dict mapping feature names to their
|
||||
type/shape metadata.
|
||||
robot_type: Optional robot type string stored in metadata.
|
||||
root: Local directory for the dataset. Defaults to
|
||||
``$HF_LEROBOT_HOME/{repo_id}``. Must not already exist.
|
||||
use_videos: If ``True``, visual modalities are encoded as MP4 videos.
|
||||
metadata_buffer_size: Number of episode metadata records to buffer
|
||||
before flushing to parquet.
|
||||
chunks_size: Max number of files per chunk directory. ``None`` uses
|
||||
the default.
|
||||
data_files_size_in_mb: Max parquet file size in MB. ``None`` uses the
|
||||
default.
|
||||
video_files_size_in_mb: Max video file size in MB. ``None`` uses the
|
||||
default.
|
||||
|
||||
Returns:
|
||||
A new :class:`LeRobotDatasetMetadata` instance.
|
||||
"""
|
||||
obj = cls.__new__(cls)
|
||||
obj.repo_id = repo_id
|
||||
obj.root = Path(root) if root is not None else HF_LEROBOT_HOME / repo_id
|
||||
obj._requested_root = Path(root) if root is not None else None
|
||||
obj.root = obj._requested_root if obj._requested_root is not None else HF_LEROBOT_HOME / repo_id
|
||||
|
||||
obj.root.mkdir(parents=True, exist_ok=False)
|
||||
|
||||
@@ -510,8 +644,9 @@ class LeRobotDatasetMetadata:
|
||||
)
|
||||
write_json(obj.info, obj.root / INFO_PATH)
|
||||
obj.revision = None
|
||||
obj.writer = None
|
||||
obj._pq_writer = None
|
||||
obj.latest_episode = None
|
||||
obj.metadata_buffer = []
|
||||
obj.metadata_buffer_size = metadata_buffer_size
|
||||
obj._metadata_buffer = []
|
||||
obj._metadata_buffer_size = metadata_buffer_size
|
||||
obj._finalized = False
|
||||
return obj
|
||||
|
||||
@@ -0,0 +1,288 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Private reader component for LeRobotDataset. Handles random-access reading (HF dataset, delta indices, video decoding)."""
|
||||
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
import torch
|
||||
|
||||
from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
|
||||
from lerobot.datasets.feature_utils import (
|
||||
check_delta_timestamps,
|
||||
get_delta_indices,
|
||||
get_hf_features_from_features,
|
||||
)
|
||||
from lerobot.datasets.io_utils import (
|
||||
hf_transform_to_torch,
|
||||
load_nested_dataset,
|
||||
)
|
||||
from lerobot.datasets.video_utils import decode_video_frames
|
||||
|
||||
|
||||
class DatasetReader:
|
||||
"""Encapsulates read-side state and methods for LeRobotDataset.
|
||||
|
||||
Owns: hf_dataset, _absolute_to_relative_idx, delta_indices.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
meta: LeRobotDatasetMetadata,
|
||||
root: Path,
|
||||
episodes: list[int] | None,
|
||||
tolerance_s: float,
|
||||
video_backend: str,
|
||||
delta_timestamps: dict[str, list[float]] | None,
|
||||
image_transforms: Callable | None,
|
||||
):
|
||||
"""Initialize the reader with metadata, filtering, and transform config.
|
||||
|
||||
The HF dataset is not loaded here — call :meth:`try_load` or
|
||||
:meth:`load_and_activate` afterward.
|
||||
|
||||
Args:
|
||||
meta: Dataset metadata instance.
|
||||
root: Local dataset root directory.
|
||||
episodes: Optional list of episode indices to select. ``None``
|
||||
means all episodes.
|
||||
tolerance_s: Timestamp synchronization tolerance in seconds.
|
||||
video_backend: Video decoding backend identifier.
|
||||
delta_timestamps: Optional dict mapping feature keys to lists of
|
||||
relative timestamp offsets for temporal context windows.
|
||||
image_transforms: Optional torchvision v2 transform applied to
|
||||
visual features.
|
||||
"""
|
||||
self._meta = meta
|
||||
self.root = root
|
||||
self.episodes = episodes
|
||||
self._tolerance_s = tolerance_s
|
||||
self._video_backend = video_backend
|
||||
self._image_transforms = image_transforms
|
||||
|
||||
self.hf_dataset: datasets.Dataset | None = None
|
||||
self._absolute_to_relative_idx: dict[int, int] | None = None
|
||||
|
||||
# Setup delta_indices (doesn't depend on hf_dataset)
|
||||
self.delta_indices = None
|
||||
if delta_timestamps is not None:
|
||||
check_delta_timestamps(delta_timestamps, meta.fps, tolerance_s)
|
||||
self.delta_indices = get_delta_indices(delta_timestamps, meta.fps)
|
||||
|
||||
def try_load(self) -> bool:
|
||||
"""Attempt to load from local cache. Returns True if data is sufficient."""
|
||||
try:
|
||||
self.hf_dataset = self._load_hf_dataset()
|
||||
except (FileNotFoundError, NotADirectoryError):
|
||||
self.hf_dataset = None
|
||||
return False
|
||||
if not self._check_cached_episodes_sufficient():
|
||||
self.hf_dataset = None
|
||||
return False
|
||||
self._build_index_mapping()
|
||||
return True
|
||||
|
||||
def load_and_activate(self) -> None:
|
||||
"""Load HF dataset from disk and build index mapping. Call after data is on disk."""
|
||||
self.hf_dataset = self._load_hf_dataset()
|
||||
self._build_index_mapping()
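A sketch of the intended call order, assuming `meta` and `root` were prepared by the owning LeRobotDataset; `download_episodes` is a hypothetical stand-in for whatever fetches the missing files onto disk, and the backend string is an assumption.

reader = DatasetReader(
    meta=meta,
    root=root,
    episodes=[0, 1],
    tolerance_s=1e-4,
    video_backend="torchcodec",
    delta_timestamps=None,
    image_transforms=None,
)
if not reader.try_load():  # local cache missing or incomplete
    download_episodes(reader.get_episodes_file_paths())
    reader.load_and_activate()
item = reader.get_item(0)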
|
||||
|
||||
def _build_index_mapping(self) -> None:
|
||||
"""Build absolute-to-relative index mapping from loaded hf_dataset."""
|
||||
self._absolute_to_relative_idx = None
|
||||
if self.episodes is not None and self.hf_dataset is not None:
|
||||
self._absolute_to_relative_idx = {
|
||||
abs_idx.item() if isinstance(abs_idx, torch.Tensor) else abs_idx: rel_idx
|
||||
for rel_idx, abs_idx in enumerate(self.hf_dataset["index"])
|
||||
}
|
||||
|
||||
@property
|
||||
def num_frames(self) -> int:
|
||||
"""Number of frames in selected episodes."""
|
||||
if self.episodes is not None and self.hf_dataset is not None:
|
||||
return len(self.hf_dataset)
|
||||
return self._meta.total_frames
|
||||
|
||||
@property
|
||||
def num_episodes(self) -> int:
|
||||
"""Number of episodes selected."""
|
||||
return len(self.episodes) if self.episodes is not None else self._meta.total_episodes
|
||||
|
||||
def _load_hf_dataset(self) -> datasets.Dataset:
|
||||
"""hf_dataset contains all the observations, states, actions, rewards, etc."""
|
||||
features = get_hf_features_from_features(self._meta.features)
|
||||
hf_dataset = load_nested_dataset(self.root / "data", features=features, episodes=self.episodes)
|
||||
hf_dataset.set_transform(hf_transform_to_torch)
|
||||
return hf_dataset
|
||||
|
||||
def _check_cached_episodes_sufficient(self) -> bool:
|
||||
"""Check if the cached dataset contains all requested episodes and their video files."""
|
||||
if self.hf_dataset is None or len(self.hf_dataset) == 0:
|
||||
return False
|
||||
|
||||
available_episodes = {
|
||||
ep_idx.item() if isinstance(ep_idx, torch.Tensor) else ep_idx
|
||||
for ep_idx in self.hf_dataset.unique("episode_index")
|
||||
}
|
||||
|
||||
if self.episodes is None:
|
||||
requested_episodes = set(range(self._meta.total_episodes))
|
||||
else:
|
||||
requested_episodes = set(self.episodes)
|
||||
|
||||
if not requested_episodes.issubset(available_episodes):
|
||||
return False
|
||||
|
||||
if len(self._meta.video_keys) > 0:
|
||||
for ep_idx in requested_episodes:
|
||||
for vid_key in self._meta.video_keys:
|
||||
video_path = self.root / self._meta.get_video_file_path(ep_idx, vid_key)
|
||||
if not video_path.exists():
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def get_episodes_file_paths(self) -> list[Path]:
|
||||
"""Return deduplicated file paths (data + video) for selected episodes.
|
||||
|
||||
Used to build the ``allow_patterns`` list for ``snapshot_download``.
|
||||
"""
|
||||
episodes = self.episodes if self.episodes is not None else list(range(self._meta.total_episodes))
|
||||
fpaths = [str(self._meta.get_data_file_path(ep_idx)) for ep_idx in episodes]
|
||||
if len(self._meta.video_keys) > 0:
|
||||
video_files = [
|
||||
str(self._meta.get_video_file_path(ep_idx, vid_key))
|
||||
for vid_key in self._meta.video_keys
|
||||
for ep_idx in episodes
|
||||
]
|
||||
fpaths += video_files
|
||||
# episodes are stored in the same files, so we return unique paths only
|
||||
fpaths = list(set(fpaths))
|
||||
return fpaths
|
||||
|
||||
def _get_query_indices(
|
||||
self, abs_idx: int, ep_idx: int
|
||||
) -> tuple[dict[str, list[int]], dict[str, torch.Tensor]]:
|
||||
"""Compute query indices for delta timestamps."""
|
||||
ep = self._meta.episodes[ep_idx]
|
||||
ep_start = ep["dataset_from_index"]
|
||||
ep_end = ep["dataset_to_index"]
|
||||
query_indices = {
|
||||
key: [max(ep_start, min(ep_end - 1, abs_idx + delta)) for delta in delta_idx]
|
||||
for key, delta_idx in self.delta_indices.items()
|
||||
}
|
||||
padding = {
|
||||
f"{key}_is_pad": torch.BoolTensor(
|
||||
[(abs_idx + delta < ep_start) | (abs_idx + delta >= ep_end) for delta in delta_idx]
|
||||
)
|
||||
for key, delta_idx in self.delta_indices.items()
|
||||
}
|
||||
return query_indices, padding
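A worked example of the clamping and padding logic for one key, using made-up numbers (an episode spanning absolute frames 10 to 19, queried at its first frame):

ep_start, ep_end, abs_idx, deltas = 10, 20, 10, [-2, -1, 0]
indices = [max(ep_start, min(ep_end - 1, abs_idx + d)) for d in deltas]          # [10, 10, 10]
is_pad = [(abs_idx + d < ep_start) or (abs_idx + d >= ep_end) for d in deltas]   # [True, True, False]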
|
||||
|
||||
def _get_query_timestamps(
|
||||
self,
|
||||
current_ts: float,
|
||||
query_indices: dict[str, list[int]] | None = None,
|
||||
) -> dict[str, list[float]]:
|
||||
query_timestamps = {}
|
||||
for key in self._meta.video_keys:
|
||||
if query_indices is not None and key in query_indices:
|
||||
if self._absolute_to_relative_idx is not None:
|
||||
relative_indices = [self._absolute_to_relative_idx[idx] for idx in query_indices[key]]
|
||||
timestamps = self.hf_dataset[relative_indices]["timestamp"]
|
||||
else:
|
||||
timestamps = self.hf_dataset[query_indices[key]]["timestamp"]
|
||||
query_timestamps[key] = torch.stack(timestamps).tolist()
|
||||
else:
|
||||
query_timestamps[key] = [current_ts]
|
||||
|
||||
return query_timestamps
|
||||
|
||||
def _query_hf_dataset(self, query_indices: dict[str, list[int]]) -> dict:
|
||||
"""Query dataset for indices across keys, skipping video keys."""
|
||||
result: dict = {}
|
||||
for key, q_idx in query_indices.items():
|
||||
if key in self._meta.video_keys:
|
||||
continue
|
||||
relative_indices = (
|
||||
q_idx
|
||||
if self._absolute_to_relative_idx is None
|
||||
else [self._absolute_to_relative_idx[idx] for idx in q_idx]
|
||||
)
|
||||
try:
|
||||
result[key] = torch.stack(self.hf_dataset[key][relative_indices])
|
||||
except (KeyError, TypeError, IndexError):
|
||||
result[key] = torch.stack(self.hf_dataset[relative_indices][key])
|
||||
return result
|
||||
|
||||
def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict[str, torch.Tensor]:
|
||||
"""Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function
|
||||
in the main process (e.g. by using a second DataLoader with num_workers=0). It will result in a
|
||||
segmentation fault.
|
||||
"""
|
||||
ep = self._meta.episodes[ep_idx]
|
||||
item = {}
|
||||
for vid_key, query_ts in query_timestamps.items():
|
||||
from_timestamp = ep[f"videos/{vid_key}/from_timestamp"]
|
||||
shifted_query_ts = [from_timestamp + ts for ts in query_ts]
|
||||
|
||||
video_path = self.root / self._meta.get_video_file_path(ep_idx, vid_key)
|
||||
frames = decode_video_frames(video_path, shifted_query_ts, self._tolerance_s, self._video_backend)
|
||||
item[vid_key] = frames.squeeze(0)
|
||||
|
||||
return item
|
||||
|
||||
def get_item(self, idx) -> dict:
|
||||
"""Core __getitem__ logic. Assumes hf_dataset is loaded.
|
||||
|
||||
``idx`` is a *relative* index into the (possibly episode-filtered)
|
||||
HF dataset, **not** the absolute frame index stored in the ``index``
|
||||
column. The absolute index is retrieved from the row itself.
|
||||
"""
|
||||
item = self.hf_dataset[idx]
|
||||
ep_idx = item["episode_index"].item()
|
||||
abs_idx = item["index"].item()
|
||||
|
||||
query_indices = None
|
||||
if self.delta_indices is not None:
|
||||
query_indices, padding = self._get_query_indices(abs_idx, ep_idx)
|
||||
query_result = self._query_hf_dataset(query_indices)
|
||||
item = {**item, **padding}
|
||||
for key, val in query_result.items():
|
||||
item[key] = val
|
||||
|
||||
if len(self._meta.video_keys) > 0:
|
||||
current_ts = item["timestamp"].item()
|
||||
query_timestamps = self._get_query_timestamps(current_ts, query_indices)
|
||||
video_frames = self._query_videos(query_timestamps, ep_idx)
|
||||
item = {**video_frames, **item}
|
||||
|
||||
if self._image_transforms is not None:
|
||||
image_keys = self._meta.camera_keys
|
||||
for cam in image_keys:
|
||||
item[cam] = self._image_transforms(item[cam])
|
||||
|
||||
# Add task as a string
|
||||
task_idx = item["task_index"].item()
|
||||
item["task"] = self._meta.tasks.iloc[task_idx].name
|
||||
|
||||
# add subtask information if available
|
||||
if "subtask_index" in self._meta.features and self._meta.subtasks is not None:
|
||||
subtask_idx = item["subtask_index"].item()
|
||||
item["subtask"] = self._meta.subtasks.iloc[subtask_idx].name
|
||||
|
||||
return item
|
||||
@@ -37,7 +37,11 @@ import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from lerobot.datasets.aggregate import aggregate_datasets
|
||||
from lerobot.datasets.compute_stats import aggregate_stats
|
||||
from lerobot.datasets.compute_stats import (
|
||||
aggregate_stats,
|
||||
compute_episode_stats,
|
||||
compute_relative_action_stats,
|
||||
)
|
||||
from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
|
||||
from lerobot.datasets.io_utils import (
|
||||
get_parquet_file_size_in_mb,
|
||||
@@ -56,7 +60,7 @@ from lerobot.datasets.utils import (
|
||||
update_chunk_file_indices,
|
||||
)
|
||||
from lerobot.datasets.video_utils import encode_video_frames, get_video_info
|
||||
from lerobot.utils.constants import HF_LEROBOT_HOME, OBS_IMAGE
|
||||
from lerobot.utils.constants import ACTION, HF_LEROBOT_HOME, OBS_IMAGE, OBS_STATE
|
||||
|
||||
|
||||
def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict:
|
||||
@@ -891,7 +895,7 @@ def _copy_and_reindex_episodes_metadata(
|
||||
|
||||
total_frames += src_episode["length"]
|
||||
|
||||
dst_meta._close_writer()
|
||||
dst_meta.finalize()
|
||||
|
||||
dst_meta.info.update(
|
||||
{
|
||||
@@ -1533,6 +1537,114 @@ def modify_tasks(
|
||||
return dataset
|
||||
|
||||
|
||||
def recompute_stats(
    dataset: LeRobotDataset,
    skip_image_video: bool = True,
    relative_action: bool = False,
    relative_exclude_joints: list[str] | None = None,
    chunk_size: int = 50,
    num_workers: int = 0,
) -> LeRobotDataset:
    """Recompute stats.json from scratch by iterating all episodes.

    Args:
        dataset: The LeRobotDataset to recompute stats for.
        skip_image_video: If True (default), only recompute stats for numeric features
            (action, state, etc.) and keep existing image/video stats unchanged.
        relative_action: If True, compute action stats in relative space by
            iterating all valid action chunks and subtracting the current state.
            This matches the normalization distribution the model sees during
            training with ``use_relative_actions=True``.
        relative_exclude_joints: Joint names to exclude from relative conversion when
            relative_action=True. These dimensions keep their absolute stats.
        chunk_size: Action chunk size used for relative stats computation. Should match
            ``policy.chunk_size``. Only used when ``relative_action=True``.
        num_workers: Number of parallel threads for relative action stats computation.
            Values ≤ 1 mean single-threaded. Only used when ``relative_action=True``.

    Returns:
        The same dataset with updated stats.
    """
    features = dataset.meta.features
    meta_keys = {"index", "episode_index", "task_index", "frame_index", "timestamp"}
    numeric_features = {
        k: v
        for k, v in features.items()
        if v["dtype"] not in ["image", "video", "string"] and k not in meta_keys
    }

    if skip_image_video:
        features_to_compute = numeric_features
    else:
        features_to_compute = {
            k: v for k, v in features.items() if v["dtype"] != "string" and k not in meta_keys
        }

    # When relative_action is enabled, compute action stats via chunk-based sampling
    # (matching what the model sees during training) and skip action in the
    # per-episode pass below.
    relative_action_stats = None
    if relative_action and ACTION in features and OBS_STATE in features:
        if relative_exclude_joints is None:
            relative_exclude_joints = ["gripper"]
        relative_action_stats = compute_relative_action_stats(
            hf_dataset=dataset.hf_dataset,
            features=features,
            chunk_size=chunk_size,
            exclude_joints=relative_exclude_joints,
            num_workers=num_workers,
        )
        features_to_compute.pop(ACTION, None)

    logging.info(f"Recomputing stats for features: {list(features_to_compute.keys())}")

    data_dir = dataset.root / DATA_DIR
    parquet_files = sorted(data_dir.glob("*/*.parquet"))
    if not parquet_files:
        raise ValueError(f"No parquet files found in {data_dir}")

    all_episode_stats = []
    numeric_keys = [k for k, v in features_to_compute.items() if v["dtype"] not in ["image", "video"]]

    for parquet_path in tqdm(parquet_files, desc="Computing stats from data files"):
        df = pd.read_parquet(parquet_path)

        for ep_idx in sorted(df["episode_index"].unique()):
            ep_df = df[df["episode_index"] == ep_idx]
            episode_data = {}
            for key in numeric_keys:
                if key in ep_df.columns:
                    values = ep_df[key].values
                    if hasattr(values[0], "__len__"):
                        episode_data[key] = np.stack(values)
                    else:
                        episode_data[key] = np.array(values)

            ep_stats = compute_episode_stats(episode_data, features_to_compute)
            all_episode_stats.append(ep_stats)

    if features_to_compute and not all_episode_stats:
        logging.warning("No episode stats computed")
        return dataset

    new_stats = aggregate_stats(all_episode_stats) if all_episode_stats else {}

    if relative_action_stats is not None:
        new_stats[ACTION] = relative_action_stats

    # Merge: keep existing stats for features we didn't recompute
    if dataset.meta.stats:
        for key, value in dataset.meta.stats.items():
            if key not in new_stats:
                new_stats[key] = value

    write_stats(new_stats, dataset.root)
    dataset.meta.stats = new_stats

    logging.info("Stats recomputed successfully")
    return dataset


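# Example sketch (hypothetical repo id) of recomputing stats with relative actions;
# chunk_size should mirror the policy's chunk_size:
#
#     dataset = LeRobotDataset("my_user/my_dataset")
#     dataset = recompute_stats(
#         dataset,
#         relative_action=True,                  # action stats computed in relative space
#         relative_exclude_joints=["gripper"],   # gripper keeps absolute stats
#         chunk_size=50,
#         num_workers=4,
#     )
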
def convert_image_to_video_dataset(
|
||||
dataset: LeRobotDataset,
|
||||
output_dir: Path | None = None,
|
||||
|
||||
@@ -0,0 +1,634 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Private writer component for LeRobotDataset. Handles sequential recording (episode buffer, ParquetWriter, image writer, video encoding)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import concurrent.futures
|
||||
import contextlib
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import PIL.Image
|
||||
import pyarrow.parquet as pq
|
||||
import torch
|
||||
|
||||
from lerobot.datasets.compute_stats import compute_episode_stats
|
||||
from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
|
||||
from lerobot.datasets.feature_utils import (
|
||||
get_hf_features_from_features,
|
||||
validate_episode_buffer,
|
||||
validate_frame,
|
||||
)
|
||||
from lerobot.datasets.image_writer import AsyncImageWriter, write_image
|
||||
from lerobot.datasets.io_utils import (
|
||||
embed_images,
|
||||
get_file_size_in_mb,
|
||||
load_episodes,
|
||||
write_info,
|
||||
)
|
||||
from lerobot.datasets.utils import (
|
||||
DEFAULT_EPISODES_PATH,
|
||||
DEFAULT_IMAGE_PATH,
|
||||
update_chunk_file_indices,
|
||||
)
|
||||
from lerobot.datasets.video_utils import (
|
||||
StreamingVideoEncoder,
|
||||
concatenate_video_files,
|
||||
encode_video_frames,
|
||||
get_video_duration_in_s,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _encode_video_worker(
    video_key: str,
    episode_index: int,
    root: Path,
    fps: int,
    vcodec: str = "libsvtav1",
    encoder_threads: int | None = None,
) -> Path:
    temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
    fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
    img_dir = (root / fpath).parent
    encode_video_frames(
        img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads
    )
    shutil.rmtree(img_dir)
    return temp_path


class DatasetWriter:
|
||||
"""Encapsulates write-side state and methods for LeRobotDataset.
|
||||
|
||||
Owns: episode_buffer, image_writer, _pq_writer (ParquetWriter), _latest_episode,
|
||||
_current_file_start_frame, _streaming_encoder, _episodes_since_last_encoding, _recorded_frames.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
meta: LeRobotDatasetMetadata,
|
||||
root: Path,
|
||||
vcodec: str,
|
||||
encoder_threads: int | None,
|
||||
batch_encoding_size: int,
|
||||
streaming_encoder: StreamingVideoEncoder | None = None,
|
||||
initial_frames: int = 0,
|
||||
):
|
||||
"""Initialize the writer with metadata, codec, and encoding config.
|
||||
|
||||
Args:
|
||||
meta: Dataset metadata instance (used for feature schema, chunk
|
||||
settings, and episode persistence).
|
||||
root: Local dataset root directory.
|
||||
vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``).
|
||||
encoder_threads: Threads per encoder instance. ``None`` for auto.
|
||||
batch_encoding_size: Number of episodes to accumulate before
|
||||
batch-encoding videos.
|
||||
streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder`
|
||||
for real-time encoding. ``None`` disables streaming mode.
|
||||
initial_frames: Starting frame count (non-zero when resuming).
|
||||
"""
|
||||
self._meta = meta
|
||||
self._root = root
|
||||
self._vcodec = vcodec
|
||||
self._encoder_threads = encoder_threads
|
||||
self._batch_encoding_size = batch_encoding_size
|
||||
self._streaming_encoder = streaming_encoder
|
||||
|
||||
# Writer state
|
||||
self.image_writer: AsyncImageWriter | None = None
|
||||
self.episode_buffer: dict = self._create_episode_buffer()
|
||||
self._pq_writer: pq.ParquetWriter | None = None
|
||||
self._latest_episode: dict | None = None
|
||||
self._current_file_start_frame: int | None = None
|
||||
self._episodes_since_last_encoding: int = 0
|
||||
self._recorded_frames: int = initial_frames
|
||||
self._finalized = False
|
||||
|
||||
def _create_episode_buffer(self, episode_index: int | None = None) -> dict:
|
||||
current_ep_idx = self._meta.total_episodes if episode_index is None else episode_index
|
||||
ep_buffer = {}
|
||||
ep_buffer["size"] = 0
|
||||
ep_buffer["task"] = []
|
||||
for key in self._meta.features:
|
||||
ep_buffer[key] = current_ep_idx if key == "episode_index" else []
|
||||
return ep_buffer
|
||||
|
||||
def _get_image_file_path(self, episode_index: int, image_key: str, frame_index: int) -> Path:
|
||||
fpath = DEFAULT_IMAGE_PATH.format(
|
||||
image_key=image_key, episode_index=episode_index, frame_index=frame_index
|
||||
)
|
||||
return self._root / fpath
|
||||
|
||||
def _get_image_file_dir(self, episode_index: int, image_key: str) -> Path:
|
||||
return self._get_image_file_path(episode_index, image_key, frame_index=0).parent
|
||||
|
||||
def _save_image(
|
||||
self, image: torch.Tensor | np.ndarray | PIL.Image.Image, fpath: Path, compress_level: int = 1
|
||||
) -> None:
|
||||
if self.image_writer is None:
|
||||
if isinstance(image, torch.Tensor):
|
||||
image = image.cpu().numpy()
|
||||
write_image(image, fpath, compress_level=compress_level)
|
||||
else:
|
||||
self.image_writer.save_image(image=image, fpath=fpath, compress_level=compress_level)
|
||||
|
||||
    def add_frame(self, frame: dict) -> None:
        """
        Add a single frame to the current episode buffer.

        Apart from images written to a temporary directory, nothing is written to disk
        until ``save_episode()`` is called.

        The caller must provide all user-defined features plus ``"task"``, and must
        not provide ``"timestamp"`` or ``"frame_index"``; those are computed
        automatically.
        """
        # Convert torch to numpy if needed
        for name in frame:
            if isinstance(frame[name], torch.Tensor):
                frame[name] = frame[name].numpy()

        validate_frame(frame, self._meta.features)

        if self.episode_buffer is None:
            self.episode_buffer = self._create_episode_buffer()

        # Automatically add frame_index and timestamp to episode buffer
        frame_index = self.episode_buffer["size"]
        timestamp = frame_index / self._meta.fps
        self.episode_buffer["frame_index"].append(frame_index)
        self.episode_buffer["timestamp"].append(timestamp)
        self.episode_buffer["task"].append(frame.pop("task"))

        # Start streaming encoder on first frame of episode
        if frame_index == 0 and self._streaming_encoder is not None:
            self._streaming_encoder.start_episode(
                video_keys=list(self._meta.video_keys),
                temp_dir=self._root,
            )

        # Add frame features to episode_buffer
        for key in frame:
            if key not in self._meta.features:
                raise ValueError(
                    f"An element of the frame is not in the features. '{key}' not in '{self._meta.features.keys()}'."
                )

            if self._meta.features[key]["dtype"] == "video" and self._streaming_encoder is not None:
                self._streaming_encoder.feed_frame(key, frame[key])
                self.episode_buffer[key].append(None)
            elif self._meta.features[key]["dtype"] in ["image", "video"]:
                img_path = self._get_image_file_path(
                    episode_index=self.episode_buffer["episode_index"], image_key=key, frame_index=frame_index
                )
                if frame_index == 0:
                    img_path.parent.mkdir(parents=True, exist_ok=True)
                compress_level = 1 if self._meta.features[key]["dtype"] == "video" else 6
                self._save_image(frame[key], img_path, compress_level)
                self.episode_buffer[key].append(str(img_path))
            else:
                self.episode_buffer[key].append(frame[key])

        self.episode_buffer["size"] += 1

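    # Recording-loop sketch around add_frame()/save_episode(); feature names, shapes
    # and the surrounding loop are placeholders, not code from this diff:
    #
    #     writer = DatasetWriter(meta, root, vcodec="libsvtav1", encoder_threads=None, batch_encoding_size=1)
    #     for _ in range(num_frames):
    #         writer.add_frame(
    #             {
    #                 "observation.state": state_array,
    #                 "observation.images.cam": frame_array,
    #                 "action": action_array,
    #                 "task": "pick up the cube",
    #             }
    #         )
    #     writer.save_episode()
    #     writer.finalize()
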
def save_episode(
|
||||
self,
|
||||
episode_data: dict | None = None,
|
||||
parallel_encoding: bool = True,
|
||||
) -> None:
|
||||
"""Save the current episode in self.episode_buffer to disk."""
|
||||
episode_buffer = episode_data if episode_data is not None else self.episode_buffer
|
||||
|
||||
validate_episode_buffer(episode_buffer, self._meta.total_episodes, self._meta.features)
|
||||
|
||||
# size and task are special cases that won't be added to hf_dataset
|
||||
episode_length = episode_buffer.pop("size")
|
||||
tasks = episode_buffer.pop("task")
|
||||
episode_tasks = list(set(tasks))
|
||||
episode_index = episode_buffer["episode_index"]
|
||||
|
||||
episode_buffer["index"] = np.arange(self._meta.total_frames, self._meta.total_frames + episode_length)
|
||||
episode_buffer["episode_index"] = np.full((episode_length,), episode_index)
|
||||
|
||||
# Update tasks and task indices with new tasks if any
|
||||
self._meta.save_episode_tasks(episode_tasks)
|
||||
|
||||
# Given tasks in natural language, find their corresponding task indices
|
||||
episode_buffer["task_index"] = np.array([self._meta.get_task_index(task) for task in tasks])
|
||||
|
||||
for key, ft in self._meta.features.items():
|
||||
if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]:
|
||||
continue
|
||||
episode_buffer[key] = np.stack(episode_buffer[key])
|
||||
|
||||
# Wait for image writer to end, so that episode stats over images can be computed
|
||||
self._wait_image_writer()
|
||||
|
||||
has_video_keys = len(self._meta.video_keys) > 0
|
||||
use_streaming = self._streaming_encoder is not None and has_video_keys
|
||||
use_batched_encoding = self._batch_encoding_size > 1
|
||||
|
||||
if use_streaming:
|
||||
non_video_buffer = {
|
||||
k: v
|
||||
for k, v in episode_buffer.items()
|
||||
if self._meta.features.get(k, {}).get("dtype") not in ("video",)
|
||||
}
|
||||
non_video_features = {k: v for k, v in self._meta.features.items() if v["dtype"] != "video"}
|
||||
ep_stats = compute_episode_stats(non_video_buffer, non_video_features)
|
||||
else:
|
||||
ep_stats = compute_episode_stats(episode_buffer, self._meta.features)
|
||||
|
||||
ep_metadata = self._save_episode_data(episode_buffer)
|
||||
|
||||
if use_streaming:
|
||||
streaming_results = self._streaming_encoder.finish_episode()
|
||||
for video_key in self._meta.video_keys:
|
||||
temp_path, video_stats = streaming_results[video_key]
|
||||
if video_stats is not None:
|
||||
ep_stats[video_key] = {
|
||||
k: v if k == "count" else np.squeeze(v.reshape(1, -1, 1, 1) / 255.0, axis=0)
|
||||
for k, v in video_stats.items()
|
||||
}
|
||||
ep_metadata.update(self._save_episode_video(video_key, episode_index, temp_path=temp_path))
|
||||
elif has_video_keys and not use_batched_encoding:
|
||||
num_cameras = len(self._meta.video_keys)
|
||||
if parallel_encoding and num_cameras > 1:
|
||||
with concurrent.futures.ProcessPoolExecutor(max_workers=num_cameras) as executor:
|
||||
future_to_key = {
|
||||
executor.submit(
|
||||
_encode_video_worker,
|
||||
video_key,
|
||||
episode_index,
|
||||
self._root,
|
||||
self._meta.fps,
|
||||
self._vcodec,
|
||||
self._encoder_threads,
|
||||
): video_key
|
||||
for video_key in self._meta.video_keys
|
||||
}
|
||||
|
||||
results = {}
|
||||
for future in concurrent.futures.as_completed(future_to_key):
|
||||
video_key = future_to_key[future]
|
||||
try:
|
||||
temp_path = future.result()
|
||||
results[video_key] = temp_path
|
||||
except Exception as exc:
|
||||
logger.error(f"Video encoding failed for {video_key}: {exc}")
|
||||
raise exc
|
||||
|
||||
for video_key in self._meta.video_keys:
|
||||
temp_path = results[video_key]
|
||||
ep_metadata.update(
|
||||
self._save_episode_video(video_key, episode_index, temp_path=temp_path)
|
||||
)
|
||||
else:
|
||||
for video_key in self._meta.video_keys:
|
||||
ep_metadata.update(self._save_episode_video(video_key, episode_index))
|
||||
|
||||
# `meta.save_episode` need to be executed after encoding the videos
|
||||
self._meta.save_episode(episode_index, episode_length, episode_tasks, ep_stats, ep_metadata)
|
||||
|
||||
if has_video_keys and use_batched_encoding:
|
||||
self._episodes_since_last_encoding += 1
|
||||
if self._episodes_since_last_encoding == self._batch_encoding_size:
|
||||
start_ep = self._meta.total_episodes - self._batch_encoding_size
|
||||
end_ep = self._meta.total_episodes
|
||||
self._batch_save_episode_video(start_ep, end_ep)
|
||||
self._episodes_since_last_encoding = 0
|
||||
|
||||
if episode_data is None:
|
||||
self.clear_episode_buffer(delete_images=len(self._meta.image_keys) > 0)
|
||||
|
||||
def _batch_save_episode_video(self, start_episode: int, end_episode: int | None = None) -> None:
|
||||
"""Batch save videos for multiple episodes."""
|
||||
if end_episode is None:
|
||||
end_episode = self._meta.total_episodes
|
||||
|
||||
logger.info(
|
||||
f"Batch encoding {self._batch_encoding_size} videos for episodes {start_episode} to {end_episode - 1}"
|
||||
)
|
||||
|
||||
chunk_idx = self._meta.episodes[start_episode]["data/chunk_index"]
|
||||
file_idx = self._meta.episodes[start_episode]["data/file_index"]
|
||||
episode_df_path = self._root / DEFAULT_EPISODES_PATH.format(
|
||||
chunk_index=chunk_idx, file_index=file_idx
|
||||
)
|
||||
episode_df = pd.read_parquet(episode_df_path)
|
||||
|
||||
for ep_idx in range(start_episode, end_episode):
|
||||
logger.info(f"Encoding videos for episode {ep_idx}")
|
||||
|
||||
if (
|
||||
self._meta.episodes[ep_idx]["data/chunk_index"] != chunk_idx
|
||||
or self._meta.episodes[ep_idx]["data/file_index"] != file_idx
|
||||
):
|
||||
episode_df.to_parquet(episode_df_path)
|
||||
self._meta.episodes = load_episodes(self._root)
|
||||
|
||||
chunk_idx = self._meta.episodes[ep_idx]["data/chunk_index"]
|
||||
file_idx = self._meta.episodes[ep_idx]["data/file_index"]
|
||||
episode_df_path = self._root / DEFAULT_EPISODES_PATH.format(
|
||||
chunk_index=chunk_idx, file_index=file_idx
|
||||
)
|
||||
episode_df = pd.read_parquet(episode_df_path)
|
||||
|
||||
video_ep_metadata = {}
|
||||
for video_key in self._meta.video_keys:
|
||||
video_ep_metadata.update(self._save_episode_video(video_key, ep_idx))
|
||||
video_ep_metadata.pop("episode_index")
|
||||
video_ep_df = pd.DataFrame(video_ep_metadata, index=[ep_idx]).convert_dtypes(
|
||||
dtype_backend="pyarrow"
|
||||
)
|
||||
|
||||
episode_df = episode_df.combine_first(video_ep_df)
|
||||
episode_df.to_parquet(episode_df_path)
|
||||
self._meta.episodes = load_episodes(self._root)
|
||||
|
||||
def _save_episode_data(self, episode_buffer: dict) -> dict:
|
||||
"""Save episode data to a parquet file."""
|
||||
# Use metadata features as the authoritative schema
|
||||
hf_features = get_hf_features_from_features(self._meta.features)
|
||||
ep_dict = {key: episode_buffer[key] for key in hf_features}
|
||||
ep_dataset = datasets.Dataset.from_dict(ep_dict, features=hf_features, split="train")
|
||||
ep_dataset = embed_images(ep_dataset)
|
||||
ep_num_frames = len(ep_dataset)
|
||||
|
||||
if self._latest_episode is None:
|
||||
chunk_idx, file_idx = 0, 0
|
||||
global_frame_index = 0
|
||||
self._current_file_start_frame = 0
|
||||
if self._meta.episodes is not None and len(self._meta.episodes) > 0:
|
||||
latest_ep = self._meta.episodes[-1]
|
||||
global_frame_index = latest_ep["dataset_to_index"]
|
||||
chunk_idx = latest_ep["data/chunk_index"]
|
||||
file_idx = latest_ep["data/file_index"]
|
||||
|
||||
chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self._meta.chunks_size)
|
||||
self._current_file_start_frame = global_frame_index
|
||||
else:
|
||||
latest_ep = self._latest_episode
|
||||
chunk_idx = latest_ep["data/chunk_index"]
|
||||
file_idx = latest_ep["data/file_index"]
|
||||
global_frame_index = latest_ep["index"][-1] + 1
|
||||
|
||||
latest_path = self._root / self._meta.data_path.format(chunk_index=chunk_idx, file_index=file_idx)
|
||||
latest_size_in_mb = get_file_size_in_mb(latest_path)
|
||||
|
||||
frames_in_current_file = global_frame_index - self._current_file_start_frame
|
||||
av_size_per_frame = (
|
||||
latest_size_in_mb / frames_in_current_file if frames_in_current_file > 0 else 0
|
||||
)
|
||||
|
||||
if latest_size_in_mb + av_size_per_frame * ep_num_frames >= self._meta.data_files_size_in_mb:
|
||||
chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self._meta.chunks_size)
|
||||
self.close_writer()
|
||||
self._current_file_start_frame = global_frame_index
|
||||
|
||||
ep_dict["data/chunk_index"] = chunk_idx
|
||||
ep_dict["data/file_index"] = file_idx
|
||||
|
||||
path = self._root / self._meta.data_path.format(chunk_index=chunk_idx, file_index=file_idx)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
table = ep_dataset.with_format("arrow")[:]
|
||||
if not self._pq_writer:
|
||||
self._pq_writer = pq.ParquetWriter(
|
||||
path, schema=table.schema, compression="snappy", use_dictionary=True
|
||||
)
|
||||
self._pq_writer.write_table(table)
|
||||
|
||||
metadata = {
|
||||
"data/chunk_index": chunk_idx,
|
||||
"data/file_index": file_idx,
|
||||
"dataset_from_index": global_frame_index,
|
||||
"dataset_to_index": global_frame_index + ep_num_frames,
|
||||
}
|
||||
|
||||
self._latest_episode = {**ep_dict, **metadata}
|
||||
self._recorded_frames += ep_num_frames
|
||||
|
||||
return metadata
|
||||
|
||||
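
    # Worked example of the file-rollover estimate in _save_episode_data above, with
    # made-up numbers (the real threshold comes from self._meta.data_files_size_in_mb):
    #   current file: 4_000 frames, 80 MB  -> av_size_per_frame = 80 / 4_000 = 0.02 MB
    #   next episode: 500 frames           -> projected growth   = 0.02 * 500 = 10 MB
    #   80 + 10 >= threshold? with a 90 MB limit a new chunk/file is started
    #   (via update_chunk_file_indices), with a 100 MB limit the episode is appended.
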
def _save_episode_video(
|
||||
self,
|
||||
video_key: str,
|
||||
episode_index: int,
|
||||
temp_path: Path | None = None,
|
||||
) -> dict:
|
||||
if temp_path is None:
|
||||
ep_path = self._encode_temporary_episode_video(video_key, episode_index)
|
||||
else:
|
||||
ep_path = temp_path
|
||||
|
||||
ep_size_in_mb = get_file_size_in_mb(ep_path)
|
||||
ep_duration_in_s = get_video_duration_in_s(ep_path)
|
||||
|
||||
if (
|
||||
episode_index == 0
|
||||
or self._meta.latest_episode is None
|
||||
or f"videos/{video_key}/chunk_index" not in self._meta.latest_episode
|
||||
):
|
||||
chunk_idx, file_idx = 0, 0
|
||||
if self._meta.episodes is not None and len(self._meta.episodes) > 0:
|
||||
old_chunk_idx = self._meta.episodes[-1][f"videos/{video_key}/chunk_index"]
|
||||
old_file_idx = self._meta.episodes[-1][f"videos/{video_key}/file_index"]
|
||||
chunk_idx, file_idx = update_chunk_file_indices(
|
||||
old_chunk_idx, old_file_idx, self._meta.chunks_size
|
||||
)
|
||||
latest_duration_in_s = 0.0
|
||||
new_path = self._root / self._meta.video_path.format(
|
||||
video_key=video_key, chunk_index=chunk_idx, file_index=file_idx
|
||||
)
|
||||
new_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.move(str(ep_path), str(new_path))
|
||||
else:
|
||||
latest_ep = self._meta.latest_episode
|
||||
chunk_idx = latest_ep[f"videos/{video_key}/chunk_index"][0]
|
||||
file_idx = latest_ep[f"videos/{video_key}/file_index"][0]
|
||||
|
||||
latest_path = self._root / self._meta.video_path.format(
|
||||
video_key=video_key, chunk_index=chunk_idx, file_index=file_idx
|
||||
)
|
||||
latest_size_in_mb = get_file_size_in_mb(latest_path)
|
||||
latest_duration_in_s = latest_ep[f"videos/{video_key}/to_timestamp"][0]
|
||||
|
||||
if latest_size_in_mb + ep_size_in_mb >= self._meta.video_files_size_in_mb:
|
||||
chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self._meta.chunks_size)
|
||||
new_path = self._root / self._meta.video_path.format(
|
||||
video_key=video_key, chunk_index=chunk_idx, file_index=file_idx
|
||||
)
|
||||
new_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.move(str(ep_path), str(new_path))
|
||||
latest_duration_in_s = 0.0
|
||||
else:
|
||||
concatenate_video_files(
|
||||
[latest_path, ep_path],
|
||||
latest_path,
|
||||
)
|
||||
|
||||
# Remove temporary directory
|
||||
shutil.rmtree(str(ep_path.parent))
|
||||
|
||||
# Update video info (only needed when first episode is encoded)
|
||||
if episode_index == 0:
|
||||
self._meta.update_video_info(video_key)
|
||||
write_info(self._meta.info, self._meta.root)
|
||||
|
||||
metadata = {
|
||||
"episode_index": episode_index,
|
||||
f"videos/{video_key}/chunk_index": chunk_idx,
|
||||
f"videos/{video_key}/file_index": file_idx,
|
||||
f"videos/{video_key}/from_timestamp": latest_duration_in_s,
|
||||
f"videos/{video_key}/to_timestamp": latest_duration_in_s + ep_duration_in_s,
|
||||
}
|
||||
return metadata
|
||||
|
||||
def clear_episode_buffer(self, delete_images: bool = True) -> None:
|
||||
"""Discard the current episode buffer and optionally delete temp images.
|
||||
|
||||
Args:
|
||||
delete_images: If ``True``, remove temporary image directories
|
||||
written for the current episode.
|
||||
"""
|
||||
# Cancel streaming encoder if active
|
||||
if self._streaming_encoder is not None:
|
||||
self._streaming_encoder.cancel_episode()
|
||||
|
||||
if delete_images:
|
||||
if self.image_writer is not None:
|
||||
self._wait_image_writer()
|
||||
episode_index = self.episode_buffer["episode_index"]
|
||||
# episode_index is `int` when freshly created, but becomes `np.ndarray` after
|
||||
# save_episode() mutates the buffer. Handle both types here.
|
||||
if isinstance(episode_index, np.ndarray):
|
||||
episode_index = episode_index.item() if episode_index.size == 1 else episode_index[0]
|
||||
for cam_key in self._meta.image_keys:
|
||||
img_dir = self._get_image_file_dir(episode_index, cam_key)
|
||||
if img_dir.is_dir():
|
||||
shutil.rmtree(img_dir)
|
||||
|
||||
self.episode_buffer = self._create_episode_buffer()
|
||||
|
||||
def start_image_writer(self, num_processes: int = 0, num_threads: int = 4) -> None:
|
||||
"""Start an :class:`AsyncImageWriter` for background image persistence.
|
||||
|
||||
Args:
|
||||
num_processes: Number of subprocesses. ``0`` means threads only.
|
||||
num_threads: Number of threads per process.
|
||||
"""
|
||||
if isinstance(self.image_writer, AsyncImageWriter):
|
||||
logger.warning(
|
||||
"You are starting a new AsyncImageWriter that is replacing an already existing one in the dataset."
|
||||
)
|
||||
|
||||
self.image_writer = AsyncImageWriter(
|
||||
num_processes=num_processes,
|
||||
num_threads=num_threads,
|
||||
)
|
||||
|
||||
def stop_image_writer(self) -> None:
|
||||
"""Stop the image writer (needed before pickling the dataset for DataLoader)."""
|
||||
if self.image_writer is not None:
|
||||
self.image_writer.stop()
|
||||
self.image_writer = None
|
||||
|
||||
def _wait_image_writer(self) -> None:
|
||||
"""Wait for asynchronous image writer to finish."""
|
||||
if self.image_writer is not None:
|
||||
self.image_writer.wait_until_done()
|
||||
|
||||
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
|
||||
"""Use ffmpeg to convert frames stored as png into mp4 videos."""
|
||||
return _encode_video_worker(
|
||||
video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads
|
||||
)
|
||||
|
||||
def close_writer(self) -> None:
|
||||
"""Close and cleanup the parquet writer if it exists."""
|
||||
if self._pq_writer is not None:
|
||||
self._pq_writer.close()
|
||||
self._pq_writer = None
|
||||
|
||||
def flush_pending_videos(self) -> None:
|
||||
"""Flush any pending video encoding (streaming or batch).
|
||||
|
||||
For streaming encoding: closes the encoder.
|
||||
For batch encoding: encodes any remaining episodes that haven't been batch-encoded yet.
|
||||
"""
|
||||
if self._streaming_encoder is not None:
|
||||
self._streaming_encoder.close()
|
||||
elif self._episodes_since_last_encoding > 0:
|
||||
start_ep = self._meta.total_episodes - self._episodes_since_last_encoding
|
||||
end_ep = self._meta.total_episodes
|
||||
logger.info(
|
||||
f"Encoding remaining {self._episodes_since_last_encoding} episodes, "
|
||||
f"from episode {start_ep} to {end_ep - 1}"
|
||||
)
|
||||
self._batch_save_episode_video(start_ep, end_ep)
|
||||
|
||||
def cancel_pending_videos(self) -> None:
|
||||
"""Cancel any in-progress streaming encoding without flushing."""
|
||||
if self._streaming_encoder is not None:
|
||||
self._streaming_encoder.cancel_episode()
|
||||
|
||||
def cleanup_interrupted_episode(self, episode_index: int) -> None:
|
||||
"""Remove temporary image directories for an interrupted episode."""
|
||||
for key in self._meta.video_keys:
|
||||
img_dir = self._get_image_file_path(
|
||||
episode_index=episode_index, image_key=key, frame_index=0
|
||||
).parent
|
||||
if img_dir.exists():
|
||||
logger.debug(
|
||||
f"Cleaning up interrupted episode images for episode {episode_index}, camera {key}"
|
||||
)
|
||||
shutil.rmtree(img_dir)
|
||||
|
||||
    def finalize(self) -> None:
        """Flush all pending work and release all resources.

        Idempotent — safe to call multiple times.
        """
        if getattr(self, "_finalized", False):
            return
        # 1. Wait for async image writes to complete, then stop
        if self.image_writer is not None:
            self.image_writer.wait_until_done()
            self.image_writer.stop()
            self.image_writer = None
        # 2. Flush pending video encoding (streaming or batch)
        self.flush_pending_videos()
        # 3. Close own parquet writer
        self.close_writer()
        # 4. Finalize metadata (idempotent)
        self._meta.finalize()
        self._finalized = True

    def __del__(self):
        """Safety net: release resources on garbage collection."""
        # During interpreter shutdown, referenced objects may already be collected.
        with contextlib.suppress(Exception):
            self.finalize()
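
# Lifecycle sketch for the writer (hypothetical recording helper; the point is that
# finalize() is idempotent and safe to call from a finally block):
#
#     writer = DatasetWriter(meta, root, vcodec="libsvtav1", encoder_threads=None, batch_encoding_size=1)
#     try:
#         record_episodes(writer)  # calls add_frame()/save_episode() in a loop
#     finally:
#         writer.finalize()
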
@@ -365,6 +365,10 @@ def get_delta_indices(delta_timestamps: dict[str, list[float]], fps: int) -> dic
|
||||
|
||||
|
||||
def validate_frame(frame: dict, features: dict) -> None:
|
||||
# DEFAULT_FEATURES (timestamp, frame_index, episode_index, index, task_index) are
|
||||
# auto-populated by the recording pipeline (add_frame / save_episode) and must not
|
||||
# be supplied by the caller. Excluding them here means any frame dict that contains
|
||||
# these keys will be rejected as extra features.
|
||||
expected_features = set(features) - set(DEFAULT_FEATURES)
|
||||
actual_features = set(frame)
|
||||
|
||||
|
||||
@@ -32,10 +32,10 @@ def safe_stop_image_writer(func):
            return func(*args, **kwargs)
        except Exception as e:
            dataset = kwargs.get("dataset")
            image_writer = getattr(dataset, "image_writer", None) if dataset else None
            if image_writer is not None:
            writer = getattr(dataset, "writer", None) if dataset else None
            if writer is not None and writer.image_writer is not None:
                logger.warning("Waiting for image writer to terminate...")
                image_writer.stop()
                writer.image_writer.stop()
            raise e

    return wrapper

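# Sketch of how the updated decorator is used; record_loop and my_dataset are
# illustrative. The dataset must be passed as a keyword argument so that
# kwargs.get("dataset") can find it:
#
#     @safe_stop_image_writer
#     def record_loop(dataset, **kwargs):
#         ...  # recording code that may raise
#
#     record_loop(dataset=my_dataset)
#
# On failure, the writer is now reached through dataset.writer.image_writer rather
# than dataset.image_writer before the exception is re-raised.
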
File diff suppressed because it is too large
@@ -22,6 +22,7 @@ import torch
|
||||
import torch.utils
|
||||
|
||||
from lerobot.datasets.compute_stats import aggregate_stats
|
||||
from lerobot.datasets.feature_utils import get_hf_features_from_features
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from lerobot.datasets.video_utils import VideoFrame
|
||||
from lerobot.utils.constants import HF_LEROBOT_HOME
|
||||
@@ -88,12 +89,24 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
|
||||
)
|
||||
self.disabled_features.update(extra_keys)
|
||||
|
||||
self.image_transforms = image_transforms
|
||||
self.delta_timestamps = delta_timestamps
|
||||
# TODO(rcadene, aliberts): We should not perform this aggregation for datasets
|
||||
# with multiple robots of different ranges. Instead we should have one normalization
|
||||
# per robot.
|
||||
self.stats = aggregate_stats([dataset.meta.stats for dataset in self._datasets])
|
||||
self.set_image_transforms(image_transforms)
|
||||
|
||||
def set_image_transforms(self, image_transforms: Callable | None) -> None:
|
||||
"""Replace the transform for this dataset and its children."""
|
||||
if image_transforms is not None and not callable(image_transforms):
|
||||
raise TypeError("image_transforms must be callable or None.")
|
||||
self.image_transforms = image_transforms
|
||||
for dataset in getattr(self, "_datasets", []):
|
||||
dataset.set_image_transforms(self.image_transforms)
|
||||
|
||||
def clear_image_transforms(self) -> None:
|
||||
"""Remove the transform from this dataset and its children."""
|
||||
self.set_image_transforms(None)
|
||||
|
||||
@property
|
||||
def repo_id_to_index(self):
|
||||
@@ -125,7 +138,13 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
|
||||
def features(self) -> datasets.Features:
|
||||
features = {}
|
||||
for dataset in self._datasets:
|
||||
features.update({k: v for k, v in dataset.hf_features.items() if k not in self.disabled_features})
|
||||
features.update(
|
||||
{
|
||||
k: v
|
||||
for k, v in get_hf_features_from_features(dataset.features).items()
|
||||
if k not in self.disabled_features
|
||||
}
|
||||
)
|
||||
return features
|
||||
|
||||
@property
|
||||
|
||||
@@ -255,7 +255,9 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
|
||||
Args:
|
||||
repo_id (str): This is the repo id that will be used to fetch the dataset.
|
||||
root (Path | None, optional): Local directory to use for downloading/writing files.
|
||||
root (Path | None, optional): Local directory to use for local datasets. When omitted, Hub
|
||||
metadata is resolved through a revision-safe snapshot cache under
|
||||
``$HF_LEROBOT_HOME/hub``.
|
||||
episodes (list[int] | None, optional): If specified, this will only load episodes specified by
|
||||
their episode_index in this list.
|
||||
image_transforms (Callable | None, optional): Transform to apply to image data.
|
||||
@@ -271,7 +273,8 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
"""
|
||||
super().__init__()
|
||||
self.repo_id = repo_id
|
||||
self.root = Path(root) if root else HF_LEROBOT_HOME / repo_id
|
||||
self._requested_root = Path(root) if root else None
|
||||
self.root = self._requested_root if self._requested_root is not None else HF_LEROBOT_HOME / repo_id
|
||||
self.streaming_from_local = root is not None
|
||||
|
||||
self.image_transforms = image_transforms
|
||||
@@ -288,12 +291,15 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
# We cache the video decoders to avoid re-initializing them at each frame (avoiding a ~10x slowdown)
|
||||
self.video_decoder_cache = None
|
||||
|
||||
self.root.mkdir(exist_ok=True, parents=True)
|
||||
if self._requested_root is not None:
|
||||
self.root.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
# Load metadata
|
||||
self.meta = LeRobotDatasetMetadata(
|
||||
self.repo_id, self.root, self.revision, force_cache_sync=force_cache_sync
|
||||
self.repo_id, self._requested_root, self.revision, force_cache_sync=force_cache_sync
|
||||
)
|
||||
self.root = self.meta.root
|
||||
self.revision = self.meta.revision
|
||||
# Check version
|
||||
check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION)
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ import importlib.resources
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import datasets
|
||||
@@ -101,6 +102,18 @@ DEFAULT_FEATURES = {
|
||||
}
|
||||
|
||||
|
||||
def has_legacy_hub_download_metadata(root: Path) -> bool:
    """Return ``True`` when *root* looks like a legacy Hub ``local_dir`` mirror.

    ``snapshot_download(local_dir=...)`` stores lightweight metadata under
    ``<local_dir>/.cache/huggingface/download/``. The presence of this
    directory is a reliable indicator that the dataset was downloaded with
    the old non-revision-safe ``local_dir`` mode and should be re-fetched
    through the snapshot cache instead.
    """
    return (root / ".cache" / "huggingface" / "download").exists()


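# Illustrative check using the helper above (the local root and the follow-up
# action are a sketch, not code from this diff):
#
#     root = HF_LEROBOT_HOME / "my_user/my_dataset"
#     if has_legacy_hub_download_metadata(root):
#         logging.warning("Legacy local_dir download at %s; re-fetch via the snapshot cache.", root)
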
def update_chunk_file_indices(chunk_idx: int, file_idx: int, chunks_size: int) -> tuple[int, int]:
|
||||
if file_idx == chunks_size - 1:
|
||||
file_idx = 0
|
||||
|
||||
@@ -741,6 +741,7 @@ class StreamingVideoEncoder:
|
||||
self._video_paths: dict[str, Path] = {}
|
||||
self._dropped_frames: dict[str, int] = {}
|
||||
self._episode_active = False
|
||||
self._closed = False
|
||||
|
||||
def start_episode(self, video_keys: list[str], temp_dir: Path) -> None:
|
||||
"""Start encoder threads for a new episode.
|
||||
@@ -895,8 +896,11 @@ class StreamingVideoEncoder:
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the encoder, canceling any in-progress episode."""
|
||||
if self._closed:
|
||||
return
|
||||
if self._episode_active:
|
||||
self.cancel_episode()
|
||||
self._closed = True
|
||||
|
||||
def _cleanup(self) -> None:
|
||||
"""Clean up queues and thread tracking dicts."""
|
||||
@@ -1063,43 +1067,19 @@ class VideoEncodingManager:
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
streaming_encoder = getattr(self.dataset, "_streaming_encoder", None)
|
||||
writer = self.dataset.writer
|
||||
if writer is not None:
|
||||
if exc_type is not None and writer._streaming_encoder is not None:
|
||||
writer.cancel_pending_videos()
|
||||
|
||||
if streaming_encoder is not None:
|
||||
# Handle streaming encoder cleanup
|
||||
if exc_type is not None:
|
||||
streaming_encoder.cancel_episode()
|
||||
streaming_encoder.close()
|
||||
elif self.dataset.episodes_since_last_encoding > 0:
|
||||
# Handle any remaining episodes that haven't been batch encoded
|
||||
if exc_type is not None:
|
||||
logger.info("Exception occurred. Encoding remaining episodes before exit...")
|
||||
else:
|
||||
logger.info("Recording stopped. Encoding remaining episodes...")
|
||||
# finalize() handles flush_pending_videos + parquet + metadata
|
||||
self.dataset.finalize()
|
||||
|
||||
start_ep = self.dataset.num_episodes - self.dataset.episodes_since_last_encoding
|
||||
end_ep = self.dataset.num_episodes
|
||||
logger.info(
|
||||
f"Encoding remaining {self.dataset.episodes_since_last_encoding} episodes, "
|
||||
f"from episode {start_ep} to {end_ep - 1}"
|
||||
)
|
||||
self.dataset._batch_save_episode_video(start_ep, end_ep)
|
||||
|
||||
# Finalize the dataset to properly close all writers
|
||||
self.dataset.finalize()
|
||||
|
||||
# Clean up episode images if recording was interrupted (only for non-streaming mode)
|
||||
if exc_type is not None and streaming_encoder is None:
|
||||
interrupted_episode_index = self.dataset.num_episodes
|
||||
for key in self.dataset.meta.video_keys:
|
||||
img_dir = self.dataset._get_image_file_path(
|
||||
episode_index=interrupted_episode_index, image_key=key, frame_index=0
|
||||
).parent
|
||||
if img_dir.exists():
|
||||
logger.debug(
|
||||
f"Cleaning up interrupted episode images for episode {interrupted_episode_index}, camera {key}"
|
||||
)
|
||||
shutil.rmtree(img_dir)
|
||||
# Clean up episode images if recording was interrupted (only for non-streaming mode)
|
||||
if exc_type is not None and writer._streaming_encoder is None:
|
||||
writer.cleanup_interrupted_episode(self.dataset.num_episodes)
|
||||
else:
|
||||
self.dataset.finalize()
|
||||
|
||||
# Clean up any remaining images directory if it's empty
|
||||
img_dir = self.dataset.root / "images"
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
from .act.configuration_act import ACTConfig as ACTConfig
|
||||
from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig
|
||||
from .groot.configuration_groot import GrootConfig as GrootConfig
|
||||
from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig as MultiTaskDiTConfig
|
||||
from .pi0.configuration_pi0 import PI0Config as PI0Config
|
||||
from .pi0_fast.configuration_pi0_fast import PI0FastConfig as PI0FastConfig
|
||||
from .pi05.configuration_pi05 import PI05Config as PI05Config
|
||||
@@ -28,6 +29,7 @@ from .xvla.configuration_xvla import XVLAConfig as XVLAConfig
|
||||
__all__ = [
|
||||
"ACTConfig",
|
||||
"DiffusionConfig",
|
||||
"MultiTaskDiTConfig",
|
||||
"PI0Config",
|
||||
"PI05Config",
|
||||
"PI0FastConfig",
|
||||
|
||||
@@ -31,6 +31,7 @@ from lerobot.envs.utils import env_to_policy_features
|
||||
from lerobot.policies.act.configuration_act import ACTConfig
|
||||
from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
|
||||
from lerobot.policies.groot.configuration_groot import GrootConfig
|
||||
from lerobot.policies.multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from lerobot.policies.pi0.configuration_pi0 import PI0Config
|
||||
from lerobot.policies.pi05.configuration_pi05 import PI05Config
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
@@ -58,6 +59,29 @@ from lerobot.utils.constants import (
|
||||
)
|
||||
|
||||
|
||||
def _reconnect_relative_absolute_steps(
    preprocessor: PolicyProcessorPipeline, postprocessor: PolicyProcessorPipeline
) -> None:
    """Wire AbsoluteActionsProcessorStep.relative_step to the RelativeActionsProcessorStep after deserialization.

    After a policy is loaded from disk, the preprocessor and postprocessor are reconstructed
    independently from their configs. AbsoluteActionsProcessorStep needs a live reference to
    the RelativeActionsProcessorStep so it can read the cached state at inference time.
    That reference is not serializable, so we re-establish it here after loading.
    """
    from lerobot.processor.relative_action_processor import (
        AbsoluteActionsProcessorStep,
        RelativeActionsProcessorStep,
    )

    relative_step = next((s for s in preprocessor.steps if isinstance(s, RelativeActionsProcessorStep)), None)
    if relative_step is None:
        return
    for step in postprocessor.steps:
        if isinstance(step, AbsoluteActionsProcessorStep) and step.relative_step is None:
            step.relative_step = relative_step


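# Minimal sketch of when the re-wiring above runs: make_pre_post_processors() calls it
# right after both pipelines are rebuilt from a pretrained path (see below), so that
# AbsoluteActionsProcessorStep can read the state cached by RelativeActionsProcessorStep
# at inference time:
#
#     preprocessor = PolicyProcessorPipeline.from_pretrained(...)
#     postprocessor = PolicyProcessorPipeline.from_pretrained(...)
#     _reconnect_relative_absolute_steps(preprocessor, postprocessor)
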
def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
"""
|
||||
Retrieves a policy class by its registered name.
|
||||
@@ -67,8 +91,7 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
|
||||
Args:
|
||||
name: The name of the policy. Supported names are "tdmpc", "diffusion", "act",
|
||||
"vqbet", "pi0", "pi05", "sac", "reward_classifier", "smolvla", "wall_x".
|
||||
|
||||
"multi_task_dit", "vqbet", "pi0", "pi05", "sac", "reward_classifier", "smolvla", "wall_x".
|
||||
Returns:
|
||||
The policy class corresponding to the given name.
|
||||
|
||||
@@ -87,6 +110,10 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
from lerobot.policies.act.modeling_act import ACTPolicy
|
||||
|
||||
return ACTPolicy
|
||||
elif name == "multi_task_dit":
|
||||
from lerobot.policies.multi_task_dit.modeling_multi_task_dit import MultiTaskDiTPolicy
|
||||
|
||||
return MultiTaskDiTPolicy
|
||||
elif name == "vqbet":
|
||||
from lerobot.policies.vqbet.modeling_vqbet import VQBeTPolicy
|
||||
|
||||
@@ -147,8 +174,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
|
||||
|
||||
Args:
|
||||
policy_type: The type of the policy. Supported types include "tdmpc",
|
||||
"diffusion", "act", "vqbet", "pi0", "pi05", "sac", "smolvla",
|
||||
"reward_classifier", "wall_x".
|
||||
"multi_task_dit", "diffusion", "act", "vqbet", "pi0", "pi05", "sac",
|
||||
"smolvla", "reward_classifier", "wall_x".
|
||||
**kwargs: Keyword arguments to be passed to the configuration class constructor.
|
||||
|
||||
Returns:
|
||||
@@ -163,6 +190,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
|
||||
return DiffusionConfig(**kwargs)
|
||||
elif policy_type == "act":
|
||||
return ACTConfig(**kwargs)
|
||||
elif policy_type == "multi_task_dit":
|
||||
return MultiTaskDiTConfig(**kwargs)
|
||||
elif policy_type == "vqbet":
|
||||
return VQBeTConfig(**kwargs)
|
||||
elif policy_type == "pi0":
|
||||
@@ -263,26 +292,26 @@ def make_pre_post_processors(
|
||||
kwargs["preprocessor_overrides"] = preprocessor_overrides
|
||||
kwargs["postprocessor_overrides"] = postprocessor_overrides
|
||||
|
||||
return (
|
||||
PolicyProcessorPipeline.from_pretrained(
|
||||
pretrained_model_name_or_path=pretrained_path,
|
||||
config_filename=kwargs.get(
|
||||
"preprocessor_config_filename", f"{POLICY_PREPROCESSOR_DEFAULT_NAME}.json"
|
||||
),
|
||||
overrides=kwargs.get("preprocessor_overrides", {}),
|
||||
to_transition=batch_to_transition,
|
||||
to_output=transition_to_batch,
|
||||
),
|
||||
PolicyProcessorPipeline.from_pretrained(
|
||||
pretrained_model_name_or_path=pretrained_path,
|
||||
config_filename=kwargs.get(
|
||||
"postprocessor_config_filename", f"{POLICY_POSTPROCESSOR_DEFAULT_NAME}.json"
|
||||
),
|
||||
overrides=kwargs.get("postprocessor_overrides", {}),
|
||||
to_transition=policy_action_to_transition,
|
||||
to_output=transition_to_policy_action,
|
||||
preprocessor = PolicyProcessorPipeline.from_pretrained(
|
||||
pretrained_model_name_or_path=pretrained_path,
|
||||
config_filename=kwargs.get(
|
||||
"preprocessor_config_filename", f"{POLICY_PREPROCESSOR_DEFAULT_NAME}.json"
|
||||
),
|
||||
overrides=kwargs.get("preprocessor_overrides", {}),
|
||||
to_transition=batch_to_transition,
|
||||
to_output=transition_to_batch,
|
||||
)
|
||||
postprocessor = PolicyProcessorPipeline.from_pretrained(
|
||||
pretrained_model_name_or_path=pretrained_path,
|
||||
config_filename=kwargs.get(
|
||||
"postprocessor_config_filename", f"{POLICY_POSTPROCESSOR_DEFAULT_NAME}.json"
|
||||
),
|
||||
overrides=kwargs.get("postprocessor_overrides", {}),
|
||||
to_transition=policy_action_to_transition,
|
||||
to_output=transition_to_policy_action,
|
||||
)
|
||||
_reconnect_relative_absolute_steps(preprocessor, postprocessor)
|
||||
return preprocessor, postprocessor
|
||||
|
||||
# Create a new processor based on policy type
|
||||
if isinstance(policy_cfg, TDMPCConfig):
|
||||
@@ -309,6 +338,16 @@ def make_pre_post_processors(
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
elif isinstance(policy_cfg, MultiTaskDiTConfig):
|
||||
from lerobot.policies.multi_task_dit.processor_multi_task_dit import (
|
||||
make_multi_task_dit_pre_post_processors,
|
||||
)
|
||||
|
||||
processors = make_multi_task_dit_pre_post_processors(
|
||||
config=policy_cfg,
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
elif isinstance(policy_cfg, VQBeTConfig):
|
||||
from lerobot.policies.vqbet.processor_vqbet import make_vqbet_pre_post_processors
|
||||
|
||||
@@ -470,6 +509,13 @@ def make_policy(
|
||||
cfg.output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
|
||||
if not cfg.input_features:
|
||||
cfg.input_features = {key: ft for key, ft in features.items() if key not in cfg.output_features}
|
||||
|
||||
# Store action feature names for relative_exclude_joints support
|
||||
if ds_meta is not None and hasattr(cfg, "action_feature_names"):
|
||||
action_names = ds_meta.features.get(ACTION, {}).get("names")
|
||||
if action_names is not None:
|
||||
cfg.action_feature_names = list(action_names)
|
||||
|
||||
kwargs["config"] = cfg
|
||||
|
||||
# Pass dataset_stats to the policy if available (needed for some policies like SARM)
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../docs/source/policy_multi_task_dit_README.md
|
||||
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from .modeling_multi_task_dit import MultiTaskDiTPolicy
|
||||
from .processor_multi_task_dit import make_multi_task_dit_pre_post_processors
|
||||
|
||||
__all__ = ["MultiTaskDiTConfig", "MultiTaskDiTPolicy", "make_multi_task_dit_pre_post_processors"]
|
||||
@@ -0,0 +1,256 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import NormalizationMode
|
||||
from lerobot.optim.optimizers import AdamConfig
|
||||
from lerobot.optim.schedulers import DiffuserSchedulerConfig
|
||||
|
||||
|
||||
@PreTrainedConfig.register_subclass("multi_task_dit")
|
||||
@dataclass
|
||||
class MultiTaskDiTConfig(PreTrainedConfig):
|
||||
"""Configuration for the Multi-Task Diffusion Transformer (DiT) policy.
|
||||
|
||||
A transformer-based policy that supports both diffusion and flow matching objectives
|
||||
for multi-task robot learning with text and vision conditioning.
|
||||
"""
|
||||
|
||||
n_obs_steps: int = 2 # Number of observation steps for temporal context
|
||||
horizon: int = 32 # Number of action steps to predict
|
||||
n_action_steps: int = 24 # Actions executed per policy call (~0.8s at 30Hz)
|
||||
|
||||
# Objective Selection
|
||||
objective: str = "diffusion" # "diffusion" or "flow_matching"
|
||||
|
||||
# --- Diffusion-specific (used when objective="diffusion") ---
|
||||
noise_scheduler_type: str = "DDPM" # "DDPM" or "DDIM"
|
||||
num_train_timesteps: int = 100 # Number of diffusion timesteps
|
||||
beta_schedule: str = "squaredcos_cap_v2" # Noise schedule type
|
||||
beta_start: float = 0.0001 # Starting noise level
|
||||
beta_end: float = 0.02 # Ending noise level
|
||||
prediction_type: str = "epsilon" # "epsilon" (predict noise) or "sample" (predict clean)
|
||||
clip_sample: bool = True # Clip samples during denoising
|
||||
clip_sample_range: float = 1.0 # Clipping range [-x, x]
|
||||
num_inference_steps: int | None = None # Denoising steps at inference (defaults to num_train_timesteps)
|
||||
|
||||
# --- Flow Matching-specific (used when objective="flow_matching") ---
|
||||
sigma_min: float = 0.0 # Minimum noise in flow interpolation path
|
||||
num_integration_steps: int = 100 # ODE integration steps at inference
|
||||
integration_method: str = "euler" # ODE solver: "euler" or "rk4"
|
||||
timestep_sampling_strategy: str = "beta" # "uniform" or "beta"
|
||||
|
||||
timestep_sampling_s: float = 0.999 # (beta only) Max timestep threshold
|
||||
timestep_sampling_alpha: float = 1.5 # (beta only) Beta distribution alpha
|
||||
timestep_sampling_beta: float = 1.0 # (beta only) Beta distribution beta
|
||||
|
||||
# Transformer Architecture
|
||||
hidden_dim: int = 512 # Transformer hidden dimension
|
||||
num_layers: int = 6 # Number of transformer layers
|
||||
num_heads: int = 8 # Number of attention heads
|
||||
dropout: float = 0.1 # Dropout rate
|
||||
use_positional_encoding: bool = False # Use absolute positional encoding
|
||||
timestep_embed_dim: int = 256 # Timestep embedding dimension
|
||||
use_rope: bool = True # Use Rotary Position Embedding
|
||||
rope_base: float = 10000.0 # RoPE base frequency
|
||||
|
||||
# Vision Encoder (CLIP)
|
||||
vision_encoder_name: str = "openai/clip-vit-base-patch16" # HuggingFace CLIP model
|
||||
use_separate_rgb_encoder_per_camera: bool = False # Separate encoder per camera view
|
||||
vision_encoder_lr_multiplier: float = 0.1 # LR multiplier for vision encoder
|
||||
image_resize_shape: tuple[int, int] | None = None # Resize images before crop
|
||||
image_crop_shape: tuple[int, int] | None = (224, 224) # Crop shape (CLIP default)
|
||||
image_crop_is_random: bool = True # Random crop during training, center at inference
|
||||
|
||||
# Text Encoder (CLIP)
|
||||
text_encoder_name: str = "openai/clip-vit-base-patch16" # HuggingFace CLIP model
|
||||
tokenizer_max_length: int = 77 # Max length for tokenized text (CLIP default is 77)
|
||||
tokenizer_padding: str = "max_length" # Padding strategy: "max_length" or "longest"
|
||||
tokenizer_padding_side: str = "right" # Padding side: "left" or "right"
|
||||
tokenizer_truncation: bool = True # Whether to truncate sequences longer than max_length
|
||||
|
||||
# Normalization
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
"VISUAL": NormalizationMode.MEAN_STD,
|
||||
"STATE": NormalizationMode.MIN_MAX,
|
||||
"ACTION": NormalizationMode.MIN_MAX,
|
||||
}
|
||||
)
|
||||
|
||||
# Training/Optimizer
|
||||
optimizer_lr: float = 2e-5
|
||||
optimizer_betas: tuple = (0.95, 0.999)
|
||||
optimizer_eps: float = 1e-8
|
||||
optimizer_weight_decay: float = 0.0
|
||||
scheduler_name: str = "cosine"
|
||||
scheduler_warmup_steps: int = 0
|
||||
do_mask_loss_for_padding: bool = False
|
||||
|
||||
# Auto-calculated
|
||||
drop_n_last_frames: int | None = None
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
|
||||
if self.drop_n_last_frames is None:
|
||||
self.drop_n_last_frames = self.horizon - self.n_action_steps - self.n_obs_steps + 1
|
||||
|
||||
self._validate()
|
||||
|
||||
def _validate(self):
|
||||
"""Validate configuration parameters."""
|
||||
# Objective validation
|
||||
if self.objective not in ["diffusion", "flow_matching"]:
|
||||
raise ValueError(f"objective must be 'diffusion' or 'flow_matching', got '{self.objective}'")
|
||||
|
||||
# Transformer validation
|
||||
if self.hidden_dim <= 0:
|
||||
raise ValueError("hidden_dim must be positive")
|
||||
if self.num_layers <= 0:
|
||||
raise ValueError("num_layers must be positive")
|
||||
if self.num_heads <= 0:
|
||||
raise ValueError("num_heads must be positive")
|
||||
if self.hidden_dim % self.num_heads != 0:
|
||||
raise ValueError("hidden_dim must be divisible by num_heads")
|
||||
if not (0.0 <= self.dropout <= 1.0):
|
||||
raise ValueError("dropout must be between 0.0 and 1.0")
|
||||
|
||||
# Vision encoder validation
|
||||
if "clip" not in self.vision_encoder_name.lower():
|
||||
raise ValueError(
|
||||
f"vision_encoder_name must be a CLIP model (contain 'clip'), got '{self.vision_encoder_name}'"
|
||||
)
|
||||
if (
|
||||
self.image_resize_shape
|
||||
and self.image_crop_shape
|
||||
and (
|
||||
self.image_crop_shape[0] > self.image_resize_shape[0]
|
||||
or self.image_crop_shape[1] > self.image_resize_shape[1]
|
||||
)
|
||||
):
|
||||
logging.warning(
|
||||
"image_crop_shape %s must be <= image_resize_shape %s; disabling cropping.",
|
||||
self.image_crop_shape,
|
||||
self.image_resize_shape,
|
||||
)
|
||||
self.image_crop_shape = None
|
||||
|
||||
# Text encoder validation
|
||||
if "clip" not in self.text_encoder_name.lower():
|
||||
raise ValueError(
|
||||
f"text_encoder_name must be a CLIP model (contain 'clip'), got '{self.text_encoder_name}'"
|
||||
)
|
||||
|
||||
# Objective-specific validation
|
||||
if self.objective == "diffusion":
|
||||
if self.noise_scheduler_type not in ["DDPM", "DDIM"]:
|
||||
raise ValueError(
|
||||
f"noise_scheduler_type must be 'DDPM' or 'DDIM', got {self.noise_scheduler_type}"
|
||||
)
|
||||
if self.prediction_type not in ["epsilon", "sample"]:
|
||||
raise ValueError(f"prediction_type must be 'epsilon' or 'sample', got {self.prediction_type}")
|
||||
if self.num_train_timesteps <= 0:
|
||||
raise ValueError(f"num_train_timesteps must be positive, got {self.num_train_timesteps}")
|
||||
if not (0.0 <= self.beta_start <= self.beta_end <= 1.0):
|
||||
raise ValueError(f"Invalid beta values: {self.beta_start}, {self.beta_end}")
|
||||
|
||||
elif self.objective == "flow_matching":
|
||||
if not (0.0 <= self.sigma_min <= 1.0):
|
||||
raise ValueError(f"sigma_min must be in [0, 1], got {self.sigma_min}")
|
||||
if self.num_integration_steps <= 0:
|
||||
raise ValueError(f"num_integration_steps must be positive, got {self.num_integration_steps}")
|
||||
if self.integration_method not in ["euler", "rk4"]:
|
||||
raise ValueError(
|
||||
f"integration_method must be 'euler' or 'rk4', got {self.integration_method}"
|
||||
)
|
||||
if self.timestep_sampling_strategy not in ["uniform", "beta"]:
|
||||
raise ValueError("timestep_sampling_strategy must be 'uniform' or 'beta'")
|
||||
if self.timestep_sampling_strategy == "beta":
|
||||
if not (0.0 < self.timestep_sampling_s <= 1.0):
|
||||
raise ValueError(f"timestep_sampling_s must be in (0, 1], got {self.timestep_sampling_s}")
|
||||
if self.timestep_sampling_alpha <= 0:
|
||||
raise ValueError("timestep_sampling_alpha must be positive")
|
||||
if self.timestep_sampling_beta <= 0:
|
||||
raise ValueError("timestep_sampling_beta must be positive")
|
||||
|
||||
def get_optimizer_preset(self) -> AdamConfig:
|
||||
return AdamConfig(
|
||||
lr=self.optimizer_lr,
|
||||
betas=self.optimizer_betas,
|
||||
eps=self.optimizer_eps,
|
||||
weight_decay=self.optimizer_weight_decay,
|
||||
)
|
||||
|
||||
def get_scheduler_preset(self) -> DiffuserSchedulerConfig:
|
||||
return DiffuserSchedulerConfig(
|
||||
name=self.scheduler_name,
|
||||
num_warmup_steps=self.scheduler_warmup_steps,
|
||||
)
|
||||
|
||||
def validate_features(self) -> None:
|
||||
"""Validate that required input features are present and properly configured."""
|
||||
# If the configured crop doesn't fit, disable cropping instead of erroring.
|
||||
# Note: if image_resize_shape is set, cropping is applied *after* resizing.
|
||||
if self.image_crop_shape is not None:
|
||||
for key, image_ft in self.image_features.items():
|
||||
# image_ft.shape is (C, H, W)
|
||||
effective_h, effective_w = (
|
||||
self.image_resize_shape
|
||||
if self.image_resize_shape is not None
|
||||
else (image_ft.shape[1], image_ft.shape[2])
|
||||
)
|
||||
if self.image_crop_shape[0] > effective_h or self.image_crop_shape[1] > effective_w:
|
||||
logging.warning(
|
||||
"image_crop_shape %s doesn't fit within effective image shape (%s, %s) for '%s'; disabling cropping.",
|
||||
self.image_crop_shape,
|
||||
effective_h,
|
||||
effective_w,
|
||||
key,
|
||||
)
|
||||
self.image_crop_shape = None
|
||||
break
|
||||
|
||||
if len(self.image_features) > 0:
|
||||
first_key, first_ft = next(iter(self.image_features.items()))
|
||||
for key, image_ft in self.image_features.items():
|
||||
if image_ft.shape != first_ft.shape:
|
||||
raise ValueError(
|
||||
f"Image '{key}' shape {image_ft.shape} != '{first_key}' shape {first_ft.shape}"
|
||||
)
|
||||
|
||||
@property
|
||||
def is_diffusion(self) -> bool:
|
||||
return self.objective == "diffusion"
|
||||
|
||||
@property
|
||||
def is_flow_matching(self) -> bool:
|
||||
return self.objective == "flow_matching"
|
||||
|
||||
@property
|
||||
def observation_delta_indices(self) -> list:
|
||||
return list(range(1 - self.n_obs_steps, 1))
|
||||
|
||||
@property
|
||||
def action_delta_indices(self) -> list:
|
||||
return list(range(1 - self.n_obs_steps, 1 - self.n_obs_steps + self.horizon))
|
||||
|
||||
@property
|
||||
def reward_delta_indices(self) -> None:
|
||||
return None
|
||||
@@ -0,0 +1,803 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Multi-Task Diffusion Transformer (DiT) Policy
|
||||
|
||||
Transformer-based diffusion policy for multi-task robot learning with text and vision conditioning.
|
||||
Supports both diffusion and flow matching objectives for action generation.
|
||||
|
||||
References:
|
||||
- https://arxiv.org/abs/2507.05331
|
||||
- https://bostondynamics.com/blog/large-behavior-models-atlas-find-new-footing/
|
||||
- https://brysonkjones.substack.com/p/dissecting-and-open-sourcing-multitask-diffusion-transformer-policy
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections import deque
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import einops
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
import torchvision
|
||||
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
|
||||
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
|
||||
from torch import Tensor
|
||||
|
||||
from lerobot.policies.multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from lerobot.utils.import_utils import _transformers_available
|
||||
|
||||
# Conditional import for type checking and lazy loading
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers import CLIPTextModel, CLIPVisionModel
|
||||
else:
|
||||
CLIPTextModel = None
|
||||
CLIPVisionModel = None
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.policies.utils import populate_queues
|
||||
from lerobot.utils.constants import (
|
||||
ACTION,
|
||||
OBS_IMAGES,
|
||||
OBS_LANGUAGE_ATTENTION_MASK,
|
||||
OBS_LANGUAGE_TOKENS,
|
||||
OBS_STATE,
|
||||
)
|
||||
|
||||
# -- Policy --
|
||||
|
||||
|
||||
class MultiTaskDiTPolicy(PreTrainedPolicy):
|
||||
config_class = MultiTaskDiTConfig
|
||||
name = "multi_task_dit"
|
||||
|
||||
def __init__(self, config: MultiTaskDiTConfig, **kwargs):
|
||||
super().__init__(config)
|
||||
config.validate_features()
|
||||
self.config = config
|
||||
|
||||
self._queues = None
|
||||
|
||||
self.observation_encoder = ObservationEncoder(config)
|
||||
conditioning_dim = self.observation_encoder.conditioning_dim
|
||||
self.noise_predictor = DiffusionTransformer(config, conditioning_dim=conditioning_dim)
|
||||
|
||||
action_dim = config.action_feature.shape[0]
|
||||
horizon = config.horizon
|
||||
|
||||
if config.is_diffusion:
|
||||
self.objective = DiffusionObjective(
|
||||
config,
|
||||
action_dim=action_dim,
|
||||
horizon=horizon,
|
||||
do_mask_loss_for_padding=config.do_mask_loss_for_padding,
|
||||
)
|
||||
elif config.is_flow_matching:
|
||||
self.objective = FlowMatchingObjective(
|
||||
config,
|
||||
action_dim=action_dim,
|
||||
horizon=horizon,
|
||||
do_mask_loss_for_padding=config.do_mask_loss_for_padding,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported objective: {config.objective}")
|
||||
|
||||
self.reset()
|
||||
|
||||
def get_optim_params(self) -> list:
|
||||
"""Returns parameter groups with different learning rates for vision vs non-vision parameters"""
|
||||
non_vision_params = []
|
||||
vision_encoder_params = []
|
||||
|
||||
for name, param in self.named_parameters():
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
|
||||
if "observation_encoder.vision_encoder" in name:
|
||||
vision_encoder_params.append(param)
|
||||
else:
|
||||
non_vision_params.append(param)
|
||||
|
||||
return [
|
||||
{"params": non_vision_params},
|
||||
{
|
||||
"params": vision_encoder_params,
|
||||
"lr": self.config.optimizer_lr * self.config.vision_encoder_lr_multiplier,
|
||||
},
|
||||
]
|
||||
|
||||
def _generate_actions(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
batch_size, n_obs_steps = batch[OBS_STATE].shape[:2]
|
||||
assert n_obs_steps == self.config.n_obs_steps
|
||||
|
||||
conditioning_vec = self.observation_encoder.encode(batch)
|
||||
actions = self.objective.conditional_sample(self.noise_predictor, batch_size, conditioning_vec)
|
||||
|
||||
start = n_obs_steps - 1
|
||||
end = start + self.config.n_action_steps
|
||||
actions = actions[:, start:end]
|
||||
return actions
|
||||
|
||||
def reset(self):
|
||||
"""Clear observation and action queues. Should be called on `env.reset()`"""
|
||||
self._queues = {
|
||||
OBS_STATE: deque(maxlen=self.config.n_obs_steps),
|
||||
ACTION: deque(maxlen=self.config.n_action_steps),
|
||||
}
|
||||
|
||||
if self.config.image_features:
|
||||
self._queues[OBS_IMAGES] = deque(maxlen=self.config.n_obs_steps)
|
||||
|
||||
@torch.no_grad()
|
||||
def predict_action_chunk(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Predict a chunk of actions given environment observations"""
|
||||
self.eval()
|
||||
|
||||
for k in batch:
|
||||
if k in self._queues:
|
||||
batch[k] = torch.stack(list(self._queues[k]), dim=1)
|
||||
|
||||
actions = self._generate_actions(batch)
|
||||
return actions
|
||||
|
||||
def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
|
||||
"""Prepare batch by stacking image features if needed."""
|
||||
if self.config.image_features:
|
||||
batch = dict(batch) # shallow copy to avoid modifying original
|
||||
batch[OBS_IMAGES] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
|
||||
|
||||
return batch
|
||||
|
||||
@torch.no_grad()
|
||||
def select_action(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Select a single action given environment observations"""
|
||||
if ACTION in batch:
|
||||
batch = dict(batch) # shallow copy to avoid modifying original
|
||||
batch.pop(ACTION)
|
||||
|
||||
batch = self._prepare_batch(batch)
|
||||
|
||||
self._queues = populate_queues(self._queues, batch)
|
||||
|
||||
if len(self._queues[ACTION]) == 0:
|
||||
actions = self.predict_action_chunk(batch)
|
||||
self._queues[ACTION].extend(actions.transpose(0, 1))
|
||||
|
||||
action = self._queues[ACTION].popleft()
|
||||
return action
|
||||
|
||||
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict | None]:
|
||||
"""Run the batch through the model and compute the loss for training"""
|
||||
batch = self._prepare_batch(batch)
|
||||
|
||||
conditioning_vec = self.observation_encoder.encode(batch)
|
||||
loss = self.objective.compute_loss(self.noise_predictor, batch, conditioning_vec)
|
||||
|
||||
return loss, None
|
||||
|
||||
|
||||
# -- Observation Encoders --
|
||||
|
||||
|
||||
class CLIPVisionEncoder(nn.Module):
|
||||
"""CLIP vision encoder using the CLS token for global image representation."""
|
||||
|
||||
def __init__(self, model_name: str):
|
||||
super().__init__()
|
||||
self.model_name = model_name
|
||||
self.model = CLIPVisionModel.from_pretrained(self.model_name)
|
||||
self.num_non_spatial_tokens = 1
|
||||
self.embed_dim = self.model.config.hidden_size
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
"""Encode RGB image to CLS token."""
|
||||
outputs = self.model(pixel_values=x, output_hidden_states=False)
|
||||
cls_token = outputs.last_hidden_state[:, 0]
|
||||
b, embed_dim = cls_token.shape
|
||||
return cls_token.reshape(b, embed_dim, 1, 1)
|
||||
|
||||
def get_output_shape(self) -> tuple:
|
||||
return (self.embed_dim, 1, 1)
|
||||
|
||||
|
||||
class CLIPTextEncoder(nn.Module):
|
||||
"""CLIP text encoder with frozen weights and a learnable projection layer.
|
||||
|
||||
Accepts pre-tokenized inputs (input_ids and attention_mask) from the processor pipeline, which
|
||||
handles the tokenization before the batch reaches this encoder.
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: str = "openai/clip-vit-base-patch16", projection_dim: int = 512):
|
||||
super().__init__()
|
||||
self.model_name = model_name
|
||||
self.projection_dim = projection_dim
|
||||
self.text_encoder = CLIPTextModel.from_pretrained(model_name)
|
||||
|
||||
for param in self.text_encoder.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
self.text_embed_dim = self.text_encoder.config.hidden_size
|
||||
self.projection = nn.Linear(self.text_embed_dim, projection_dim)
|
||||
|
||||
def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
|
||||
"""Encode pre-tokenized text to feature vectors."""
|
||||
# Ensure inputs are on the same device as the model
|
||||
device = next(self.parameters()).device
|
||||
input_ids = input_ids.to(device)
|
||||
attention_mask = attention_mask.to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
|
||||
clip_features = outputs.pooler_output
|
||||
|
||||
return self.projection(clip_features)
|
||||
|
||||
|
||||
class ObservationEncoder(nn.Module):
|
||||
"""Handles all observation processing for the conditioning vector."""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self._setup_preprocessing(config)
|
||||
|
||||
if config.image_features:
|
||||
self.num_cameras = len(config.image_features)
|
||||
self.camera_names = list(config.image_features.keys())
|
||||
|
||||
if config.use_separate_rgb_encoder_per_camera:
|
||||
self.vision_encoders = nn.ModuleList(
|
||||
[CLIPVisionEncoder(model_name=config.vision_encoder_name) for _ in self.camera_names]
|
||||
)
|
||||
self.vision_encoder = None
|
||||
else:
|
||||
self.vision_encoder = CLIPVisionEncoder(model_name=config.vision_encoder_name)
|
||||
self.vision_encoders = None
|
||||
else:
|
||||
self.vision_encoder = None
|
||||
self.vision_encoders = None
|
||||
self.camera_names = []
|
||||
self.num_cameras = 0
|
||||
|
||||
if hasattr(config, "robot_state_feature") and config.robot_state_feature:
|
||||
self.robot_state_dim = config.robot_state_feature.shape[0]
|
||||
else:
|
||||
self.robot_state_dim = 0
|
||||
|
||||
self.text_dim = config.hidden_dim
|
||||
self.text_encoder = CLIPTextEncoder(model_name=config.text_encoder_name, projection_dim=self.text_dim)
|
||||
|
||||
self._setup_vector_output()
|
||||
|
||||
def _apply_preprocessing(self, images: Tensor) -> Tensor:
|
||||
if self.do_resize:
|
||||
images = self.resize(images)
|
||||
if self.do_crop:
|
||||
images = self.maybe_random_crop(images) if self.training else self.center_crop(images)
|
||||
return images
|
||||
|
||||
def _setup_preprocessing(self, config):
|
||||
if config.image_resize_shape is not None:
|
||||
self.do_resize = True
|
||||
self.resize = torchvision.transforms.Resize(
|
||||
size=config.image_resize_shape,
|
||||
interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
|
||||
antialias=True,
|
||||
)
|
||||
else:
|
||||
self.do_resize = False
|
||||
|
||||
if config.image_crop_shape is not None:
|
||||
self.do_crop = True
|
||||
self.center_crop = torchvision.transforms.CenterCrop(config.image_crop_shape)
|
||||
if config.image_crop_is_random:
|
||||
self.maybe_random_crop = torchvision.transforms.RandomCrop(config.image_crop_shape)
|
||||
else:
|
||||
self.maybe_random_crop = self.center_crop
|
||||
else:
|
||||
self.do_crop = False
|
||||
|
||||
def _setup_vector_output(self):
|
||||
total_dim = 0
|
||||
|
||||
if self.vision_encoder is not None or self.vision_encoders is not None:
|
||||
encoder_to_check = self.vision_encoder or next(iter(self.vision_encoders))
|
||||
feature_map_shape = encoder_to_check.get_output_shape()
|
||||
c, h, w = feature_map_shape
|
||||
spatial_feature_dim = c * h * w
|
||||
total_dim += spatial_feature_dim * self.num_cameras
|
||||
|
||||
total_dim += self.robot_state_dim
|
||||
total_dim += self.text_dim
|
||||
|
||||
self.conditioning_dim = total_dim * self.config.n_obs_steps
|
||||
|
||||
def encode(self, batch: dict) -> Tensor:
|
||||
"""Encode observations to vector format."""
|
||||
batch_size, n_obs_steps = batch[OBS_STATE].shape[:2]
|
||||
conditioning_feats = []
|
||||
|
||||
conditioning_feats.append(batch[OBS_STATE])
|
||||
|
||||
if self.vision_encoder is not None or self.vision_encoders is not None:
|
||||
images = batch[OBS_IMAGES]
|
||||
|
||||
if len(images.shape) == 5:
|
||||
images = images.unsqueeze(1)
|
||||
|
||||
if self.config.use_separate_rgb_encoder_per_camera:
|
||||
camera_features = []
|
||||
for cam_idx in range(self.num_cameras):
|
||||
cam_images = images[:, :, cam_idx]
|
||||
cam_images_flat = einops.rearrange(cam_images, "b s c h w -> (b s) c h w")
|
||||
cam_images_flat = self._apply_preprocessing(cam_images_flat)
|
||||
cam_features = self.vision_encoders[cam_idx](cam_images_flat)
|
||||
cam_visual_features = cam_features.flatten(start_dim=1)
|
||||
cam_features_reshaped = einops.rearrange(
|
||||
cam_visual_features, "(b s) f -> b s f", b=batch_size, s=n_obs_steps
|
||||
)
|
||||
camera_features.append(cam_features_reshaped)
|
||||
img_features = torch.cat(camera_features, dim=-1)
|
||||
conditioning_feats.append(img_features)
|
||||
else:
|
||||
images_flat = einops.rearrange(images, "b s n c h w -> (b s n) c h w")
|
||||
images_flat = self._apply_preprocessing(images_flat)
|
||||
visual_features = self.vision_encoder(images_flat).flatten(start_dim=1)
|
||||
img_features = einops.rearrange(
|
||||
visual_features, "(b s n) f -> b s (n f)", b=batch_size, s=n_obs_steps, n=self.num_cameras
|
||||
)
|
||||
conditioning_feats.append(img_features)
|
||||
|
||||
if self.text_encoder is not None and OBS_LANGUAGE_TOKENS in batch:
|
||||
input_ids = batch[OBS_LANGUAGE_TOKENS] # [batch_size, seq_length]
|
||||
attention_mask = batch[OBS_LANGUAGE_ATTENTION_MASK] # [batch_size, seq_length]
|
||||
|
||||
text_features = self.text_encoder(input_ids, attention_mask)
|
||||
|
||||
text_features = text_features.unsqueeze(1).expand(-1, n_obs_steps, -1)
|
||||
conditioning_feats.append(text_features)
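# Per observation step the conditioning is [robot state | per-camera CLIP CLS features |
# projected text features (when language tokens are present)]; flattening over time below
# yields (B, n_obs_steps * per_step_dim), matching conditioning_dim from _setup_vector_output().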
|
||||
|
||||
combined_features = torch.cat(conditioning_feats, dim=-1)
|
||||
return combined_features.flatten(start_dim=1)
|
||||
|
||||
|
||||
# -- Transformer Components --
|
||||
|
||||
|
||||
def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
|
||||
"""Modulate input with shift and scale for AdaLN-Zero."""
|
||||
return x * (1 + scale) + shift
|
||||
|
||||
|
||||
class SinusoidalPosEmb(nn.Module):
|
||||
"""Sinusoidal positional embeddings for timesteps."""
|
||||
|
||||
def __init__(self, dim: int):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
device = x.device
|
||||
half_dim = self.dim // 2
|
||||
emb = math.log(10000) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
|
||||
emb = x[:, None] * emb[None, :]
|
||||
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
|
||||
return emb
|
||||
|
||||
|
||||
class RotaryPositionalEmbedding(nn.Module):
|
||||
"""Rotary Position Embedding (RoPE) for transformers."""
|
||||
|
||||
def __init__(self, head_dim: int, max_seq_len: int = 512, base: float = 10000.0):
|
||||
super().__init__()
|
||||
assert head_dim % 2 == 0, "head_dim must be even for RoPE"
|
||||
|
||||
self.head_dim = head_dim
|
||||
self.max_seq_len = max_seq_len
|
||||
self.base = base
|
||||
|
||||
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
|
||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||
self._precompute_cache(max_seq_len)
|
||||
|
||||
def _precompute_cache(self, seq_len: int):
|
||||
t = torch.arange(seq_len, dtype=self.inv_freq.dtype)
|
||||
freqs = torch.outer(t, self.inv_freq)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
self.register_buffer("_cos_cached", emb.cos()[None, None, :, :], persistent=False)
|
||||
self.register_buffer("_sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
||||
|
||||
def _rotate_half(self, x: Tensor) -> Tensor:
|
||||
x1 = x[..., : x.shape[-1] // 2]
|
||||
x2 = x[..., x.shape[-1] // 2 :]
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
def forward(self, q: Tensor, k: Tensor) -> tuple[Tensor, Tensor]:
|
||||
seq_len = q.shape[2]
|
||||
if seq_len > self.max_seq_len:
|
||||
raise ValueError(f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}.")
|
||||
|
||||
cos = self._cos_cached[:, :, :seq_len, :].to(q.dtype)
|
||||
sin = self._sin_cached[:, :, :seq_len, :].to(q.dtype)
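# Standard RoPE rotation: each (first-half, second-half) dimension pair is rotated by a
# position-dependent angle via x * cos + rotate_half(x) * sin.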
|
||||
|
||||
q_rotated = (q * cos) + (self._rotate_half(q) * sin)
|
||||
k_rotated = (k * cos) + (self._rotate_half(k) * sin)
|
||||
return q_rotated, k_rotated
|
||||
|
||||
|
||||
class RoPEAttention(nn.Module):
|
||||
"""Multi-head self-attention with Rotary Position Embedding (RoPE)."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
dropout: float = 0.0,
|
||||
max_seq_len: int = 512,
|
||||
rope_base: float = 10000.0,
|
||||
):
|
||||
super().__init__()
|
||||
assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = hidden_size // num_heads
|
||||
self.scale = self.head_dim**-0.5
|
||||
|
||||
self.qkv_proj = nn.Linear(hidden_size, 3 * hidden_size, bias=True)
|
||||
self.out_proj = nn.Linear(hidden_size, hidden_size, bias=True)
|
||||
self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
|
||||
self.rope = RotaryPositionalEmbedding(head_dim=self.head_dim, max_seq_len=max_seq_len, base=rope_base)
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
B, T, _ = x.shape # noqa: N806
|
||||
|
||||
qkv = self.qkv_proj(x)
|
||||
qkv = qkv.reshape(B, T, 3, self.num_heads, self.head_dim)
|
||||
qkv = qkv.permute(2, 0, 3, 1, 4)
|
||||
q, k, v = qkv[0], qkv[1], qkv[2]
|
||||
|
||||
q, k = self.rope(q, k)
|
||||
|
||||
attn_out = torch.nn.functional.scaled_dot_product_attention(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
dropout_p=self.dropout.p if isinstance(self.dropout, nn.Dropout) and self.training else 0.0,
|
||||
)
|
||||
|
||||
attn_out = attn_out.transpose(1, 2).reshape(B, T, self.hidden_size)
|
||||
return self.out_proj(attn_out)
|
||||
|
||||
|
||||
class TransformerBlock(nn.Module):
|
||||
"""DiT-style transformer block with AdaLN-Zero."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 128,
|
||||
num_heads: int = 4,
|
||||
num_features: int = 128,
|
||||
dropout: float = 0.0,
|
||||
use_rope: bool = False,
|
||||
max_seq_len: int = 512,
|
||||
rope_base: float = 10000.0,
|
||||
):
|
||||
super().__init__()
|
||||
self.use_rope = use_rope
|
||||
|
||||
if use_rope:
|
||||
self.attn = RoPEAttention(
|
||||
hidden_size=hidden_size,
|
||||
num_heads=num_heads,
|
||||
dropout=dropout,
|
||||
max_seq_len=max_seq_len,
|
||||
rope_base=rope_base,
|
||||
)
|
||||
else:
|
||||
self.multihead_attn = nn.MultiheadAttention(
|
||||
hidden_size, num_heads=num_heads, batch_first=True, dropout=dropout
|
||||
)
|
||||
|
||||
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
|
||||
self.mlp = nn.Sequential(
|
||||
nn.Linear(hidden_size, hidden_size * 4),
|
||||
nn.GELU(approximate="tanh"),
|
||||
nn.Linear(hidden_size * 4, hidden_size),
|
||||
)
|
||||
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(num_features, 6 * hidden_size, bias=True))
|
||||
|
||||
def forward(self, x: Tensor, features: Tensor) -> Tensor:
|
||||
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(
|
||||
features
|
||||
).chunk(6, dim=1)
|
||||
|
||||
attn_input = modulate(self.norm1(x), shift_msa.unsqueeze(1), scale_msa.unsqueeze(1))
|
||||
|
||||
if self.use_rope:
|
||||
attn_out = self.attn(attn_input)
|
||||
else:
|
||||
attn_out, _ = self.multihead_attn(attn_input, attn_input, attn_input)
|
||||
|
||||
x = x + gate_msa.unsqueeze(1) * attn_out
|
||||
|
||||
mlp_input = modulate(self.norm2(x), shift_mlp.unsqueeze(1), scale_mlp.unsqueeze(1))
|
||||
mlp_out = self.mlp(mlp_input)
|
||||
x = x + gate_mlp.unsqueeze(1) * mlp_out
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class DiffusionTransformer(nn.Module):
|
||||
"""Transformer-based diffusion noise prediction model."""
|
||||
|
||||
def __init__(self, config, conditioning_dim: int):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.conditioning_dim = conditioning_dim
|
||||
|
||||
self.action_dim = config.action_feature.shape[0]
|
||||
self.horizon = config.horizon
|
||||
self.hidden_size = config.hidden_dim
|
||||
self.num_layers = config.num_layers
|
||||
self.num_heads = config.num_heads
|
||||
self.dropout = config.dropout
|
||||
self.use_rope = config.use_rope
|
||||
|
||||
self.timestep_embed_dim = config.timestep_embed_dim
|
||||
self.time_mlp = nn.Sequential(
|
||||
SinusoidalPosEmb(self.timestep_embed_dim),
|
||||
nn.Linear(self.timestep_embed_dim, 2 * self.timestep_embed_dim),
|
||||
nn.GELU(),
|
||||
nn.Linear(2 * self.timestep_embed_dim, self.timestep_embed_dim),
|
||||
nn.GELU(),
|
||||
)
|
||||
|
||||
self.cond_dim = self.timestep_embed_dim + conditioning_dim
|
||||
self.input_proj = nn.Linear(self.action_dim, self.hidden_size)
|
||||
|
||||
if config.use_positional_encoding:
|
||||
self.pos_embedding = nn.Parameter(
|
||||
torch.empty(1, self.horizon, self.hidden_size).normal_(std=0.02)
|
||||
)
|
||||
else:
|
||||
self.pos_embedding = None
|
||||
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
TransformerBlock(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
num_features=self.cond_dim,
|
||||
dropout=self.dropout,
|
||||
use_rope=self.use_rope,
|
||||
max_seq_len=self.horizon,
|
||||
rope_base=config.rope_base,
|
||||
)
|
||||
for _ in range(self.num_layers)
|
||||
]
|
||||
)
|
||||
|
||||
self.output_proj = nn.Linear(self.hidden_size, self.action_dim)
|
||||
self._initialize_weights()
|
||||
|
||||
def _initialize_weights(self):
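# AdaLN-Zero initialization: zeroing the final modulation layer makes all gates (and
# shifts/scales) start at zero, so each block initially acts as the identity mapping.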
|
||||
for block in self.transformer_blocks:
|
||||
nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
|
||||
nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
|
||||
|
||||
def forward(self, x: Tensor, timestep: Tensor, conditioning_vec: Tensor) -> Tensor:
|
||||
_, seq_len, _ = x.shape
|
||||
|
||||
timestep_features = self.time_mlp(timestep)
|
||||
cond_features = torch.cat([timestep_features, conditioning_vec], dim=-1)
|
||||
|
||||
hidden_seq = self.input_proj(x)
|
||||
|
||||
if self.pos_embedding is not None:
|
||||
hidden_seq = hidden_seq + self.pos_embedding[:, :seq_len, :]
|
||||
|
||||
for block in self.transformer_blocks:
|
||||
hidden_seq = block(hidden_seq, cond_features)
|
||||
|
||||
return self.output_proj(hidden_seq)
|
||||
|
||||
|
||||
# -- Objectives --
|
||||
|
||||
|
||||
class DiffusionObjective(nn.Module):
|
||||
"""Standard diffusion (DDPM/DDIM) objective implementation."""
|
||||
|
||||
def __init__(self, config, action_dim: int, horizon: int, do_mask_loss_for_padding: bool = False):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.action_dim = action_dim
|
||||
self.horizon = horizon
|
||||
self.do_mask_loss_for_padding = do_mask_loss_for_padding
|
||||
|
||||
scheduler_kwargs = {
|
||||
"num_train_timesteps": config.num_train_timesteps,
|
||||
"beta_start": config.beta_start,
|
||||
"beta_end": config.beta_end,
|
||||
"beta_schedule": config.beta_schedule,
|
||||
"clip_sample": config.clip_sample,
|
||||
"clip_sample_range": config.clip_sample_range,
|
||||
"prediction_type": config.prediction_type,
|
||||
}
|
||||
|
||||
if config.noise_scheduler_type == "DDPM":
|
||||
self.noise_scheduler: DDPMScheduler | DDIMScheduler = DDPMScheduler(**scheduler_kwargs)
|
||||
elif config.noise_scheduler_type == "DDIM":
|
||||
self.noise_scheduler = DDIMScheduler(**scheduler_kwargs)
|
||||
else:
|
||||
raise ValueError(f"Unsupported noise scheduler type {config.noise_scheduler_type}")
|
||||
|
||||
self.num_inference_steps = (
|
||||
config.num_inference_steps
|
||||
if config.num_inference_steps is not None
|
||||
else self.noise_scheduler.config.num_train_timesteps
|
||||
)
|
||||
|
||||
def compute_loss(self, model: nn.Module, batch: dict[str, Tensor], conditioning_vec: Tensor) -> Tensor:
|
||||
clean_actions = batch[ACTION]
|
||||
noise = torch.randn_like(clean_actions)
|
||||
timesteps = torch.randint(
|
||||
low=0,
|
||||
high=self.noise_scheduler.config.num_train_timesteps,
|
||||
size=(clean_actions.shape[0],),
|
||||
device=clean_actions.device,
|
||||
).long()
|
||||
noisy_actions = self.noise_scheduler.add_noise(clean_actions, noise, timesteps)
|
||||
|
||||
prediction_type = self.noise_scheduler.config.prediction_type
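# "epsilon" regresses the injected noise; "sample" regresses the clean action sequence directly.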
|
||||
if prediction_type == "epsilon":
|
||||
target = noise
|
||||
elif prediction_type == "sample":
|
||||
target = clean_actions
|
||||
else:
|
||||
raise ValueError(f"Unsupported prediction type: {prediction_type}")
|
||||
|
||||
predicted = model(noisy_actions, timesteps, conditioning_vec=conditioning_vec)
|
||||
loss = F.mse_loss(predicted, target, reduction="none")
|
||||
|
||||
if self.do_mask_loss_for_padding and "action_is_pad" in batch:
|
||||
valid_actions = ~batch["action_is_pad"]
|
||||
loss = loss * valid_actions.unsqueeze(-1)
|
||||
|
||||
return loss.mean()
|
||||
|
||||
def conditional_sample(self, model: nn.Module, batch_size: int, conditioning_vec: Tensor) -> Tensor:
|
||||
device = next(model.parameters()).device
|
||||
dtype = next(model.parameters()).dtype
|
||||
|
||||
sample = torch.randn(
|
||||
size=(batch_size, self.horizon, self.action_dim),
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
self.noise_scheduler.set_timesteps(self.num_inference_steps)
|
||||
for t in self.noise_scheduler.timesteps:
|
||||
model_output = model(
|
||||
sample,
|
||||
torch.full(sample.shape[:1], t, dtype=torch.long, device=sample.device),
|
||||
conditioning_vec=conditioning_vec,
|
||||
)
|
||||
sample = self.noise_scheduler.step(model_output, t, sample).prev_sample
|
||||
|
||||
return sample
|
||||
|
||||
|
||||
class FlowMatchingObjective(nn.Module):
|
||||
"""Flow matching objective: trains a model to predict velocity fields."""
|
||||
|
||||
def __init__(self, config, action_dim: int, horizon: int, do_mask_loss_for_padding: bool = False):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.action_dim = action_dim
|
||||
self.horizon = horizon
|
||||
self.do_mask_loss_for_padding = do_mask_loss_for_padding
|
||||
|
||||
def _sample_timesteps(self, batch_size: int, device: torch.device) -> Tensor:
|
||||
if self.config.timestep_sampling_strategy == "uniform":
|
||||
return torch.rand(batch_size, device=device)
|
||||
elif self.config.timestep_sampling_strategy == "beta":
|
||||
beta_dist = torch.distributions.Beta(
|
||||
self.config.timestep_sampling_alpha, self.config.timestep_sampling_beta
|
||||
)
|
||||
u = beta_dist.sample((batch_size,)).to(device)
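# Map u ~ Beta(alpha, beta) to t = s * (1 - u), so sampled timesteps stay within [0, s].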
|
||||
return self.config.timestep_sampling_s * (1.0 - u)
|
||||
else:
|
||||
raise ValueError(f"Unknown timestep strategy: {self.config.timestep_sampling_strategy}")
|
||||
|
||||
def compute_loss(self, model: nn.Module, batch: dict[str, Tensor], conditioning_vec: Tensor) -> Tensor:
|
||||
data = batch[ACTION]
|
||||
batch_size = data.shape[0]
|
||||
device = data.device
|
||||
|
||||
noise = torch.randn_like(data)
|
||||
t = self._sample_timesteps(batch_size, device)
|
||||
t_expanded = t.view(-1, 1, 1)
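# Conditional flow-matching path: x_t interpolates from pure noise at t=0 toward the data at
# t=1 (up to sigma_min); the regression target below is its time derivative,
# d(x_t)/dt = data - (1 - sigma_min) * noise.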
|
||||
x_t = t_expanded * data + (1 - (1 - self.config.sigma_min) * t_expanded) * noise
|
||||
|
||||
target_velocity = data - (1 - self.config.sigma_min) * noise
|
||||
predicted_velocity = model(x_t, t, conditioning_vec=conditioning_vec)
|
||||
loss = F.mse_loss(predicted_velocity, target_velocity, reduction="none")
|
||||
|
||||
if self.do_mask_loss_for_padding and "action_is_pad" in batch:
|
||||
valid_mask = ~batch["action_is_pad"]
|
||||
loss = loss * valid_mask.unsqueeze(-1)
|
||||
|
||||
return loss.mean()
|
||||
|
||||
def conditional_sample(self, model: nn.Module, batch_size: int, conditioning_vec: Tensor) -> Tensor:
|
||||
device = next(model.parameters()).device
|
||||
dtype = next(model.parameters()).dtype
|
||||
|
||||
x = torch.randn((batch_size, self.horizon, self.action_dim), dtype=dtype, device=device)
|
||||
|
||||
num_steps = self.config.num_integration_steps
|
||||
time_grid = torch.linspace(0, 1, num_steps + 1, device=device)
|
||||
|
||||
if self.config.integration_method == "euler":
|
||||
x = self._euler_integrate(model, x, time_grid, conditioning_vec)
|
||||
elif self.config.integration_method == "rk4":
|
||||
x = self._rk4_integrate(model, x, time_grid, conditioning_vec)
|
||||
else:
|
||||
raise ValueError(f"Unknown integration method: {self.config.integration_method}")
|
||||
|
||||
return x
|
||||
|
||||
def _euler_integrate(
|
||||
self, model: nn.Module, x_init: Tensor, time_grid: Tensor, conditioning_vec: Tensor
|
||||
) -> Tensor:
|
||||
x = x_init
|
||||
for i in range(len(time_grid) - 1):
|
||||
t_scalar = time_grid[i].item()
|
||||
dt = (time_grid[i + 1] - time_grid[i]).item()
|
||||
t_batch = torch.full((x.shape[0],), t_scalar, dtype=x.dtype, device=x.device)
|
||||
with torch.no_grad():
|
||||
velocity = model(x, t_batch, conditioning_vec=conditioning_vec)
|
||||
x = x + dt * velocity
|
||||
return x
|
||||
|
||||
def _rk4_integrate(
|
||||
self, model: nn.Module, x_init: Tensor, time_grid: Tensor, conditioning_vec: Tensor
|
||||
) -> Tensor:
|
||||
x = x_init
|
||||
|
||||
def dynamics(x_val: Tensor, t_scalar: float) -> Tensor:
|
||||
t_batch = torch.full((x_val.shape[0],), t_scalar, dtype=x_val.dtype, device=x_val.device)
|
||||
with torch.no_grad():
|
||||
return model(x_val, t_batch, conditioning_vec=conditioning_vec)
|
||||
|
||||
for i in range(len(time_grid) - 1):
|
||||
t = time_grid[i].item()
|
||||
dt = (time_grid[i + 1] - time_grid[i]).item()
|
||||
|
||||
k1 = dynamics(x, t)
|
||||
k2 = dynamics(x + dt * k1 / 2, t + dt / 2)
|
||||
k3 = dynamics(x + dt * k2 / 2, t + dt / 2)
|
||||
k4 = dynamics(x + dt * k3, t + dt)
|
||||
|
||||
x = x + dt / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
|
||||
|
||||
return x
|
||||
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Bryson Jones and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
from lerobot.policies.multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from lerobot.processor import (
|
||||
AddBatchDimensionProcessorStep,
|
||||
DeviceProcessorStep,
|
||||
NormalizerProcessorStep,
|
||||
PolicyAction,
|
||||
PolicyProcessorPipeline,
|
||||
RenameObservationsProcessorStep,
|
||||
TokenizerProcessorStep,
|
||||
UnnormalizerProcessorStep,
|
||||
)
|
||||
from lerobot.processor.converters import policy_action_to_transition, transition_to_policy_action
|
||||
from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
|
||||
|
||||
|
||||
def make_multi_task_dit_pre_post_processors(
|
||||
config: MultiTaskDiTConfig,
|
||||
dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
|
||||
) -> tuple[
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction],
|
||||
]:
|
||||
"""
|
||||
Constructs pre-processor and post-processor pipelines for a Multi-Task DiT policy.
|
||||
|
||||
The pre-processing pipeline prepares the input data for the model by:
|
||||
1. Renaming features.
|
||||
2. Adding a batch dimension.
|
||||
3. Tokenizing the language task description (if present).
|
||||
4. Moving the data to the specified device.
|
||||
5. Normalizing the input and output features based on dataset statistics.
|
||||
|
||||
The post-processing pipeline handles the model's output by:
|
||||
1. Unnormalizing the output features to their original scale.
|
||||
2. Moving the data to the CPU.
|
||||
|
||||
Args:
|
||||
config: The configuration object for the Multi-Task DiT policy,
|
||||
containing feature definitions, normalization mappings, and device information.
|
||||
dataset_stats: A dictionary of statistics used for normalization.
|
||||
Defaults to None.
|
||||
|
||||
Returns:
|
||||
A tuple containing the configured pre-processor and post-processor pipelines.
|
||||
"""
|
||||
|
||||
input_steps = [
|
||||
RenameObservationsProcessorStep(rename_map={}),
|
||||
AddBatchDimensionProcessorStep(),
|
||||
TokenizerProcessorStep(
|
||||
tokenizer_name=config.text_encoder_name,
|
||||
padding=config.tokenizer_padding,
|
||||
padding_side=config.tokenizer_padding_side,
|
||||
max_length=config.tokenizer_max_length,
|
||||
truncation=config.tokenizer_truncation,
|
||||
),
|
||||
DeviceProcessorStep(device=config.device),
|
||||
NormalizerProcessorStep(
|
||||
features={**config.input_features, **config.output_features},
|
||||
norm_map=config.normalization_mapping,
|
||||
stats=dataset_stats,
|
||||
device=config.device,
|
||||
),
|
||||
]
|
||||
output_steps = [
|
||||
UnnormalizerProcessorStep(
|
||||
features=config.output_features,
|
||||
norm_map=config.normalization_mapping,
|
||||
stats=dataset_stats,
|
||||
),
|
||||
DeviceProcessorStep(device="cpu"),
|
||||
]
|
||||
|
||||
return (
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
|
||||
steps=input_steps,
|
||||
name=POLICY_PREPROCESSOR_DEFAULT_NAME,
|
||||
),
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction](
|
||||
steps=output_steps,
|
||||
name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
|
||||
to_transition=policy_action_to_transition,
|
||||
to_output=transition_to_policy_action,
|
||||
),
|
||||
)
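# Usage sketch (illustrative only; the policy and batch variable names are assumptions):
# the two pipelines wrap the policy at run time, e.g.
#   preprocessor, postprocessor = make_multi_task_dit_pre_post_processors(config, dataset_stats)
#   action = postprocessor(policy.select_action(preprocessor(raw_batch)))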
|
||||
@@ -1,49 +0,0 @@
|
||||
# π₀ (pi0)
|
||||
|
||||
This repository contains the Hugging Face port of **π₀**, adapted from [OpenPI](https://github.com/Physical-Intelligence/openpi) by Physical Intelligence.
|
||||
It is designed as a **Vision-Language-Action model for general robot control**.
|
||||
|
||||
---
|
||||
|
||||
## Model Overview
|
||||
|
||||
| Feature | π₀ | π₀.₅ |
|
||||
| -------------------- | ------------------------------------------------------ | ----------------------------------------- |
|
||||
| Time Conditioning | Concatenates time with actions via `action_time_mlp_*` | Uses `time_mlp_*` for AdaRMS conditioning |
|
||||
| AdaRMS | Not used | Used in action expert |
|
||||
| Tokenizer Length | 48 tokens | 200 tokens |
|
||||
| Discrete State Input | False (Uses `state_proj` layer) | True |
|
||||
| Parameter Count | Higher (includes state embedding) | Lower (no state embedding) |
|
||||
|
||||
---
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this work, please cite both **OpenPI** and the π₀ paper:
|
||||
|
||||
```bibtex
|
||||
@misc{openpi2024,
|
||||
author = {Physical Intelligence Lab},
|
||||
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
|
||||
license = {Apache-2.0}
|
||||
}
|
||||
|
||||
@misc{black2024pi0visionlanguageactionflowmodel,
|
||||
title = {π₀: A Vision-Language-Action Flow Model for General Robot Control},
|
||||
author = {Kevin Black and Noah Brown and Danny Driess and Adnan Esmail and Michael Equi and Chelsea Finn and Niccolo Fusai and Lachy Groom and Karol Hausman and Brian Ichter and Szymon Jakubczak and Tim Jones and Liyiming Ke and Sergey Levine and Adrian Li-Bell and Mohith Mothukuri and Suraj Nair and Karl Pertsch and Lucy Xiaoyang Shi and James Tanner and Quan Vuong and Anna Walling and Haohuan Wang and Ury Zhilinsky},
|
||||
year = {2024},
|
||||
eprint = {2410.24164},
|
||||
archivePrefix= {arXiv},
|
||||
primaryClass = {cs.LG},
|
||||
url = {https://arxiv.org/abs/2410.24164},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
This port follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).
|
||||
@@ -0,0 +1 @@
|
||||
../../../../docs/source/policy_pi0_README.md
|
||||
@@ -50,6 +50,13 @@ class PI0Config(PreTrainedConfig):
|
||||
min_period: float = 4e-3
|
||||
max_period: float = 4.0
|
||||
|
||||
# Relative actions: converts absolute actions to relative (relative to state).
|
||||
use_relative_actions: bool = False
|
||||
# Joint names to exclude from relative (kept absolute). Empty list = all dims relative.
|
||||
relative_exclude_joints: list[str] = field(default_factory=lambda: ["gripper"])
|
||||
# Populated at runtime from dataset metadata by make_policy.
|
||||
action_feature_names: list[str] | None = None
|
||||
|
||||
# Real-Time Chunking (RTC) configuration
|
||||
rtc_config: RTCConfig | None = None
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ import torch
|
||||
from lerobot.configs.types import PipelineFeatureType, PolicyFeature
|
||||
from lerobot.policies.pi0.configuration_pi0 import PI0Config
|
||||
from lerobot.processor import (
|
||||
AbsoluteActionsProcessorStep,
|
||||
AddBatchDimensionProcessorStep,
|
||||
ComplementaryDataProcessorStep,
|
||||
DeviceProcessorStep,
|
||||
@@ -29,6 +30,7 @@ from lerobot.processor import (
|
||||
PolicyProcessorPipeline,
|
||||
ProcessorStep,
|
||||
ProcessorStepRegistry,
|
||||
RelativeActionsProcessorStep,
|
||||
RenameObservationsProcessorStep,
|
||||
TokenizerProcessorStep,
|
||||
UnnormalizerProcessorStep,
|
||||
@@ -126,7 +128,13 @@ def make_pi0_pre_post_processors(
|
||||
A tuple containing the configured pre-processor and post-processor pipelines.
|
||||
"""
|
||||
|
||||
# Build the relative-actions step first; it is shared by the input and output pipelines.
|
||||
relative_step = RelativeActionsProcessorStep(
|
||||
enabled=config.use_relative_actions,
|
||||
exclude_joints=getattr(config, "relative_exclude_joints", []),
|
||||
action_names=getattr(config, "action_feature_names", None),
|
||||
)
|
||||
|
||||
# OpenPI order: raw → relative → normalize → model → unnormalize → absolute
|
||||
input_steps: list[ProcessorStep] = [
|
||||
RenameObservationsProcessorStep(rename_map={}), # To mimic the same processor as pretrained one
|
||||
AddBatchDimensionProcessorStep(),
|
||||
@@ -138,6 +146,7 @@ def make_pi0_pre_post_processors(
|
||||
padding="max_length",
|
||||
),
|
||||
DeviceProcessorStep(device=config.device),
|
||||
relative_step,
|
||||
NormalizerProcessorStep(
|
||||
features={**config.input_features, **config.output_features},
|
||||
norm_map=config.normalization_mapping,
|
||||
@@ -149,6 +158,7 @@ def make_pi0_pre_post_processors(
|
||||
UnnormalizerProcessorStep(
|
||||
features=config.output_features, norm_map=config.normalization_mapping, stats=dataset_stats
|
||||
),
|
||||
AbsoluteActionsProcessorStep(enabled=config.use_relative_actions, relative_step=relative_step),
|
||||
DeviceProcessorStep(device="cpu"),
|
||||
]
|
||||
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
# π₀.₅ (pi05)
|
||||
|
||||
This repository contains the Hugging Face port of **π₀.₅**, adapted from [OpenPI](https://github.com/Physical-Intelligence/openpi) by Physical Intelligence.
|
||||
It is designed as a **Vision-Language-Action model with open-world generalization**.
|
||||
|
||||
---
|
||||
|
||||
## Model Overview
|
||||
|
||||
| Feature | π₀ | π₀.₅ |
|
||||
| -------------------- | ------------------------------------------------------ | ----------------------------------------- |
|
||||
| Time Conditioning | Concatenates time with actions via `action_time_mlp_*` | Uses `time_mlp_*` for AdaRMS conditioning |
|
||||
| AdaRMS | Not used | Used in action expert |
|
||||
| Tokenizer Length | 48 tokens | 200 tokens |
|
||||
| Discrete State Input | False (Uses `state_proj` layer) | True |
|
||||
| Parameter Count | Higher (includes state embedding) | Lower (no state embedding) |
|
||||
|
||||
---
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this work, please cite both **OpenPI** and the π₀.₅ paper:
|
||||
|
||||
```bibtex
|
||||
@misc{openpi2024,
|
||||
author = {Physical Intelligence Lab},
|
||||
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
|
||||
license = {Apache-2.0}
|
||||
}
|
||||
|
||||
@misc{intelligence2025pi05visionlanguageactionmodelopenworld,
|
||||
title = {π₀.₅: a Vision-Language-Action Model with Open-World Generalization},
|
||||
author = {Physical Intelligence and Kevin Black and Noah Brown and James Darpinian and Karan Dhabalia and Danny Driess and Adnan Esmail and Michael Equi and Chelsea Finn and Niccolo Fusai and Manuel Y. Galliker and Dibya Ghosh and Lachy Groom and Karol Hausman and Brian Ichter and Szymon Jakubczak and Tim Jones and Liyiming Ke and Devin LeBlanc and Sergey Levine and Adrian Li-Bell and Mohith Mothukuri and Suraj Nair and Karl Pertsch and Allen Z. Ren and Lucy Xiaoyang Shi and Laura Smith and Jost Tobias Springenberg and Kyle Stachowicz and James Tanner and Quan Vuong and Homer Walke and Anna Walling and Haohuan Wang and Lili Yu and Ury Zhilinsky},
|
||||
year = {2025},
|
||||
eprint = {2504.16054},
|
||||
archivePrefix= {arXiv},
|
||||
primaryClass = {cs.LG},
|
||||
url = {https://arxiv.org/abs/2504.16054},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
This port follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).
|
||||
@@ -0,0 +1 @@
|
||||
../../../../docs/source/policy_pi05_README.md
|
||||
@@ -50,6 +50,13 @@ class PI05Config(PreTrainedConfig):
|
||||
min_period: float = 4e-3
|
||||
max_period: float = 4.0
|
||||
|
||||
# Relative actions: converts absolute actions to relative (relative to state).
|
||||
use_relative_actions: bool = False
|
||||
# Joint names to exclude from relative (kept absolute). Empty list = all dims relative.
|
||||
relative_exclude_joints: list[str] = field(default_factory=lambda: ["gripper"])
|
||||
# Populated at runtime from dataset metadata by make_policy.
|
||||
action_feature_names: list[str] | None = None
|
||||
|
||||
# Real-Time Chunking (RTC) configuration
|
||||
rtc_config: RTCConfig | None = None
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ import torch
|
||||
from lerobot.configs.types import PipelineFeatureType, PolicyFeature
|
||||
from lerobot.policies.pi05.configuration_pi05 import PI05Config
|
||||
from lerobot.processor import (
|
||||
AbsoluteActionsProcessorStep,
|
||||
AddBatchDimensionProcessorStep,
|
||||
DeviceProcessorStep,
|
||||
NormalizerProcessorStep,
|
||||
@@ -31,6 +32,7 @@ from lerobot.processor import (
|
||||
PolicyProcessorPipeline,
|
||||
ProcessorStep,
|
||||
ProcessorStepRegistry,
|
||||
RelativeActionsProcessorStep,
|
||||
RenameObservationsProcessorStep,
|
||||
TokenizerProcessorStep,
|
||||
UnnormalizerProcessorStep,
|
||||
@@ -125,10 +127,17 @@ def make_pi05_pre_post_processors(
|
||||
A tuple containing the configured pre-processor and post-processor pipelines.
|
||||
"""
|
||||
|
||||
# Build the relative-actions step first; it is shared by the input and output pipelines.
|
||||
relative_step = RelativeActionsProcessorStep(
|
||||
enabled=config.use_relative_actions,
|
||||
exclude_joints=getattr(config, "relative_exclude_joints", []),
|
||||
action_names=getattr(config, "action_feature_names", None),
|
||||
)
|
||||
|
||||
# OpenPI order: raw → relative → normalize → model → unnormalize → absolute
|
||||
input_steps: list[ProcessorStep] = [
|
||||
RenameObservationsProcessorStep(rename_map={}), # To mimic the same processor as pretrained one
|
||||
AddBatchDimensionProcessorStep(),
|
||||
relative_step,
|
||||
# NOTE: NormalizerProcessorStep MUST come before Pi05PrepareStateTokenizerProcessorStep
|
||||
# because the tokenizer step expects normalized state in [-1, 1] range for discretization
|
||||
NormalizerProcessorStep(
|
||||
@@ -150,6 +159,7 @@ def make_pi05_pre_post_processors(
|
||||
UnnormalizerProcessorStep(
|
||||
features=config.output_features, norm_map=config.normalization_mapping, stats=dataset_stats
|
||||
),
|
||||
AbsoluteActionsProcessorStep(enabled=config.use_relative_actions, relative_step=relative_step),
|
||||
DeviceProcessorStep(device="cpu"),
|
||||
]
|
||||
|
||||
|
||||
@@ -41,6 +41,13 @@ class PI0FastConfig(PreTrainedConfig):
|
||||
max_action_dim: int = 32
|
||||
max_action_tokens: int = 256
|
||||
|
||||
# Relative actions: converts absolute actions to relative (relative to state).
|
||||
use_relative_actions: bool = False
|
||||
# Joint names to exclude from relative (kept absolute). Empty list = all dims relative.
|
||||
relative_exclude_joints: list[str] = field(default_factory=lambda: ["gripper"])
|
||||
# Populated at runtime from dataset metadata by make_policy.
|
||||
action_feature_names: list[str] | None = None
|
||||
|
||||
# Real-Time Chunking (RTC) configuration
|
||||
rtc_config: RTCConfig | None = None
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ import torch
|
||||
from lerobot.configs.types import PipelineFeatureType, PolicyFeature
|
||||
from lerobot.policies.pi0_fast.configuration_pi0_fast import PI0FastConfig
|
||||
from lerobot.processor import (
|
||||
AbsoluteActionsProcessorStep,
|
||||
ActionTokenizerProcessorStep,
|
||||
AddBatchDimensionProcessorStep,
|
||||
DeviceProcessorStep,
|
||||
@@ -32,6 +33,7 @@ from lerobot.processor import (
|
||||
PolicyProcessorPipeline,
|
||||
ProcessorStep,
|
||||
ProcessorStepRegistry,
|
||||
RelativeActionsProcessorStep,
|
||||
RenameObservationsProcessorStep,
|
||||
TokenizerProcessorStep,
|
||||
UnnormalizerProcessorStep,
|
||||
@@ -125,12 +127,24 @@ def make_pi0_fast_pre_post_processors(
|
||||
Returns:
|
||||
A tuple containing the configured pre-processor and post-processor pipelines.
|
||||
"""
|
||||
# Build the relative-actions step first; it is shared by the input and output pipelines.
|
||||
relative_step = RelativeActionsProcessorStep(
|
||||
enabled=config.use_relative_actions,
|
||||
exclude_joints=getattr(config, "relative_exclude_joints", []),
|
||||
action_names=getattr(config, "action_feature_names", None),
|
||||
)
|
||||
|
||||
# Pi0Fast order: relative → normalize → tokenize → model → unnormalize → absolute
|
||||
# This matches pi0/pi0.5: RelativeActionsProcessorStep runs first on raw absolute actions,
|
||||
# caching the raw state. NormalizerProcessorStep then normalizes the raw relative actions,
|
||||
# so the normalizer (and action tokenizer) sees delta values — relative stats are required.
|
||||
# NOTE: RelativeActionsProcessorStep only modifies the action in the transition; it reads
|
||||
# state from the observation but does not change it. NormalizerProcessorStep still runs
|
||||
# before Pi0FastPrepareStateAndLanguageTokenizerProcessorStep, so the state tokenizer
|
||||
# continues to receive normalized state in [-1, 1] as expected.
|
||||
input_steps: list[ProcessorStep] = [
|
||||
RenameObservationsProcessorStep(rename_map={}), # To mimic the same processor as pretrained one
|
||||
AddBatchDimensionProcessorStep(),
|
||||
# NOTE: NormalizerProcessorStep MUST come before Pi0FastPrepareStateAndLanguageTokenizerProcessorStep
|
||||
# because the tokenizer step expects normalized state in [-1, 1] range for discretization
|
||||
relative_step,
|
||||
NormalizerProcessorStep(
|
||||
features={**config.input_features, **config.output_features},
|
||||
norm_map=config.normalization_mapping,
|
||||
@@ -156,6 +170,7 @@ def make_pi0_fast_pre_post_processors(
|
||||
UnnormalizerProcessorStep(
|
||||
features=config.output_features, norm_map=config.normalization_mapping, stats=dataset_stats
|
||||
),
|
||||
AbsoluteActionsProcessorStep(enabled=config.use_relative_actions, relative_step=relative_step),
|
||||
DeviceProcessorStep(device="cpu"),
|
||||
]
|
||||
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
# Real-Time Chunking (RTC)
|
||||
|
||||
This module contains the LeRobot implementation of **Real-Time Chunking (RTC)**, an inference-time technique for flow-matching based policies.
|
||||
|
||||
**Note**: RTC is not a policy itself, but rather an inference enhancement that works with flow-matching based policies including [π₀](../pi0/), [π₀.₅](../pi05/), and [SmolVLA](../smolvla/).
|
||||
|
||||
---
|
||||
|
||||
## Citation
|
||||
|
||||
If you use Real-Time Chunking in your work, please cite:
|
||||
|
||||
```bibtex
|
||||
@misc{openpi2024,
|
||||
author = {Physical Intelligence Lab},
|
||||
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
|
||||
license = {Apache-2.0}
|
||||
}
|
||||
|
||||
@misc{black2025realtimeexecutionactionchunking,
|
||||
title={Real-Time Execution of Action Chunking Flow Policies},
|
||||
author={Kevin Black and Manuel Y. Galliker and Sergey Levine},
|
||||
year={2025},
|
||||
eprint={2506.07339},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.RO},
|
||||
url={https://arxiv.org/abs/2506.07339},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
This implementation follows the **Apache 2.0 License**, consistent with the LeRobot project.
|
||||
@@ -0,0 +1 @@
|
||||
../../../../docs/source/policy_rtc_README.md
|
||||
@@ -0,0 +1,29 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Real-Time Chunking (RTC) utilities for action-chunking policies."""
|
||||
|
||||
from lerobot.policies.rtc.action_interpolator import ActionInterpolator
|
||||
from lerobot.policies.rtc.action_queue import ActionQueue
|
||||
from lerobot.policies.rtc.configuration_rtc import RTCConfig
|
||||
from lerobot.policies.rtc.latency_tracker import LatencyTracker
|
||||
from lerobot.policies.rtc.modeling_rtc import RTCProcessor
|
||||
|
||||
__all__ = [
|
||||
"ActionInterpolator",
|
||||
"ActionQueue",
|
||||
"LatencyTracker",
|
||||
"RTCConfig",
|
||||
"RTCProcessor",
|
||||
]
|
||||
@@ -0,0 +1,116 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Action interpolation for smoother robot control.

Provides configurable Nx control rate by interpolating between consecutive actions.
Useful with RTC and action-chunking policies to reduce jerkiness.
"""

from torch import Tensor


class ActionInterpolator:
    """Interpolates between consecutive actions for smoother control.

    When enabled with multiplier N, produces N actions per policy action
    by linearly interpolating between the previous and current action.

    Example with multiplier=3:
        prev_action -> [1/3 interpolated, 2/3 interpolated, current_action]

    This effectively multiplies the control rate for smoother motion.

    Usage:
        interpolator = ActionInterpolator(multiplier=2)  # 2x control rate

        # In control loop:
        if interpolator.needs_new_action():
            new_action = queue.get()
            if new_action is not None:
                interpolator.add(new_action.cpu())

        action = interpolator.get()
        if action is not None:
            robot.send_action(action)
    """

    def __init__(self, multiplier: int = 1):
        """Initialize the interpolator.

        Args:
            multiplier: Control rate multiplier (1 = no interpolation, 2 = 2x, 3 = 3x, etc.)
        """
        if multiplier < 1:
            raise ValueError(f"multiplier must be >= 1, got {multiplier}")
        self.multiplier = multiplier
        self._prev: Tensor | None = None
        self._buffer: list[Tensor] = []
        self._idx = 0

    @property
    def enabled(self) -> bool:
        """Whether interpolation is active (multiplier > 1)."""
        return self.multiplier > 1

    def reset(self):
        """Reset interpolation state (call between episodes)."""
        self._prev = None
        self._buffer = []
        self._idx = 0

    def needs_new_action(self) -> bool:
        """Check if a new action is needed from the queue."""
        return self._idx >= len(self._buffer)

    def add(self, action: Tensor) -> None:
        """Add a new action and compute interpolated sequence.

        Args:
            action: New action tensor from policy/queue (already on CPU).
        """
        if self.multiplier > 1 and self._prev is not None:
            self._buffer = []
            for i in range(1, self.multiplier + 1):
                t = i / self.multiplier
                interp = self._prev + t * (action - self._prev)
                self._buffer.append(interp)
        else:
            # First step: no previous action yet, so run at base FPS without interpolation.
            self._buffer = [action.clone()]
        self._prev = action.clone()
        self._idx = 0

    def get(self) -> Tensor | None:
        """Get the next interpolated action.

        Returns:
            Next action tensor, or None if buffer is exhausted.
        """
        if self._idx >= len(self._buffer):
            return None
        action = self._buffer[self._idx]
        self._idx += 1
        return action

    def get_control_interval(self, fps: float) -> float:
        """Get the control interval based on interpolation multiplier.

        Args:
            fps: Base frames per second.

        Returns:
            Control interval in seconds (divided by multiplier).
        """
        return 1.0 / (fps * self.multiplier)
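To make the interpolation arithmetic above concrete, here is a minimal sketch (the tensors, multiplier, and FPS values are illustrative; it only assumes `torch` and the `lerobot.policies.rtc` export shown in the `__init__` above):

```python
import torch

from lerobot.policies.rtc import ActionInterpolator

# With multiplier=2, each new policy action yields the midpoint to the previous
# action followed by the action itself, doubling the command rate.
interpolator = ActionInterpolator(multiplier=2)

interpolator.add(torch.tensor([0.0, 0.0, 0.0]))  # first step: passed through as-is
print(interpolator.get())                        # tensor([0., 0., 0.])

interpolator.add(torch.tensor([1.0, 2.0, 4.0]))
print(interpolator.get())  # tensor([0.5000, 1.0000, 2.0000])  (t = 1/2)
print(interpolator.get())  # tensor([1., 2., 4.])              (t = 1)
print(interpolator.get())  # None -> needs_new_action() is now True

# At a 30 fps base rate, commands are spaced 1 / (30 * 2) ≈ 0.0167 s apart.
print(interpolator.get_control_interval(30.0))
```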
@@ -79,6 +79,13 @@ class ActionQueue:
            self.last_index += 1
            return action.clone()

    def clear(self) -> None:
        """Clear queued actions and reset consumption index."""
        with self.lock:
            self.queue = None
            self.original_queue = None
            self.last_index = 0

    def qsize(self) -> int:
        """Get the number of remaining actions in the queue.

@@ -123,14 +130,26 @@ class ActionQueue:
        with self.lock:
            if self.original_queue is None:
                return None
            return self.original_queue[self.last_index :]
            return self.original_queue[self.last_index :].clone()

    def get_processed_left_over(self) -> Tensor | None:
        """Get leftover processed actions (the actions currently executed by the robot).

        Returns:
            Tensor | None: Remaining processed actions (remaining_steps, action_dim),
                or None if no processed queue exists.
        """
        with self.lock:
            if self.queue is None:
                return None
            return self.queue[self.last_index :].clone()

    def merge(
        self,
        original_actions: Tensor,
        processed_actions: Tensor,
        real_delay: int,
        action_index_before_inference: int | None = 0,
        action_index_before_inference: int | None = None,
    ):
        """Merge new actions into the queue.

@@ -145,10 +164,10 @@ class ActionQueue:
            action_index_before_inference: Index before inference started, for validation.
        """
        with self.lock:
            self._check_delays(real_delay, action_index_before_inference)
            delay = self._check_and_resolve_delays(real_delay, action_index_before_inference)

            if self.cfg.enabled:
                self._replace_actions_queue(original_actions, processed_actions, real_delay)
                self._replace_actions_queue(original_actions, processed_actions, delay)
                return

            self._append_actions_queue(original_actions, processed_actions)
@@ -164,12 +183,13 @@ class ActionQueue:
            processed_actions: Post-processed actions for robot.
            real_delay: Number of time steps to skip due to inference delay.
        """
        self.original_queue = original_actions[real_delay:].clone()
        self.queue = processed_actions[real_delay:].clone()
        clamped_delay = max(0, min(real_delay, len(original_actions), len(processed_actions)))
        self.original_queue = original_actions[clamped_delay:].clone()
        self.queue = processed_actions[clamped_delay:].clone()

        logger.debug(f"original_actions shape: {self.original_queue.shape}")
        logger.debug(f"processed_actions shape: {self.queue.shape}")
        logger.debug(f"real_delay: {real_delay}")
        logger.debug(f"real_delay: {real_delay}, clamped_delay: {clamped_delay}")

        self.last_index = 0

@@ -196,7 +216,9 @@ class ActionQueue:

        self.last_index = 0

    def _check_delays(self, real_delay: int, action_index_before_inference: int | None = None):
    def _check_and_resolve_delays(
        self, real_delay: int, action_index_before_inference: int | None = None
    ) -> int:
        """Validate that computed delays match expectations.

        Compares the delay computed from inference latency with the actual
@@ -205,15 +227,20 @@ class ActionQueue:
        Args:
            real_delay: Delay computed from inference latency.
            action_index_before_inference: Action index when inference started.
        """
        if action_index_before_inference is None:
            return

        indexes_diff = self.last_index - action_index_before_inference
        if indexes_diff != real_delay:
            # Let's check that action index difference (real delay calculated based on action queue)
            # is the same as delay calculated based on inference latency
            logger.warning(
                f"[ACTION_QUEUE] Indexes diff is not equal to real delay. "
                f"Indexes diff: {indexes_diff}, real delay: {real_delay}"
            )
        Returns:
            int: Delay to use.
        """
        effective_delay = max(0, real_delay)

        if action_index_before_inference is not None:
            indexes_diff = max(0, self.last_index - action_index_before_inference)
            if indexes_diff != real_delay:
                logger.warning(
                    "Indexes diff is not equal to real delay. indexes_diff=%d, real_delay=%d",
                    indexes_diff,
                    real_delay,
                )
                return real_delay

        return effective_delay

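As a quick illustration of why the new `_check_and_resolve_delays` / `clamped_delay` logic matters, here is a small sketch (the values are made up) showing how an out-of-range delay would slice a chunk before the clamping was added:

```python
import torch

# Hypothetical chunk of 5 actions with 6 dims each.
actions = torch.zeros(5, 6)

# A negative delay (e.g. clock skew in the latency measurement) slices from the
# *end* of the chunk, silently keeping only the last |delay| actions.
real_delay = -2
print(actions[real_delay:].shape)     # torch.Size([2, 6])  -- wrong semantics

# Clamping as in _replace_actions_queue keeps the whole chunk instead.
clamped_delay = max(0, min(real_delay, len(actions)))
print(actions[clamped_delay:].shape)  # torch.Size([5, 6])
```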
@@ -1,14 +0,0 @@
## Paper

https://arxiv.org/abs/2509.25358

## Citation

```bibtex
@article{chen2025sarm,
  title={SARM: Stage-Aware Reward Modeling for Long Horizon Robot Manipulation},
  author={Chen, Qianzhong and Yu, Justin and Schwager, Mac and Abbeel, Pieter and Shentu, Yide and Wu, Philipp},
  journal={arXiv preprint arXiv:2509.25358},
  year={2025}
}
```
+1
@@ -0,0 +1 @@
../../../../docs/source/policy_sarm_README.md
@@ -55,7 +55,7 @@ class SmolVLAConfig(PreTrainedConfig):
    # the space used by the pi internal runtime which was used to train the base model.
    adapt_to_pi_aloha: bool = False

    # Converts joint dimensions to deltas with respect to the current state before passing to the model.
    # Converts joint dimensions to relative values with respect to the current state before passing to the model.
    # Gripper dimensions will remain in absolute values.
    use_delta_joint_actions_aloha: bool = False

@@ -75,6 +75,12 @@ from .policy_robot_bridge import (
    PolicyActionToRobotActionProcessorStep,
    RobotActionToPolicyActionProcessorStep,
)
from .relative_action_processor import (
    AbsoluteActionsProcessorStep,
    RelativeActionsProcessorStep,
    to_absolute_actions,
    to_relative_actions,
)
from .rename_processor import RenameObservationsProcessorStep
from .tokenizer_processor import ActionTokenizerProcessorStep, TokenizerProcessorStep

@@ -100,6 +106,8 @@ __all__ = [
    "make_default_teleop_action_processor",
    "make_default_robot_action_processor",
    "make_default_robot_observation_processor",
    "AbsoluteActionsProcessorStep",
    "RelativeActionsProcessorStep",
    "MapDeltaActionToRobotActionStep",
    "MapTensorToDeltaActionDictStep",
    "NormalizerProcessorStep",
@@ -129,6 +137,8 @@ __all__ = [
    "transition_to_batch",
    "TransitionKey",
    "TruncatedProcessorStep",
    "to_absolute_actions",
    "to_relative_actions",
    "UnnormalizerProcessorStep",
    "VanillaObservationProcessorStep",
]

@@ -0,0 +1,208 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
from lerobot.configs.types import PipelineFeatureType, PolicyFeature
|
||||
from lerobot.types import EnvTransition, TransitionKey
|
||||
from lerobot.utils.constants import OBS_STATE
|
||||
|
||||
from .delta_action_processor import MapDeltaActionToRobotActionStep, MapTensorToDeltaActionDictStep
|
||||
from .pipeline import ProcessorStep, ProcessorStepRegistry
|
||||
|
||||
# Re-export for backward compatibility
|
||||
__all__ = [
|
||||
"MapDeltaActionToRobotActionStep",
|
||||
"MapTensorToDeltaActionDictStep",
|
||||
"RelativeActionsProcessorStep",
|
||||
"AbsoluteActionsProcessorStep",
|
||||
"to_relative_actions",
|
||||
"to_absolute_actions",
|
||||
]
|
||||
|
||||
|
||||
def to_relative_actions(actions: Tensor, state: Tensor, mask: Sequence[bool]) -> Tensor:
|
||||
"""Convert absolute actions to relative: relative = action - state (for masked dims).
|
||||
|
||||
Args:
|
||||
actions: (B, T, action_dim) or (B, action_dim).
|
||||
state: (B, state_dim). Broadcast across time dimension.
|
||||
mask: Which dims to convert. Can be shorter than action_dim.
|
||||
"""
|
||||
mask_t = torch.tensor(mask, dtype=actions.dtype, device=actions.device)
|
||||
dims = mask_t.shape[0]
|
||||
# Align state to the same device/dtype as actions. _last_state is cached before
|
||||
# DeviceProcessorStep moves the transition, so it can be on CPU while actions are on CUDA.
|
||||
if state.device != actions.device or state.dtype != actions.dtype:
|
||||
state = state.to(device=actions.device, dtype=actions.dtype)
|
||||
state_offset = state[..., :dims] * mask_t
|
||||
if actions.ndim == 3:
|
||||
state_offset = state_offset.unsqueeze(-2)
|
||||
actions = actions.clone()
|
||||
actions[..., :dims] -= state_offset
|
||||
return actions
|
||||
|
||||
|
||||
def to_absolute_actions(actions: Tensor, state: Tensor, mask: Sequence[bool]) -> Tensor:
|
||||
"""Convert relative actions back to absolute: absolute = relative + state (for masked dims).
|
||||
|
||||
Args:
|
||||
actions: (B, T, action_dim) or (B, action_dim).
|
||||
state: (B, state_dim). Broadcast across time dimension.
|
||||
mask: Which dims to convert. Can be shorter than action_dim.
|
||||
"""
|
||||
mask_t = torch.tensor(mask, dtype=actions.dtype, device=actions.device)
|
||||
dims = mask_t.shape[0]
|
||||
# Align state to the same device/dtype as actions. _last_state is cached before
|
||||
# DeviceProcessorStep moves the transition, so it can be on CPU while actions are on CUDA.
|
||||
if state.device != actions.device or state.dtype != actions.dtype:
|
||||
state = state.to(device=actions.device, dtype=actions.dtype)
|
||||
state_offset = state[..., :dims] * mask_t
|
||||
if actions.ndim == 3:
|
||||
state_offset = state_offset.unsqueeze(-2)
|
||||
actions = actions.clone()
|
||||
actions[..., :dims] += state_offset
|
||||
return actions
|
||||
|
||||
|
||||
@ProcessorStepRegistry.register("delta_actions_processor")
|
||||
@dataclass
|
||||
class RelativeActionsProcessorStep(ProcessorStep):
|
||||
"""Converts absolute actions to relative actions (action -= state) for masked dimensions.
|
||||
|
||||
Mirrors OpenPI's DeltaActions transform. Applied during preprocessing so the model
|
||||
trains on relative offsets instead of absolute positions.
|
||||
Caches the last seen state so a paired AbsoluteActionsProcessorStep can reverse
|
||||
the conversion during postprocessing.
|
||||
|
||||
Attributes:
|
||||
enabled: Whether to apply the relative conversion.
|
||||
exclude_joints: Joint names to keep absolute (not converted to relative).
|
||||
action_names: Action dimension names from dataset metadata, used to build
|
||||
the mask from exclude_joints. If None, all dims are converted.
|
||||
"""
|
||||
|
||||
enabled: bool = False
|
||||
exclude_joints: list[str] = field(default_factory=list)
|
||||
action_names: list[str] | None = None
|
||||
_last_state: torch.Tensor | None = field(default=None, init=False, repr=False)
|
||||
|
||||
def _build_mask(self, action_dim: int) -> list[bool]:
|
||||
if not self.exclude_joints or self.action_names is None:
|
||||
return [True] * action_dim
|
||||
|
||||
exclude_tokens = [str(name).lower() for name in self.exclude_joints if name]
|
||||
if not exclude_tokens:
|
||||
return [True] * action_dim
|
||||
|
||||
mask = []
|
||||
for name in self.action_names[:action_dim]:
|
||||
action_name = str(name).lower()
|
||||
is_excluded = any(token == action_name or token in action_name for token in exclude_tokens)
|
||||
mask.append(not is_excluded)
|
||||
|
||||
if len(mask) < action_dim:
|
||||
mask.extend([True] * (action_dim - len(mask)))
|
||||
|
||||
return mask
|
||||
|
||||
def __call__(self, transition: EnvTransition) -> EnvTransition:
|
||||
observation = transition.get(TransitionKey.OBSERVATION, {})
|
||||
state = observation.get(OBS_STATE) if observation else None
|
||||
|
||||
# Always cache state for the paired AbsoluteActionsProcessorStep
|
||||
if state is not None:
|
||||
self._last_state = state
|
||||
|
||||
if not self.enabled:
|
||||
return transition
|
||||
|
||||
new_transition = transition.copy()
|
||||
action = new_transition.get(TransitionKey.ACTION)
|
||||
if action is None or state is None:
|
||||
return new_transition
|
||||
|
||||
mask = self._build_mask(action.shape[-1])
|
||||
new_transition[TransitionKey.ACTION] = to_relative_actions(action, state, mask)
|
||||
return new_transition
|
||||
|
||||
def get_config(self) -> dict[str, Any]:
|
||||
return {
|
||||
"enabled": self.enabled,
|
||||
"exclude_joints": self.exclude_joints,
|
||||
"action_names": self.action_names,
|
||||
}
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
return features
|
||||
|
||||
|
||||
@ProcessorStepRegistry.register("absolute_actions_processor")
|
||||
@dataclass
|
||||
class AbsoluteActionsProcessorStep(ProcessorStep):
|
||||
"""Converts relative actions back to absolute actions (action += state) for all dimensions.
|
||||
|
||||
Mirrors OpenPI's AbsoluteActions transform. Applied during postprocessing so
|
||||
predicted relative offsets are converted back to absolute positions for execution.
|
||||
Reads the cached state from its paired RelativeActionsProcessorStep.
|
||||
|
||||
Attributes:
|
||||
enabled: Whether to apply the absolute conversion.
|
||||
relative_step: Reference to the paired RelativeActionsProcessorStep that caches state.
|
||||
"""
|
||||
|
||||
enabled: bool = False
|
||||
relative_step: RelativeActionsProcessorStep | None = field(default=None, repr=False)
|
||||
|
||||
def __call__(self, transition: EnvTransition) -> EnvTransition:
|
||||
if not self.enabled:
|
||||
return transition
|
||||
|
||||
if self.relative_step is None:
|
||||
raise RuntimeError(
|
||||
"AbsoluteActionsProcessorStep requires a paired RelativeActionsProcessorStep "
|
||||
"but relative_step is None. Ensure relative_step is set when constructing the postprocessor."
|
||||
)
|
||||
|
||||
if self.relative_step._last_state is None:
|
||||
raise RuntimeError(
|
||||
"AbsoluteActionsProcessorStep requires state from RelativeActionsProcessorStep "
|
||||
"but no state has been cached. Ensure the preprocessor runs before the postprocessor."
|
||||
)
|
||||
|
||||
new_transition = transition.copy()
|
||||
action = new_transition.get(TransitionKey.ACTION)
|
||||
if action is None:
|
||||
return new_transition
|
||||
|
||||
mask = self.relative_step._build_mask(action.shape[-1])
|
||||
new_transition[TransitionKey.ACTION] = to_absolute_actions(
|
||||
action, self.relative_step._last_state, mask
|
||||
)
|
||||
return new_transition
|
||||
|
||||
def get_config(self) -> dict[str, Any]:
|
||||
return {"enabled": self.enabled}
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
return features
|
||||
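The `to_relative_actions` / `to_absolute_actions` pair added in the relative-action processor above is easy to sanity-check in isolation. A minimal sketch (hypothetical shapes and values; it assumes the `lerobot.processor` re-exports added to `__all__` earlier in this diff):

```python
import torch

from lerobot.processor import to_absolute_actions, to_relative_actions

# Batch of 2, chunk of 3 actions, 3 dims; the last dim plays the role of a gripper
# that should stay absolute, so it is masked out.
actions = torch.tensor([[[1.0, 2.0, 0.5]] * 3, [[4.0, 6.0, 0.9]] * 3])
state = torch.tensor([[1.0, 1.0, 0.0], [2.0, 2.0, 0.0]])
mask = [True, True, False]

relative = to_relative_actions(actions, state, mask)
print(relative[0, 0])  # tensor([0.0000, 1.0000, 0.5000]) -> joints become offsets, gripper untouched

restored = to_absolute_actions(relative, state, mask)
print(torch.allclose(restored, actions))  # True -> the two transforms are inverses
```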
@@ -563,7 +563,7 @@ class ReplayBuffer:
        )

        # Start writing images if needed
        lerobot_dataset.start_image_writer(num_processes=0, num_threads=3)
        lerobot_dataset.writer.start_image_writer(num_processes=0, num_threads=3)

        # Convert transitions into episodes and frames

@@ -603,10 +603,10 @@ class ReplayBuffer:
            lerobot_dataset.save_episode()

        # Save any remaining frames in the buffer
        if lerobot_dataset.episode_buffer["size"] > 0:
        if lerobot_dataset.has_pending_frames():
            lerobot_dataset.save_episode()

        lerobot_dataset.stop_image_writer()
        lerobot_dataset.writer.stop_image_writer()
        lerobot_dataset.finalize()

        return lerobot_dataset

@@ -752,8 +752,7 @@ def replay_trajectory(
        episodes=[cfg.dataset.replay_episode],
        download_videos=False,
    )
    episode_frames = dataset.hf_dataset.filter(lambda x: x["episode_index"] == cfg.dataset.replay_episode)
    actions = episode_frames.select_columns(ACTION)
    actions = dataset.select_columns(ACTION)

    _, info = env.reset()

@@ -39,13 +39,23 @@ class BiOpenArmFollower(Robot):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
|
||||
# Top-level cameras are distributed evenly: each arm's OpenArmFollower
|
||||
# will only open the cameras assigned to it. Per-arm cameras are used
|
||||
# as fallback when top-level cameras are empty.
|
||||
if config.cameras:
|
||||
left_cameras = config.cameras
|
||||
right_cameras = {}
|
||||
else:
|
||||
left_cameras = config.left_arm_config.cameras
|
||||
right_cameras = config.right_arm_config.cameras
|
||||
|
||||
left_arm_config = OpenArmFollowerConfig(
|
||||
id=f"{config.id}_left" if config.id else None,
|
||||
calibration_dir=config.calibration_dir,
|
||||
port=config.left_arm_config.port,
|
||||
disable_torque_on_disconnect=config.left_arm_config.disable_torque_on_disconnect,
|
||||
max_relative_target=config.left_arm_config.max_relative_target,
|
||||
cameras=config.left_arm_config.cameras,
|
||||
cameras=left_cameras,
|
||||
side=config.left_arm_config.side,
|
||||
can_interface=config.left_arm_config.can_interface,
|
||||
use_can_fd=config.left_arm_config.use_can_fd,
|
||||
@@ -63,7 +73,7 @@ class BiOpenArmFollower(Robot):
|
||||
port=config.right_arm_config.port,
|
||||
disable_torque_on_disconnect=config.right_arm_config.disable_torque_on_disconnect,
|
||||
max_relative_target=config.right_arm_config.max_relative_target,
|
||||
cameras=config.right_arm_config.cameras,
|
||||
cameras=right_cameras,
|
||||
side=config.right_arm_config.side,
|
||||
can_interface=config.right_arm_config.can_interface,
|
||||
use_can_fd=config.right_arm_config.use_can_fd,
|
||||
@@ -86,20 +96,19 @@ class BiOpenArmFollower(Robot):
|
||||
left_arm_motors_ft = self.left_arm._motors_ft
|
||||
right_arm_motors_ft = self.right_arm._motors_ft
|
||||
|
||||
# Right first, then left — matches the teleoperator (OpenArmMini) ordering
|
||||
# and the dataset feature names recorded during data collection.
|
||||
return {
|
||||
**{f"left_{k}": v for k, v in left_arm_motors_ft.items()},
|
||||
**{f"right_{k}": v for k, v in right_arm_motors_ft.items()},
|
||||
**{f"left_{k}": v for k, v in left_arm_motors_ft.items()},
|
||||
}
|
||||
|
||||
@property
|
||||
def _cameras_ft(self) -> dict[str, tuple]:
|
||||
left_arm_cameras_ft = self.left_arm._cameras_ft
|
||||
right_arm_cameras_ft = self.right_arm._cameras_ft
|
||||
|
||||
return {
|
||||
**{f"left_{k}": v for k, v in left_arm_cameras_ft.items()},
|
||||
**{f"right_{k}": v for k, v in right_arm_cameras_ft.items()},
|
||||
}
|
||||
# Cameras already have unique user-chosen names (e.g. "left_wrist", "base",
|
||||
# "right_wrist"), so we merge them directly — unlike motors which need the
|
||||
# left_/right_ prefix to disambiguate identical per-arm joint names.
|
||||
return {**self.left_arm._cameras_ft, **self.right_arm._cameras_ft}
|
||||
|
||||
@cached_property
|
||||
def observation_features(self) -> dict[str, type | tuple]:
|
||||
@@ -139,13 +148,19 @@ class BiOpenArmFollower(Robot):
|
||||
def get_observation(self) -> RobotObservation:
|
||||
obs_dict = {}
|
||||
|
||||
# Add "left_" prefix
|
||||
left_obs = self.left_arm.get_observation()
|
||||
obs_dict.update({f"left_{key}": value for key, value in left_obs.items()})
|
||||
# Camera keys that should NOT get the arm prefix (they already have unique names)
|
||||
left_cam_keys = set(self.left_arm.cameras.keys())
|
||||
right_cam_keys = set(self.right_arm.cameras.keys())
|
||||
|
||||
# Add "right_" prefix
|
||||
# Right first, then left — matches the teleoperator (OpenArmMini) ordering
|
||||
# and the dataset feature names recorded during data collection.
|
||||
right_obs = self.right_arm.get_observation()
|
||||
obs_dict.update({f"right_{key}": value for key, value in right_obs.items()})
|
||||
for key, value in right_obs.items():
|
||||
obs_dict[key if key in right_cam_keys else f"right_{key}"] = value
|
||||
|
||||
left_obs = self.left_arm.get_observation()
|
||||
for key, value in left_obs.items():
|
||||
obs_dict[key if key in left_cam_keys else f"left_{key}"] = value
|
||||
|
||||
return obs_dict
|
||||
|
||||
@@ -172,7 +187,7 @@ class BiOpenArmFollower(Robot):
|
||||
prefixed_sent_action_left = {f"left_{key}": value for key, value in sent_action_left.items()}
|
||||
prefixed_sent_action_right = {f"right_{key}": value for key, value in sent_action_right.items()}
|
||||
|
||||
return {**prefixed_sent_action_left, **prefixed_sent_action_right}
|
||||
return {**prefixed_sent_action_right, **prefixed_sent_action_left}
|
||||
|
||||
@check_if_not_connected
|
||||
def disconnect(self):
|
||||
|
||||
@@ -14,17 +14,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from dataclasses import dataclass, field

from lerobot.cameras import CameraConfig
from lerobot.robots.openarm_follower import OpenArmFollowerConfigBase

from ..config import RobotConfig


@RobotConfig.register_subclass("bi_openarm_follower")
@dataclass
@dataclass(kw_only=True)
class BiOpenArmFollowerConfig(RobotConfig):
    """Configuration class for Bi OpenArm Follower robots."""

    id: str | None = "bi_openarm_follower"

    left_arm_config: OpenArmFollowerConfigBase
    right_arm_config: OpenArmFollowerConfigBase

    # Top-level cameras shared across both arms.
    cameras: dict[str, CameraConfig] = field(default_factory=dict)

@@ -18,7 +18,7 @@
Edit LeRobot datasets using various transformation tools.

This script allows you to delete episodes, split datasets, merge datasets,
remove features, modify tasks, and convert image datasets to video format.
remove features, modify tasks, recompute stats, and convert image datasets to video format.
When new_repo_id is specified, creates a new dataset.

Path semantics (v2): --root and --new_root are exact dataset folders containing
@@ -148,6 +148,21 @@ Show dataset information without feature details:
        --operation.type info \
        --operation.show_features false

Recompute dataset statistics:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type recompute_stats

Recompute stats for relative actions and push to hub:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type recompute_stats \
        --operation.relative_action true \
        --operation.chunk_size 50 \
        --operation.relative_exclude_joints "['gripper']" \
        --operation.num_workers 4 \
        --push_to_hub true

Using JSON config file:
    lerobot-edit-dataset \
        --config_path path/to/edit_config.json
@@ -168,6 +183,7 @@ from lerobot.datasets.dataset_tools import (
    delete_episodes,
    merge_datasets,
    modify_tasks,
    recompute_stats,
    remove_feature,
    split_dataset,
)

@@ -230,6 +246,16 @@ class ConvertImageToVideoConfig(OperationConfig):
|
||||
max_frames_per_batch: int | None = None
|
||||
|
||||
|
||||
@OperationConfig.register_subclass("recompute_stats")
|
||||
@dataclass
|
||||
class RecomputeStatsConfig(OperationConfig):
|
||||
skip_image_video: bool = True
|
||||
relative_action: bool = False
|
||||
relative_exclude_joints: list[str] | None = None
|
||||
chunk_size: int = 50
|
||||
num_workers: int = 0
|
||||
|
||||
|
||||
@OperationConfig.register_subclass("info")
|
||||
@dataclass
|
||||
class InfoConfig(OperationConfig):
|
||||
@@ -525,6 +551,35 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
|
||||
logging.info("Dataset saved locally (not pushed to hub)")
|
||||
|
||||
|
||||
def handle_recompute_stats(cfg: EditDatasetConfig) -> None:
|
||||
if not isinstance(cfg.operation, RecomputeStatsConfig):
|
||||
raise ValueError("Operation config must be RecomputeStatsConfig")
|
||||
|
||||
dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
|
||||
|
||||
logging.info(f"Recomputing stats for {cfg.repo_id}")
|
||||
if cfg.operation.relative_action:
|
||||
logging.info(
|
||||
f"Relative action stats enabled (chunk_size={cfg.operation.chunk_size}, "
|
||||
f"exclude_joints={cfg.operation.relative_exclude_joints})"
|
||||
)
|
||||
|
||||
recompute_stats(
|
||||
dataset,
|
||||
skip_image_video=cfg.operation.skip_image_video,
|
||||
relative_action=cfg.operation.relative_action,
|
||||
relative_exclude_joints=cfg.operation.relative_exclude_joints,
|
||||
chunk_size=cfg.operation.chunk_size,
|
||||
num_workers=cfg.operation.num_workers,
|
||||
)
|
||||
|
||||
logging.info(f"Stats written to {dataset.root}")
|
||||
|
||||
if cfg.push_to_hub:
|
||||
logging.info(f"Pushing to hub as {dataset.meta.repo_id}...")
|
||||
dataset.push_to_hub()
|
||||
|
||||
|
||||
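For completeness, the same recompute-stats path can be driven programmatically rather than through `lerobot-edit-dataset`. A minimal sketch mirroring `handle_recompute_stats` above (the repo id is illustrative, and the `LeRobotDataset` import path is an assumption, not taken from this diff):

```python
from lerobot.datasets.dataset_tools import recompute_stats
from lerobot.datasets.lerobot_dataset import LeRobotDataset  # assumed import path

dataset = LeRobotDataset("lerobot/pusht")

# Same keyword arguments that handle_recompute_stats forwards from RecomputeStatsConfig.
recompute_stats(
    dataset,
    skip_image_video=True,
    relative_action=True,
    relative_exclude_joints=["gripper"],
    chunk_size=50,
    num_workers=4,
)

# Optionally push the refreshed stats, as the handler does with --push_to_hub true.
dataset.push_to_hub()
```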
def _get_dataset_size(repo_path):
|
||||
import os
|
||||
|
||||
@@ -596,6 +651,8 @@ def edit_dataset(cfg: EditDatasetConfig) -> None:
|
||||
handle_modify_tasks(cfg)
|
||||
elif operation_type == "convert_image_to_video":
|
||||
handle_convert_image_to_video(cfg)
|
||||
elif operation_type == "recompute_stats":
|
||||
handle_recompute_stats(cfg)
|
||||
elif operation_type == "info":
|
||||
handle_info(cfg)
|
||||
else:
|
||||
|
||||
@@ -65,6 +65,7 @@ def get_sys_info() -> dict[str, str]:
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Huggingface Hub version": get_package_version("huggingface_hub"),
|
||||
"Transformers version": get_package_version("transformers"),
|
||||
"Datasets version": get_package_version("datasets"),
|
||||
"Numpy version": get_package_version("numpy"),
|
||||
"FFmpeg version": get_ffmpeg_version(),
|
||||
|
||||
@@ -74,6 +74,8 @@ from pathlib import Path
|
||||
from pprint import pformat
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
from lerobot.cameras import ( # noqa: F401
|
||||
CameraConfig, # noqa: F401
|
||||
)
|
||||
@@ -90,6 +92,7 @@ from lerobot.datasets.pipeline_features import aggregate_pipeline_dataset_featur
|
||||
from lerobot.datasets.video_utils import VideoEncodingManager
|
||||
from lerobot.policies.factory import make_policy, make_pre_post_processors
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.policies.rtc import ActionInterpolator
|
||||
from lerobot.policies.utils import make_robot_action
|
||||
from lerobot.processor import (
|
||||
PolicyAction,
|
||||
@@ -226,6 +229,9 @@ class RecordConfig:
|
||||
play_sounds: bool = True
|
||||
# Resume recording on an existing dataset.
|
||||
resume: bool = False
|
||||
# Action interpolation multiplier for smoother policy control (1=off, 2=2x, 3=3x)
|
||||
# Only applies when using a policy (not teleop)
|
||||
interpolation_multiplier: int = 1
|
||||
|
||||
def __post_init__(self):
|
||||
# HACK: We parse again the cli args here to get the pretrained path if there was one.
|
||||
@@ -298,6 +304,7 @@ def record_loop(
|
||||
control_time_s: int | None = None,
|
||||
single_task: str | None = None,
|
||||
display_data: bool = False,
|
||||
interpolator: ActionInterpolator | None = None,
|
||||
display_compressed_images: bool = False,
|
||||
):
|
||||
if dataset is not None and dataset.fps != fps:
|
||||
@@ -334,6 +341,16 @@ def record_loop(
|
||||
preprocessor.reset()
|
||||
postprocessor.reset()
|
||||
|
||||
# Reset interpolator if provided
|
||||
if interpolator is not None:
|
||||
interpolator.reset()
|
||||
|
||||
# Calculate control interval based on interpolation
|
||||
use_interpolation = interpolator is not None and interpolator.enabled and policy is not None
|
||||
control_interval = interpolator.get_control_interval(fps) if interpolator else 1 / fps
|
||||
# Pre-compute action key order outside the hot loop — it won't change mid-episode.
|
||||
action_keys = sorted(robot.action_features) if use_interpolation else []
|
||||
|
||||
no_action_count = 0
|
||||
timestamp = 0
|
||||
start_episode_t = time.perf_counter()
|
||||
@@ -353,28 +370,68 @@ def record_loop(
|
||||
if policy is not None or dataset is not None:
|
||||
observation_frame = build_dataset_frame(dataset.features, obs_processed, prefix=OBS_STR)
|
||||
|
||||
# Track whether this iteration should be recorded to the dataset.
|
||||
# Interpolated-only iterations send actions to the robot but don't record frames,
|
||||
# keeping the dataset at the original fps while the robot moves at the higher rate.
|
||||
is_record_frame = True
|
||||
|
||||
# Get action from either policy or teleop
|
||||
if policy is not None and preprocessor is not None and postprocessor is not None:
|
||||
action_values = predict_action(
|
||||
observation=observation_frame,
|
||||
policy=policy,
|
||||
device=get_safe_torch_device(policy.config.device),
|
||||
preprocessor=preprocessor,
|
||||
postprocessor=postprocessor,
|
||||
use_amp=policy.config.use_amp,
|
||||
task=single_task,
|
||||
robot_type=robot.robot_type,
|
||||
)
|
||||
# With interpolation: only call policy when interpolator needs new action
|
||||
if use_interpolation:
|
||||
ran_inference = False
|
||||
|
||||
act_processed_policy: RobotAction = make_robot_action(action_values, dataset.features)
|
||||
if interpolator.needs_new_action():
|
||||
action_values = predict_action(
|
||||
observation=observation_frame,
|
||||
policy=policy,
|
||||
device=get_safe_torch_device(policy.config.device),
|
||||
preprocessor=preprocessor,
|
||||
postprocessor=postprocessor,
|
||||
use_amp=policy.config.use_amp,
|
||||
task=single_task,
|
||||
robot_type=robot.robot_type,
|
||||
)
|
||||
act_processed_policy = make_robot_action(action_values, dataset.features)
|
||||
robot_action_to_send = robot_action_processor((act_processed_policy, obs))
|
||||
|
||||
action_tensor = torch.tensor([robot_action_to_send[k] for k in action_keys])
|
||||
interpolator.add(action_tensor)
|
||||
ran_inference = True
|
||||
|
||||
interp_action = interpolator.get()
|
||||
if interp_action is not None:
|
||||
robot_action_to_send = {k: interp_action[i].item() for i, k in enumerate(action_keys)}
|
||||
action_values = robot_action_to_send
|
||||
else:
|
||||
continue
|
||||
|
||||
is_record_frame = ran_inference
|
||||
else:
|
||||
action_values = predict_action(
|
||||
observation=observation_frame,
|
||||
policy=policy,
|
||||
device=get_safe_torch_device(policy.config.device),
|
||||
preprocessor=preprocessor,
|
||||
postprocessor=postprocessor,
|
||||
use_amp=policy.config.use_amp,
|
||||
task=single_task,
|
||||
robot_type=robot.robot_type,
|
||||
)
|
||||
act_processed_policy: RobotAction = make_robot_action(action_values, dataset.features)
|
||||
# Applies a pipeline to the action, default is IdentityProcessor
|
||||
robot_action_to_send = robot_action_processor((act_processed_policy, obs))
|
||||
action_values = robot_action_to_send
|
||||
|
||||
elif policy is None and isinstance(teleop, Teleoperator):
|
||||
act = teleop.get_action()
|
||||
if robot.name == "unitree_g1":
|
||||
teleop.send_feedback(obs)
|
||||
act = teleop.get_action()
|
||||
|
||||
# Applies a pipeline to the raw teleop action, default is IdentityProcessor
|
||||
act_processed_teleop = teleop_action_processor((act, obs))
|
||||
action_values = act_processed_teleop
|
||||
robot_action_to_send = robot_action_processor((act_processed_teleop, obs))
|
||||
|
||||
elif policy is None and isinstance(teleop, list):
|
||||
arm_action = teleop_arm.get_action()
|
||||
@@ -383,6 +440,8 @@ def record_loop(
|
||||
base_action = robot._from_keyboard_to_base_action(keyboard_action)
|
||||
act = {**arm_action, **base_action} if len(base_action) > 0 else arm_action
|
||||
act_processed_teleop = teleop_action_processor((act, obs))
|
||||
action_values = act_processed_teleop
|
||||
robot_action_to_send = robot_action_processor((act_processed_teleop, obs))
|
||||
else:
|
||||
no_action_count += 1
|
||||
if no_action_count == 1 or no_action_count % 10 == 0:
|
||||
@@ -393,22 +452,14 @@ def record_loop(
|
||||
)
|
||||
continue
|
||||
|
||||
# Applies a pipeline to the action, default is IdentityProcessor
|
||||
if policy is not None and act_processed_policy is not None:
|
||||
action_values = act_processed_policy
|
||||
robot_action_to_send = robot_action_processor((act_processed_policy, obs))
|
||||
else:
|
||||
action_values = act_processed_teleop
|
||||
robot_action_to_send = robot_action_processor((act_processed_teleop, obs))
|
||||
|
||||
# Send action to robot
|
||||
# Action can eventually be clipped using `max_relative_target`,
|
||||
# so action actually sent is saved in the dataset. action = postprocessor.process(action)
|
||||
# TODO(steven, pepijn, adil): we should use a pipeline step to clip the action, so the sent action is the action that we input to the robot.
|
||||
_sent_action = robot.send_action(robot_action_to_send)
|
||||
|
||||
# Write to dataset
|
||||
if dataset is not None:
|
||||
# Write to dataset (only on real policy frames, not interpolated-only iterations)
|
||||
if dataset is not None and is_record_frame:
|
||||
action_frame = build_dataset_frame(dataset.features, action_values, prefix=ACTION)
|
||||
frame = {**observation_frame, **action_frame, "task": single_task}
|
||||
dataset.add_frame(frame)
|
||||
@@ -420,7 +471,7 @@ def record_loop(
|
||||
|
||||
dt_s = time.perf_counter() - start_loop_t
|
||||
|
||||
sleep_time_s: float = 1 / fps - dt_s
|
||||
sleep_time_s: float = control_interval - dt_s
|
||||
if sleep_time_s < 0:
|
||||
logging.warning(
|
||||
f"Record loop is running slower ({1 / dt_s:.1f} Hz) than the target FPS ({fps} Hz). Dataset frames might be dropped and robot control might be unstable. Common causes are: 1) Camera FPS not keeping up 2) Policy inference taking too long 3) CPU starvation"
|
||||
@@ -468,7 +519,8 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
|
||||
|
||||
try:
|
||||
if cfg.resume:
|
||||
dataset = LeRobotDataset(
|
||||
num_cameras = len(robot.cameras) if hasattr(robot, "cameras") else 0
|
||||
dataset = LeRobotDataset.resume(
|
||||
cfg.dataset.repo_id,
|
||||
root=cfg.dataset.root,
|
||||
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
|
||||
@@ -476,13 +528,11 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
|
||||
streaming_encoding=cfg.dataset.streaming_encoding,
|
||||
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
|
||||
encoder_threads=cfg.dataset.encoder_threads,
|
||||
image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0,
|
||||
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras
|
||||
if num_cameras > 0
|
||||
else 0,
|
||||
)
|
||||
|
||||
if hasattr(robot, "cameras") and len(robot.cameras) > 0:
|
||||
dataset.start_image_writer(
|
||||
num_processes=cfg.dataset.num_image_writer_processes,
|
||||
num_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
|
||||
)
|
||||
sanity_check_dataset_robot_compatibility(dataset, robot, cfg.dataset.fps, dataset_features)
|
||||
else:
|
||||
# Create empty dataset or load existing saved episodes
|
||||
@@ -507,6 +557,7 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
|
||||
policy = None if cfg.policy is None else make_policy(cfg.policy, ds_meta=dataset.meta)
|
||||
preprocessor = None
|
||||
postprocessor = None
|
||||
interpolator = None
|
||||
if cfg.policy is not None:
|
||||
preprocessor, postprocessor = make_pre_post_processors(
|
||||
policy_cfg=cfg.policy,
|
||||
@@ -517,6 +568,10 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
|
||||
"rename_observations_processor": {"rename_map": cfg.dataset.rename_map},
|
||||
},
|
||||
)
|
||||
# Create interpolator for smoother policy control
|
||||
if cfg.interpolation_multiplier > 1:
|
||||
interpolator = ActionInterpolator(multiplier=cfg.interpolation_multiplier)
|
||||
logging.info(f"Action interpolation enabled: {cfg.interpolation_multiplier}x control rate")
|
||||
|
||||
robot.connect()
|
||||
if teleop is not None:
|
||||
@@ -548,6 +603,7 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
|
||||
control_time_s=cfg.dataset.episode_time_s,
|
||||
single_task=cfg.dataset.single_task,
|
||||
display_data=cfg.display_data,
|
||||
interpolator=interpolator,
|
||||
display_compressed_images=display_compressed_images,
|
||||
)
|
||||
|
||||
|
||||
@@ -104,15 +104,13 @@ def replay(cfg: ReplayConfig):
|
||||
robot = make_robot_from_config(cfg.robot)
|
||||
dataset = LeRobotDataset(cfg.dataset.repo_id, root=cfg.dataset.root, episodes=[cfg.dataset.episode])
|
||||
|
||||
# Filter dataset to only include frames from the specified episode since episodes are chunked in dataset V3.0
|
||||
episode_frames = dataset.hf_dataset.filter(lambda x: x["episode_index"] == cfg.dataset.episode)
|
||||
actions = episode_frames.select_columns(ACTION)
|
||||
actions = dataset.select_columns(ACTION)
|
||||
|
||||
robot.connect()
|
||||
|
||||
try:
|
||||
log_say("Replaying episode", cfg.play_sounds, blocking=True)
|
||||
for idx in range(len(episode_frames)):
|
||||
for idx in range(dataset.num_frames):
|
||||
start_episode_t = time.perf_counter()
|
||||
|
||||
action_array = actions[idx][ACTION]
|
||||
|
||||
@@ -252,10 +252,22 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
|
||||
# Wait for all processes to finish policy creation before continuing
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
processor_pretrained_path = cfg.policy.pretrained_path
|
||||
if (
|
||||
getattr(cfg.policy, "use_relative_actions", False)
|
||||
and processor_pretrained_path is not None
|
||||
and not cfg.resume
|
||||
):
|
||||
logging.warning(
|
||||
"use_relative_actions=true with pretrained processors can skip relative transforms if "
|
||||
"the checkpoint processors do not define them. Building processors from current policy config."
|
||||
)
|
||||
processor_pretrained_path = None
|
||||
|
||||
# Create processors - only provide dataset_stats if not resuming from saved processors
|
||||
processor_kwargs = {}
|
||||
postprocessor_kwargs = {}
|
||||
if (cfg.policy.pretrained_path and not cfg.resume) or not cfg.policy.pretrained_path:
|
||||
if (processor_pretrained_path and not cfg.resume) or not processor_pretrained_path:
|
||||
# Only provide dataset_stats when not resuming from saved processor state
|
||||
processor_kwargs["dataset_stats"] = dataset.meta.stats
|
||||
|
||||
@@ -263,7 +275,7 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
|
||||
if cfg.policy.type == "sarm":
|
||||
processor_kwargs["dataset_meta"] = dataset.meta
|
||||
|
||||
if cfg.policy.pretrained_path is not None:
|
||||
if processor_pretrained_path is not None:
|
||||
processor_kwargs["preprocessor_overrides"] = {
|
||||
"device_processor": {"device": device.type},
|
||||
"normalizer_processor": {
|
||||
@@ -285,7 +297,7 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
|
||||
|
||||
preprocessor, postprocessor = make_pre_post_processors(
|
||||
policy_cfg=cfg.policy,
|
||||
pretrained_path=cfg.policy.pretrained_path,
|
||||
pretrained_path=processor_pretrained_path,
|
||||
**processor_kwargs,
|
||||
**postprocessor_kwargs,
|
||||
)
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
This script:
|
||||
1. Loads action chunks from LeRobotDataset (with episode sampling)
|
||||
2. Optionally applies delta transforms (relative vs absolute actions)
|
||||
2. Optionally applies relative transforms (relative vs absolute actions)
|
||||
3. Extracts specified action dimensions for encoding
|
||||
4. Applies normalization (MEAN_STD, MIN_MAX, QUANTILES, or other modes)
|
||||
5. Trains FAST tokenizer (BPE on DCT coefficients) on the action chunks
|
||||
@@ -32,8 +32,8 @@ lerobot-train-tokenizer \
|
||||
--max_episodes=100 \
|
||||
--sample_fraction=0.1 \
|
||||
--encoded_dims="0:6" \
|
||||
--delta_dims="0,1,2,3,4,5" \
|
||||
--use_delta_transform=true \
|
||||
--relative_dims="0,1,2,3,4,5" \
|
||||
--use_relative_transform=true \
|
||||
--state_key="observation.state" \
|
||||
--normalization_mode="QUANTILES" \
|
||||
--vocab_size=1024 \
|
||||
@@ -82,10 +82,10 @@ class TokenizerTrainingConfig:
|
||||
sample_fraction: float = 0.1
|
||||
# Comma-separated dimension ranges to encode (e.g., "0:6,7:23")
|
||||
encoded_dims: str = "0:6,7:23"
|
||||
# Comma-separated dimension indices for delta transform (e.g., "0,1,2,3,4,5")
|
||||
delta_dims: str | None = None
|
||||
# Whether to apply delta transform (relative actions vs absolute actions)
|
||||
use_delta_transform: bool = False
|
||||
# Comma-separated dimension indices for relative transform (e.g., "0,1,2,3,4,5")
|
||||
relative_dims: str | None = None
|
||||
# Whether to apply relative transform (relative actions vs absolute actions)
|
||||
use_relative_transform: bool = False
|
||||
# Dataset key for state observations (default: "observation.state")
|
||||
state_key: str = OBS_STATE
|
||||
# Normalization mode (MEAN_STD, MIN_MAX, QUANTILES, QUANTILE10, IDENTITY)
|
||||
@@ -104,25 +104,27 @@ class TokenizerTrainingConfig:
|
||||
hub_private: bool = False
|
||||
|
||||
|
||||
def apply_delta_transform(state: np.ndarray, actions: np.ndarray, delta_dims: list[int] | None) -> np.ndarray:
|
||||
"""Apply delta transform to specified dimensions.
|
||||
def apply_relative_transform(
|
||||
state: np.ndarray, actions: np.ndarray, relative_dims: list[int] | None
|
||||
) -> np.ndarray:
|
||||
"""Apply relative transform to specified dimensions.
|
||||
|
||||
Args:
|
||||
state: Current state [D]
|
||||
actions: Future actions [D]
|
||||
delta_dims: List of dimension indices to apply delta transform to
|
||||
relative_dims: List of dimension indices to apply relative transform to
|
||||
|
||||
Returns:
|
||||
Transformed actions [D]
|
||||
"""
|
||||
if delta_dims is None or len(delta_dims) == 0:
|
||||
if relative_dims is None or len(relative_dims) == 0:
|
||||
return actions
|
||||
|
||||
delta_actions = actions.copy()
|
||||
for dim in delta_dims:
|
||||
delta_actions[dim] = actions[dim] - state[dim]
|
||||
relative_actions = actions.copy()
|
||||
for dim in relative_dims:
|
||||
relative_actions[dim] = actions[dim] - state[dim]
|
||||
|
||||
return delta_actions
|
||||
return relative_actions
|
||||
|
||||
|
||||
def apply_normalization(
|
||||
@@ -185,7 +187,7 @@ def apply_normalization(
|
||||
|
||||
def process_episode(args):
|
||||
"""Process single episode and return action chunks."""
|
||||
dataset, ep_idx, action_horizon, delta_dims, sample_fraction, state_key, use_delta_transform = args
|
||||
dataset, ep_idx, action_horizon, relative_dims, sample_fraction, state_key, use_relative_transform = args
|
||||
|
||||
try:
|
||||
# get episode info
|
||||
@@ -204,15 +206,15 @@ def process_episode(args):
|
||||
|
||||
for abs_idx in range(from_idx, to_idx):
|
||||
# map absolute index to relative index if needed
|
||||
if dataset._absolute_to_relative_idx is not None:
|
||||
if abs_idx not in dataset._absolute_to_relative_idx:
|
||||
if dataset.reader._absolute_to_relative_idx is not None:
|
||||
if abs_idx not in dataset.reader._absolute_to_relative_idx:
|
||||
# this episode's frames aren't in the filtered dataset
|
||||
return None
|
||||
rel_idx = dataset._absolute_to_relative_idx[abs_idx]
|
||||
rel_idx = dataset.reader._absolute_to_relative_idx[abs_idx]
|
||||
else:
|
||||
rel_idx = abs_idx
|
||||
|
||||
frame = dataset.hf_dataset[rel_idx]
|
||||
frame = dataset.get_raw_item(rel_idx)
|
||||
|
||||
# get state (could be from observation.state or other state key)
|
||||
if state_key in frame:
|
||||
@@ -222,7 +224,7 @@ def process_episode(args):
|
||||
else np.array(frame[state_key])
|
||||
)
|
||||
else:
|
||||
# if no state key, use zeros (no delta transform)
|
||||
# if no state key, use zeros (no relative transform)
|
||||
state = np.zeros_like(
|
||||
frame[ACTION].numpy() if torch.is_tensor(frame[ACTION]) else np.array(frame[ACTION])
|
||||
)
|
||||
@@ -243,18 +245,18 @@ def process_episode(args):
|
||||
current_state = states[i] # First state in chunk
|
||||
future_absolute_actions = actions[i : i + action_horizon]
|
||||
|
||||
if use_delta_transform:
|
||||
if use_relative_transform:
|
||||
# relative actions
|
||||
delta_chunk = np.zeros_like(future_absolute_actions)
|
||||
relative_chunk = np.zeros_like(future_absolute_actions)
|
||||
for t in range(action_horizon):
|
||||
delta_chunk[t] = apply_delta_transform(
|
||||
relative_chunk[t] = apply_relative_transform(
|
||||
current_state,
|
||||
future_absolute_actions[t],
|
||||
delta_dims,
|
||||
relative_dims,
|
||||
)
|
||||
action_chunks.append(delta_chunk)
|
||||
action_chunks.append(relative_chunk)
|
||||
else:
|
||||
# absolute actions (no delta)
|
||||
# absolute actions (no relative transform)
|
||||
action_chunks.append(future_absolute_actions)
|
||||
|
||||
if len(action_chunks) == 0:
|
||||
@@ -407,17 +409,20 @@ def train_tokenizer(cfg: TokenizerTrainingConfig):
|
||||
total_encoded_dims = sum(end - start for start, end in encoded_dim_ranges)
|
||||
print(f"Encoding {total_encoded_dims} dimensions: {cfg.encoded_dims}")
|
||||
|
||||
# parse delta dimensions
|
||||
delta_dim_list = None
|
||||
if cfg.delta_dims is not None and cfg.delta_dims.strip():
|
||||
delta_dim_list = [int(d.strip()) for d in cfg.delta_dims.split(",")]
|
||||
print(f"Delta dimensions: {delta_dim_list}")
|
||||
# parse relative dimensions
|
||||
relative_dim_list = None
|
||||
if cfg.relative_dims is not None and cfg.relative_dims.strip():
|
||||
relative_dim_list = [int(d.strip()) for d in cfg.relative_dims.split(",")]
|
||||
print(f"Relative dimensions: {relative_dim_list}")
|
||||
else:
|
||||
print("No delta dimensions specified")
|
||||
print("No relative dimensions specified")
|
||||
|
||||
print(f"Use delta transform: {cfg.use_delta_transform}")
|
||||
if cfg.use_delta_transform and (delta_dim_list is None or len(delta_dim_list) == 0):
|
||||
print("Warning: use_delta_transform=True but no delta_dims specified. No delta will be applied.")
|
||||
print(f"Use relative transform: {cfg.use_relative_transform}")
|
||||
if cfg.use_relative_transform and (relative_dim_list is None or len(relative_dim_list) == 0):
|
||||
print(
|
||||
"Warning: use_relative_transform=True but no relative_dims specified. "
|
||||
"No relative transform will be applied."
|
||||
)
|
||||
|
||||
print(f"Action horizon: {cfg.action_horizon}")
|
||||
print(f"State key: {cfg.state_key}")
|
||||
@@ -440,10 +445,10 @@ def train_tokenizer(cfg: TokenizerTrainingConfig):
|
||||
dataset,
|
||||
ep_idx,
|
||||
cfg.action_horizon,
|
||||
delta_dim_list,
|
||||
relative_dim_list,
|
||||
cfg.sample_fraction,
|
||||
cfg.state_key,
|
||||
cfg.use_delta_transform,
|
||||
cfg.use_relative_transform,
|
||||
)
|
||||
)
|
||||
if chunks is not None:
|
||||
@@ -544,9 +549,9 @@ def train_tokenizer(cfg: TokenizerTrainingConfig):
|
||||
"encoded_dims": cfg.encoded_dims,
|
||||
"encoded_dim_ranges": encoded_dim_ranges,
|
||||
"total_encoded_dims": total_encoded_dims,
|
||||
"delta_dims": cfg.delta_dims,
|
||||
"delta_dim_list": delta_dim_list,
|
||||
"use_delta_transform": cfg.use_delta_transform,
|
||||
"relative_dims": cfg.relative_dims,
|
||||
"relative_dim_list": relative_dim_list,
|
||||
"use_relative_transform": cfg.use_relative_transform,
|
||||
"state_key": cfg.state_key,
|
||||
"normalization_mode": norm_mode.value,
|
||||
"action_horizon": cfg.action_horizon,
|
||||
|
||||
@@ -32,9 +32,15 @@ from .config_openarm_mini import OpenArmMiniConfig
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Motors whose direction is inverted during readout
|
||||
RIGHT_MOTORS_TO_FLIP = ["joint_1", "joint_2", "joint_3", "joint_4", "joint_5"]
|
||||
RIGHT_MOTORS_TO_FLIP = ["joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_7"]
|
||||
LEFT_MOTORS_TO_FLIP = ["joint_1", "joint_3", "joint_4", "joint_5", "joint_6", "joint_7"]
|
||||
|
||||
# Leader joint 6 maps to follower joint 7 and vice versa
|
||||
JOINT_REMAP = {"joint_6": "joint_7", "joint_7": "joint_6"}
|
||||
JOINT_REMAP_REVERSE = {"joint_7": "joint_6", "joint_6": "joint_7"}
|
||||
|
||||
GRIPPER_TELEOP_TO_DEGREES = -0.65
|
||||
|
||||
|
||||
class OpenArmMini(Teleoperator):
|
||||
"""
|
||||
@@ -95,6 +101,8 @@ class OpenArmMini(Teleoperator):
|
||||
|
||||
@property
|
||||
def action_features(self) -> dict[str, type]:
|
||||
# Right first, then left — matches the robot (BiOpenArmFollower) ordering
|
||||
# and the dataset feature names recorded during data collection.
|
||||
features: dict[str, type] = {}
|
||||
for motor in self.bus_right.motors:
|
||||
features[f"right_{motor}.pos"] = float
|
||||
@@ -276,16 +284,70 @@ class OpenArmMini(Teleoperator):
|
||||
right_positions = self.bus_right.sync_read("Present_Position")
|
||||
left_positions = self.bus_left.sync_read("Present_Position")
|
||||
|
||||
# Right first, then left — matches the robot (BiOpenArmFollower) ordering
|
||||
# and the dataset feature names recorded during data collection.
|
||||
# Joint 6↔7 remap: leader joint_6 → follower joint_7 and vice versa.
|
||||
action: dict[str, Any] = {}
|
||||
for motor, val in right_positions.items():
|
||||
action[f"right_{motor}.pos"] = -val if motor in RIGHT_MOTORS_TO_FLIP else val
|
||||
target = JOINT_REMAP.get(motor, motor)
|
||||
if motor == "gripper":
|
||||
# Convert gripper from teleop 0-100 to openarms degrees: 0→0°, 100→-65°
|
||||
action[f"right_{target}.pos"] = val * GRIPPER_TELEOP_TO_DEGREES
|
||||
else:
|
||||
action[f"right_{target}.pos"] = -val if motor in RIGHT_MOTORS_TO_FLIP else val
|
||||
for motor, val in left_positions.items():
|
||||
action[f"left_{motor}.pos"] = -val if motor in LEFT_MOTORS_TO_FLIP else val
|
||||
target = JOINT_REMAP.get(motor, motor)
|
||||
if motor == "gripper":
|
||||
action[f"left_{target}.pos"] = val * GRIPPER_TELEOP_TO_DEGREES
|
||||
else:
|
||||
action[f"left_{target}.pos"] = -val if motor in LEFT_MOTORS_TO_FLIP else val
|
||||
|
||||
dt_ms = (time.perf_counter() - start) * 1e3
|
||||
logger.debug(f"{self} read action: {dt_ms:.1f}ms")
|
||||
return action
|
||||
|
||||
def enable_torque(self) -> None:
|
||||
"""Enable torque on both arms for position control."""
|
||||
self.bus_right.enable_torque()
|
||||
self.bus_left.enable_torque()
|
||||
|
||||
def disable_torque(self) -> None:
|
||||
"""Disable torque on both arms for free movement."""
|
||||
self.bus_right.disable_torque()
|
||||
self.bus_left.disable_torque()
|
||||
|
||||
def write_goal_positions(self, positions: dict[str, float]) -> None:
|
||||
"""Write goal positions to motors (inverse of get_action flip/gripper/remap logic)."""
|
||||
right_goals: dict[str, float] = {}
|
||||
left_goals: dict[str, float] = {}
|
||||
|
||||
for key, val in positions.items():
|
||||
if not key.endswith(".pos"):
|
||||
continue
|
||||
motor_name = key.removesuffix(".pos")
|
||||
if motor_name.startswith("right_"):
|
||||
base = motor_name.removeprefix("right_")
|
||||
# Reverse remap: follower joint_7 → leader joint_6 and vice versa
|
||||
target = JOINT_REMAP_REVERSE.get(base, base)
|
||||
if base == "gripper":
|
||||
# Convert robot degrees to teleop 0-100: 0°→0, -65°→100
|
||||
right_goals[target] = val / GRIPPER_TELEOP_TO_DEGREES
|
||||
else:
|
||||
# Un-flip using the ORIGINAL motor name (target = leader motor)
|
||||
right_goals[target] = -val if target in RIGHT_MOTORS_TO_FLIP else val
|
||||
elif motor_name.startswith("left_"):
|
||||
base = motor_name.removeprefix("left_")
|
||||
target = JOINT_REMAP_REVERSE.get(base, base)
|
||||
if base == "gripper":
|
||||
left_goals[target] = val / GRIPPER_TELEOP_TO_DEGREES
|
||||
else:
|
||||
left_goals[target] = -val if target in LEFT_MOTORS_TO_FLIP else val
|
||||
|
||||
if right_goals:
|
||||
self.bus_right.sync_write("Goal_Position", right_goals)
|
||||
if left_goals:
|
||||
self.bus_left.sync_write("Goal_Position", left_goals)
|
||||
|
||||
def send_feedback(self, feedback: dict[str, float]) -> None:
|
||||
raise NotImplementedError("Feedback is not yet implemented for OpenArm Mini.")
|
||||
|
||||
|
||||
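A tiny sketch of the gripper conversion used above, to make the 0-100 ↔ degrees mapping and its inverse explicit (the helper names are made up for illustration; only the constant and the endpoint values come from the diff):

```python
GRIPPER_TELEOP_TO_DEGREES = -0.65  # teleop 0-100 -> OpenArm degrees 0 to -65


def gripper_teleop_to_degrees(val: float) -> float:
    # Used when reading the leader: 0 -> 0 deg, 100 -> -65 deg.
    return val * GRIPPER_TELEOP_TO_DEGREES


def gripper_degrees_to_teleop(deg: float) -> float:
    # Inverse mapping, as in write_goal_positions: 0 deg -> 0, -65 deg -> 100.
    return deg / GRIPPER_TELEOP_TO_DEGREES


print(gripper_teleop_to_degrees(100.0))  # -65.0
print(gripper_degrees_to_teleop(-65.0))  # ~100.0 (up to float rounding)
```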
@@ -65,6 +65,10 @@ if "LEROBOT_HOME" in os.environ:
# cache dir
default_cache_path = Path(HF_HOME) / "lerobot"
HF_LEROBOT_HOME = Path(os.getenv("HF_LEROBOT_HOME", default_cache_path)).expanduser()
# LeRobot's own revision-safe Hub cache (NOT the system-wide ~/.cache/huggingface/hub/).
# Used as the ``cache_dir`` argument to ``snapshot_download`` so that different
# dataset revisions are stored in isolated snapshot directories.
HF_LEROBOT_HUB_CACHE = HF_LEROBOT_HOME / "hub"

# calibration dir
default_calibration_path = HF_LEROBOT_HOME / "calibration"

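As a sketch of how the new cache constant is meant to be used (the module path, repo id, and revision below are assumptions, not taken from this diff):

```python
from huggingface_hub import snapshot_download

from lerobot.utils.constants import HF_LEROBOT_HUB_CACHE  # assumed module path

# Download a specific dataset revision into LeRobot's isolated hub cache so it
# does not collide with other revisions in the system-wide ~/.cache/huggingface/hub/.
local_dir = snapshot_download(
    repo_id="lerobot/pusht",   # illustrative repo id
    repo_type="dataset",
    revision="main",           # illustrative revision
    cache_dir=HF_LEROBOT_HUB_CACHE,
)
print(local_dir)
```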
@@ -95,6 +95,8 @@ def init_logging(
        file_handler.setLevel(file_level.upper())
        logger.addHandler(file_handler)

    logging.getLogger("httpx").setLevel(logging.WARNING)


def format_big_number(num, precision=0):
    suffixes = ["", "K", "M", "B", "T", "Q"]

@@ -80,7 +80,7 @@ def get_policy_stats(ds_repo_id: str, policy_name: str, policy_kwargs: dict):
|
||||
# HACK: We reload a batch with no delta_indices as `select_action` won't expect a timestamps dimension
|
||||
# We simulate having an environment using a dataset by setting delta_indices to None and dropping tensors
|
||||
# indicating padding (those ending with "_is_pad")
|
||||
dataset.delta_indices = None
|
||||
dataset.reader.delta_indices = None
|
||||
batch = next(iter(dataloader))
|
||||
obs = {}
|
||||
for k in batch:
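
The diff context cuts off right after the loop header, so purely as an illustration of the padding-key filtering that the HACK comment describes (and not the repository's actual loop body), such a filter could look like this:

# Sketch only: keep every tensor from the batch except the "*_is_pad" padding markers.
obs = {k: v for k, v in batch.items() if not k.endswith("_is_pad")}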

@@ -0,0 +1,385 @@
#!/usr/bin/env python

# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contract tests for LeRobotDatasetMetadata."""

import json

import numpy as np
import pytest

from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
from lerobot.datasets.utils import INFO_PATH
from tests.fixtures.constants import DEFAULT_FPS, DUMMY_ROBOT_TYPE

# ── helpers ──────────────────────────────────────────────────────────

SIMPLE_FEATURES = {
    "state": {"dtype": "float32", "shape": (6,), "names": None},
    "action": {"dtype": "float32", "shape": (6,), "names": None},
}

VIDEO_FEATURES = {
    **SIMPLE_FEATURES,
    "observation.images.laptop": {
        "dtype": "video",
        "shape": (64, 96, 3),
        "names": ["height", "width", "channels"],
        "info": None,
    },
}

IMAGE_FEATURES = {
    **SIMPLE_FEATURES,
    "observation.images.laptop": {
        "dtype": "image",
        "shape": (64, 96, 3),
        "names": ["height", "width", "channels"],
        "info": None,
    },
}


def _make_dummy_stats(features: dict) -> dict:
    """Create minimal episode stats matching the given features."""
    stats = {}
    for key, ft in features.items():
        if ft["dtype"] in ("image", "video"):
            stats[key] = {
                "max": np.ones((3, 1, 1), dtype=np.float32),
                "mean": np.full((3, 1, 1), 0.5, dtype=np.float32),
                "min": np.zeros((3, 1, 1), dtype=np.float32),
                "std": np.full((3, 1, 1), 0.25, dtype=np.float32),
                "count": np.array([5]),
            }
        elif ft["dtype"] in ("float32", "float64", "int64"):
            stats[key] = {
                "max": np.ones(ft["shape"], dtype=np.float32),
                "mean": np.full(ft["shape"], 0.5, dtype=np.float32),
                "min": np.zeros(ft["shape"], dtype=np.float32),
                "std": np.full(ft["shape"], 0.25, dtype=np.float32),
                "count": np.array([5]),
            }
    return stats
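
For concreteness, this is what the helper yields for the "state" feature defined above; the assertions below follow directly from the code of _make_dummy_stats itself and are shown only as a usage illustration, not as part of the test file.

# Usage illustration for the helper above (derived from its code, not from a fixture).
stats = _make_dummy_stats(SIMPLE_FEATURES)
assert stats["state"]["mean"].shape == (6,)     # per-dimension statistics
assert float(stats["state"]["std"][0]) == 0.25  # constant fill values
assert int(stats["state"]["count"][0]) == 5     # dummy frame count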


# ── Construction contracts ───────────────────────────────────────────


def test_create_produces_valid_info_on_disk(tmp_path):
    """create() writes info.json and the returned object reflects the provided settings."""
    root = tmp_path / "new_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/meta",
        fps=DEFAULT_FPS,
        features=SIMPLE_FEATURES,
        robot_type=DUMMY_ROBOT_TYPE,
        root=root,
        use_videos=False,
    )

    # info.json was written to disk
    assert (root / INFO_PATH).exists()
    with open(root / INFO_PATH) as f:
        info_on_disk = json.load(f)

    assert meta.fps == DEFAULT_FPS
    assert meta.robot_type == DUMMY_ROBOT_TYPE
    assert "state" in meta.features
    assert "action" in meta.features
    assert info_on_disk["fps"] == DEFAULT_FPS


def test_create_starts_with_zero_counts(tmp_path):
    """A freshly created metadata has zero episode/frame/task counts."""
    root = tmp_path / "empty_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/empty", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )

    assert meta.total_episodes == 0
    assert meta.total_frames == 0
    assert meta.total_tasks == 0
    assert meta.tasks is None
    assert meta.episodes is None
    assert meta.stats is None


def test_create_with_videos_sets_video_path(tmp_path):
    """When features include video-dtype keys, create() produces a non-None video_path."""
    root = tmp_path / "video_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/video", fps=DEFAULT_FPS, features=VIDEO_FEATURES, root=root, use_videos=True
    )

    assert meta.video_path is not None
    assert len(meta.video_keys) == 1
    assert "observation.images.laptop" in meta.video_keys


def test_create_without_videos_has_no_video_path(tmp_path):
    """When use_videos=False and no video features, video_path is None."""
    root = tmp_path / "no_video"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/novid", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )

    assert meta.video_path is None
    assert meta.video_keys == []


def test_create_raises_on_existing_directory(tmp_path):
    """create() raises if root directory already exists."""
    root = tmp_path / "existing"
    root.mkdir()

    with pytest.raises(FileExistsError):
        LeRobotDatasetMetadata.create(
            repo_id="test/exists", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
        )


def test_init_loads_existing_metadata(tmp_path, lerobot_dataset_metadata_factory, info_factory):
    """When metadata files exist on disk, __init__ loads them correctly."""
    root = tmp_path / "load_test"
    info = info_factory(total_episodes=3, total_frames=150, total_tasks=1, use_videos=False)
    meta = lerobot_dataset_metadata_factory(root=root, info=info)

    assert meta.total_episodes == 3
    assert meta.total_frames == 150
    assert meta.fps == info["fps"]


# ── Property accessors ───────────────────────────────────────────────


def test_property_accessors_reflect_info(tmp_path):
    """Properties return values consistent with the info dict."""
    root = tmp_path / "props_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/props",
        fps=DEFAULT_FPS,
        features=IMAGE_FEATURES,
        robot_type=DUMMY_ROBOT_TYPE,
        root=root,
        use_videos=False,
    )

    assert meta.fps == DEFAULT_FPS
    assert meta.robot_type == DUMMY_ROBOT_TYPE
    # shapes should be tuples
    for _key, shape in meta.shapes.items():
        assert isinstance(shape, tuple)
    # image_keys should contain the image feature
    assert "observation.images.laptop" in meta.image_keys
    # camera_keys is a superset of image_keys and video_keys
    assert set(meta.image_keys + meta.video_keys) == set(meta.camera_keys)


def test_data_path_is_formattable(tmp_path):
    """data_path contains format placeholders that can be .format()-ed."""
    root = tmp_path / "fmt_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/fmt", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )

    formatted = meta.data_path.format(chunk_index=0, file_index=0)
    assert "chunk" in formatted.lower() or "0" in formatted


# ── Task management ──────────────────────────────────────────────────


def test_save_episode_tasks_creates_tasks_dataframe(tmp_path):
    """On a fresh metadata, save_episode_tasks() creates the tasks DataFrame."""
    root = tmp_path / "task_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/task", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )
    assert meta.tasks is None

    meta.save_episode_tasks(["Pick up the cube"])

    assert meta.tasks is not None
    assert len(meta.tasks) == 1
    assert "Pick up the cube" in meta.tasks.index


def test_save_episode_tasks_is_additive(tmp_path):
    """New tasks are added; existing tasks keep their original index."""
    root = tmp_path / "additive_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/add", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )

    meta.save_episode_tasks(["Task A"])
    idx_a = meta.get_task_index("Task A")

    meta.save_episode_tasks(["Task A", "Task B"])
    assert meta.get_task_index("Task A") == idx_a  # unchanged
    assert meta.get_task_index("Task B") is not None
    assert len(meta.tasks) == 2


def test_get_task_index_returns_none_for_unknown(tmp_path):
    """get_task_index() returns None for an unknown task."""
    root = tmp_path / "unknown_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/unknown", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )
    meta.save_episode_tasks(["Known task"])

    assert meta.get_task_index("Known task") == 0
    assert meta.get_task_index("Unknown task") is None


def test_save_episode_tasks_rejects_duplicates(tmp_path):
    """save_episode_tasks() raises ValueError on duplicate task strings."""
    root = tmp_path / "dup_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/dup", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )

    with pytest.raises(ValueError):
        meta.save_episode_tasks(["Same task", "Same task"])


# ── Episode saving ───────────────────────────────────────────────────


def test_save_episode_increments_counters(tmp_path):
    """After save_episode(), total_episodes and total_frames increase."""
    root = tmp_path / "ep_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/ep", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )
    meta.save_episode_tasks(["Task 1"])
    stats = _make_dummy_stats(meta.features)

    meta.save_episode(
        episode_index=0,
        episode_length=10,
        episode_tasks=["Task 1"],
        episode_stats=stats,
        episode_metadata={},
    )

    assert meta.total_episodes == 1
    assert meta.total_frames == 10


def test_save_episode_updates_stats(tmp_path):
    """After save_episode(), .stats is non-None and has feature keys."""
    root = tmp_path / "stats_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/stats", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )
    meta.save_episode_tasks(["Task 1"])
    stats = _make_dummy_stats(meta.features)

    meta.save_episode(
        episode_index=0,
        episode_length=5,
        episode_tasks=["Task 1"],
        episode_stats=stats,
        episode_metadata={},
    )

    assert meta.stats is not None
    # Stats should contain at least the user-defined feature keys
    for key in SIMPLE_FEATURES:
        assert key in meta.stats


# ── Chunk settings ───────────────────────────────────────────────────


def test_update_chunk_settings_persists(tmp_path):
    """update_chunk_settings() changes values and writes info.json."""
    root = tmp_path / "chunk_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/chunk", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )
    original = meta.get_chunk_settings()

    meta.update_chunk_settings(chunks_size=500)
    assert meta.chunks_size == 500
    assert meta.chunks_size != original["chunks_size"] or original["chunks_size"] == 500

    # Verify persisted
    with open(root / INFO_PATH) as f:
        info_on_disk = json.load(f)
    assert info_on_disk["chunks_size"] == 500


def test_update_chunk_settings_rejects_non_positive(tmp_path):
    """update_chunk_settings() raises ValueError for <= 0 values."""
    root = tmp_path / "bad_chunk"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/bad", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )

    with pytest.raises(ValueError):
        meta.update_chunk_settings(chunks_size=0)
    with pytest.raises(ValueError):
        meta.update_chunk_settings(data_files_size_in_mb=-1)


# ── Finalization ─────────────────────────────────────────────────────


def test_finalize_is_idempotent(tmp_path):
    """Calling finalize() multiple times does not raise."""
    root = tmp_path / "fin_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/fin", fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=root, use_videos=False
    )

    meta.finalize()
    meta.finalize()  # second call should not raise


def test_finalize_flushes_buffered_metadata(tmp_path):
    """Episodes saved before finalize() are written to parquet."""
    root = tmp_path / "flush_ds"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/flush",
        fps=DEFAULT_FPS,
        features=SIMPLE_FEATURES,
        root=root,
        use_videos=False,
        metadata_buffer_size=100,  # large buffer so nothing auto-flushes
    )
    meta.save_episode_tasks(["Task 1"])
    stats = _make_dummy_stats(meta.features)

    # Save a few episodes (won't auto-flush since buffer_size=100)
    for i in range(3):
        meta.save_episode(
            episode_index=i,
            episode_length=5,
            episode_tasks=["Task 1"],
            episode_stats=stats,
            episode_metadata={},
        )

    # Before finalize, the parquet might not exist yet
    meta.finalize()

    # After finalize, episodes parquet should exist
    episodes_dir = root / "meta" / "episodes"
    assert episodes_dir.exists()
    parquet_files = list(episodes_dir.rglob("*.parquet"))
    assert len(parquet_files) > 0

@@ -0,0 +1,168 @@
#!/usr/bin/env python

# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contract tests for DatasetReader."""

from lerobot.datasets.dataset_reader import DatasetReader
from lerobot.datasets.video_utils import get_safe_default_codec

# ── Loading ──────────────────────────────────────────────────────────


def test_try_load_returns_true_when_data_exists(tmp_path, lerobot_dataset_factory):
    """Given a fully written dataset, try_load() returns True."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=2, total_frames=20, use_videos=False
    )
    reader = DatasetReader(
        meta=dataset.meta,
        root=dataset.root,
        episodes=None,
        tolerance_s=1e-4,
        video_backend=get_safe_default_codec(),
        delta_timestamps=None,
        image_transforms=None,
    )
    assert reader.try_load() is True
    assert reader.hf_dataset is not None


def test_try_load_returns_false_when_no_data(tmp_path):
    """When only metadata exists (no data/ parquets), try_load() returns False."""
    from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata

    root = tmp_path / "meta_only"
    features = {"state": {"dtype": "float32", "shape": (2,), "names": None}}
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/meta_only", fps=30, features=features, root=root, use_videos=False
    )

    reader = DatasetReader(
        meta=meta,
        root=meta.root,
        episodes=None,
        tolerance_s=1e-4,
        video_backend=get_safe_default_codec(),
        delta_timestamps=None,
        image_transforms=None,
    )
    assert reader.try_load() is False
    assert reader.hf_dataset is None


# ── Counts ───────────────────────────────────────────────────────────


def test_num_frames_without_filter(tmp_path, lerobot_dataset_factory):
    """With episodes=None, num_frames equals total_frames."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=3, total_frames=60, use_videos=False
    )
    assert dataset.reader.num_frames == dataset.meta.total_frames


def test_num_episodes_without_filter(tmp_path, lerobot_dataset_factory):
    """With episodes=None, num_episodes equals total_episodes."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=3, total_frames=60, use_videos=False
    )
    assert dataset.reader.num_episodes == dataset.meta.total_episodes


def test_num_frames_with_episode_filter(tmp_path, lerobot_dataset_factory):
    """When filtering to a subset, only those episodes' frames are counted."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=5, total_frames=100, episodes=[0, 2], use_videos=False
    )
    # Filtered frames should be less than total
    assert dataset.reader.num_frames <= dataset.meta.total_frames
    assert dataset.reader.num_episodes == 2


# ── get_item ─────────────────────────────────────────────────────────


def test_get_item_returns_expected_keys(tmp_path, lerobot_dataset_factory):
    """get_item(0) returns a dict with expected keys."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=1, total_frames=10, use_videos=False
    )
    item = dataset.reader.get_item(0)

    # Standard keys that must always be present
    for key in ["index", "episode_index", "frame_index", "timestamp", "task_index", "task"]:
        assert key in item, f"Missing key: {key}"


def test_get_item_values_are_correct(tmp_path, lerobot_dataset_factory):
    """get_item() returns correct index and episode_index."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=2, total_frames=20, use_videos=False
    )
    item_0 = dataset.reader.get_item(0)

    assert item_0["index"].item() == 0
    assert item_0["episode_index"].item() == 0


# ── Transforms ───────────────────────────────────────────────────────


def test_image_transforms_are_applied(tmp_path, lerobot_dataset_factory):
    """When image_transforms is provided, get_item() applies it to camera keys."""
    transform_called = {"count": 0}

    def sentinel_transform(img):
        transform_called["count"] += 1
        return img

    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds",
        total_episodes=1,
        total_frames=5,
        use_videos=False,
        image_transforms=sentinel_transform,
    )
    item = dataset[0]  # noqa: F841

    # Should have been called once per camera key per frame
    num_cameras = len(dataset.meta.camera_keys)
    if num_cameras > 0:
        assert transform_called["count"] >= 1


# ── File paths ───────────────────────────────────────────────────────


def test_get_episodes_file_paths_returns_data_paths(tmp_path, lerobot_dataset_factory):
    """get_episodes_file_paths() returns paths including data/ paths."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=2, total_frames=20, use_videos=False
    )
    paths = dataset.reader.get_episodes_file_paths()

    assert len(paths) > 0
    assert any("data/" in str(p) for p in paths)


def test_get_episodes_file_paths_includes_video_paths(tmp_path, lerobot_dataset_factory):
    """When dataset has video keys, file paths include video/ paths."""
    dataset = lerobot_dataset_factory(
        root=tmp_path / "ds", total_episodes=2, total_frames=20, use_videos=True
    )

    if len(dataset.meta.video_keys) > 0:
        paths = dataset.reader.get_episodes_file_paths()
        assert any("video" in str(p).lower() for p in paths)

Some files were not shown because too many files have changed in this diff.