mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-12 15:19:43 +00:00
Compare commits
24 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 8e21268c29 | |||
| 4130d4a4a5 | |||
| 47bb840a55 | |||
| 9519ff5e09 | |||
| 32a27cae8a | |||
| 8cee56e2d6 | |||
| a76874f35e | |||
| 35bb2c7459 | |||
| 051f6c6803 | |||
| 04ae0312a2 | |||
| cc634de9e7 | |||
| 3eda5712d3 | |||
| 783ec6e232 | |||
| 4e3175ff15 | |||
| edd7fc52a8 | |||
| 0f0f8b8961 | |||
| 79db54dc34 | |||
| 6ae07878f7 | |||
| 10d05e03bc | |||
| f2c29d78cf | |||
| 8bc47e4318 | |||
| 49f32b9796 | |||
| f55782f9f7 | |||
| 05a2604d6e |
@@ -83,13 +83,10 @@ jobs:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
# Build the benchmark-specific image. The Dockerfile separates dep-install
|
||||
# from source-copy, so code-only changes skip the slow uv-sync layer
|
||||
@@ -118,7 +115,7 @@ jobs:
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_libero \
|
||||
--policy.path=pepijn223/smolvla_libero \
|
||||
--env.type=libero \
|
||||
--env.task=libero_spatial \
|
||||
--eval.batch_size=1 \
|
||||
@@ -147,7 +144,7 @@ jobs:
|
||||
--artifacts-dir /tmp/libero-artifacts \
|
||||
--env libero \
|
||||
--task libero_spatial \
|
||||
--policy lerobot/smolvla_libero
|
||||
--policy pepijn223/smolvla_libero
|
||||
|
||||
- name: Upload Libero rollout video
|
||||
if: always()
|
||||
@@ -241,13 +238,10 @@ jobs:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build MetaWorld benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
@@ -270,7 +264,7 @@ jobs:
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_metaworld \
|
||||
--policy.path=pepijn223/smolvla_metaworld \
|
||||
--env.type=metaworld \
|
||||
--env.task=metaworld-push-v3 \
|
||||
--eval.batch_size=1 \
|
||||
@@ -299,7 +293,7 @@ jobs:
|
||||
--artifacts-dir /tmp/metaworld-artifacts \
|
||||
--env metaworld \
|
||||
--task metaworld-push-v3 \
|
||||
--policy lerobot/smolvla_metaworld
|
||||
--policy pepijn223/smolvla_metaworld
|
||||
|
||||
- name: Upload MetaWorld rollout video
|
||||
if: always()
|
||||
@@ -316,636 +310,3 @@ jobs:
|
||||
name: metaworld-metrics
|
||||
path: /tmp/metaworld-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOTWIN 2.0 ──────────────────────────────────────────────────────────
|
||||
# Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo,
|
||||
# pytorch3d, + simulation assets (~4 GB).
|
||||
# Build takes ~20 min on first run; subsequent runs hit the layer cache.
|
||||
# Requires an NVIDIA GPU runner with CUDA 12.1 drivers.
|
||||
robotwin-integration-test:
|
||||
name: RoboTwin 2.0 — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
ROBOTWIN_POLICY: lerobot/smolvla_robotwin
|
||||
ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
# Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d +
|
||||
# simulation assets (~4 GB). Layer cache lives in the runner's local
|
||||
# Docker daemon — reused across re-runs on the same machine.
|
||||
- name: Build RoboTwin 2.0 benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robotwin
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robotwin:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-robotwin
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max
|
||||
|
||||
- name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
# Named container (no --rm) so we can docker cp artifacts out.
|
||||
docker run --name robotwin-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
|
||||
-e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
|
||||
lerobot-benchmark-robotwin:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
cd /opt/robotwin && lerobot-eval \
|
||||
--policy.path=\"\$ROBOTWIN_POLICY\" \
|
||||
--env.type=robotwin \
|
||||
--env.task=\"\$ROBOTWIN_TASKS\" \
|
||||
--env.max_parallel_tasks=5 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python /lerobot/scripts/ci/extract_task_descriptions.py \
|
||||
--env robotwin \
|
||||
--task \"\$ROBOTWIN_TASKS\" \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboTwin artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robotwin-artifacts
|
||||
docker cp robotwin-eval:/tmp/eval-artifacts/. /tmp/robotwin-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robotwin-eval || true
|
||||
|
||||
- name: Parse RoboTwin eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robotwin-artifacts \
|
||||
--env robotwin \
|
||||
--task "${ROBOTWIN_TASKS}" \
|
||||
--policy "${ROBOTWIN_POLICY}"
|
||||
|
||||
- name: Upload RoboTwin rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: robotwin-rollout-video
|
||||
path: /tmp/robotwin-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboTwin eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: robotwin-metrics
|
||||
path: /tmp/robotwin-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOCASA365 ──────────────────────────────────────────────────────────
|
||||
# Isolated image: robocasa + robosuite installed manually as editable
|
||||
# clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins
|
||||
# `lerobot==0.3.3`, which would shadow this repo's lerobot).
|
||||
robocasa-integration-test:
|
||||
name: RoboCasa365 — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build RoboCasa365 benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robocasa
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robocasa:ci
|
||||
|
||||
- name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name robocasa-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e MUJOCO_GL=egl \
|
||||
lerobot-benchmark-robocasa:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
|
||||
--env.max_parallel_tasks=5 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env robocasa \
|
||||
--task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboCasa365 artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robocasa-artifacts
|
||||
docker cp robocasa-eval:/tmp/eval-artifacts/. /tmp/robocasa-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robocasa-eval || true
|
||||
|
||||
- name: Parse RoboCasa365 eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robocasa-artifacts \
|
||||
--env robocasa \
|
||||
--task atomic_smoke_10 \
|
||||
--policy lerobot/smolvla_robocasa
|
||||
|
||||
- name: Upload RoboCasa365 rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocasa-rollout-video
|
||||
path: /tmp/robocasa-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboCasa365 eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocasa-metrics
|
||||
path: /tmp/robocasa-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOCEREBRA ───────────────────────────────────────────────────────────
|
||||
# Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera
|
||||
# defaults (image/wrist_image). The image is layered on
|
||||
# huggingface/lerobot-gpu, which already ships [libero] as part of [all].
|
||||
robocerebra-integration-test:
|
||||
name: RoboCerebra — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build RoboCerebra benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robocerebra
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robocerebra:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-robocerebra
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max
|
||||
|
||||
- name: Run RoboCerebra smoke eval (1 episode)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name robocerebra-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e LIBERO_DATA_FOLDER=/tmp/libero_data \
|
||||
lerobot-benchmark-robocerebra:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocerebra \
|
||||
--env.type=libero \
|
||||
--env.task=libero_10 \
|
||||
--env.fps=20 \
|
||||
--env.obs_type=pixels_agent_pos \
|
||||
--env.observation_height=256 \
|
||||
--env.observation_width=256 \
|
||||
'--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
|
||||
--policy.empty_cameras=1 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env libero --task libero_10 \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboCerebra artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robocerebra-artifacts
|
||||
docker cp robocerebra-eval:/tmp/eval-artifacts/. /tmp/robocerebra-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robocerebra-eval || true
|
||||
|
||||
- name: Parse RoboCerebra eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robocerebra-artifacts \
|
||||
--env robocerebra \
|
||||
--task libero_10 \
|
||||
--policy lerobot/smolvla_robocerebra
|
||||
|
||||
- name: Upload RoboCerebra rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocerebra-rollout-video
|
||||
path: /tmp/robocerebra-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboCerebra eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocerebra-metrics
|
||||
path: /tmp/robocerebra-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOMME ───────────────────────────────────────────────────────────────
|
||||
# Isolated image: mani-skill/SAPIEN/Vulkan chain with gymnasium and numpy
|
||||
# overrides (robomme can't be a pyproject extra due to numpy<2 pin).
|
||||
robomme-integration-test:
|
||||
name: RoboMME — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
ROBOMME_POLICY: lerobot/smolvla_robomme
|
||||
ROBOMME_TASKS: PickXtimes,BinFill,StopCube,MoveCube,InsertPeg,SwingXtimes,VideoUnmask,ButtonUnmask,PickHighlight,PatternLock
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build RoboMME benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robomme
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robomme:ci
|
||||
|
||||
- name: Run RoboMME smoke eval (10 tasks, 1 episode each)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name robomme-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e ROBOMME_POLICY="${ROBOMME_POLICY}" \
|
||||
-e ROBOMME_TASKS="${ROBOMME_TASKS}" \
|
||||
lerobot-benchmark-robomme:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=\"\$ROBOMME_POLICY\" \
|
||||
--env.type=robomme \
|
||||
--env.task=\"\$ROBOMME_TASKS\" \
|
||||
--env.dataset_split=test \
|
||||
--env.task_ids=[0] \
|
||||
--env.max_parallel_tasks=5 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
|
||||
--policy.empty_cameras=3 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env robomme --task \"\$ROBOMME_TASKS\" \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboMME artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robomme-artifacts
|
||||
docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robomme-eval || true
|
||||
|
||||
- name: Parse RoboMME eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robomme-artifacts \
|
||||
--env robomme \
|
||||
--task "${ROBOMME_TASKS}" \
|
||||
--policy "${ROBOMME_POLICY}"
|
||||
|
||||
- name: Upload RoboMME rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robomme-rollout-video
|
||||
path: /tmp/robomme-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboMME eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robomme-metrics
|
||||
path: /tmp/robomme-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── LIBERO-plus ───────────────────────────────────────────────────────────
|
||||
# Isolated image: LIBERO-plus fork cloned into /home/user_lerobot on top of
|
||||
# huggingface/lerobot-gpu (see docker/Dockerfile.benchmark.libero_plus).
|
||||
libero-plus-integration-test:
|
||||
name: LIBERO-plus — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
LIBERO_PLUS_SUITE: libero_spatial
|
||||
LIBERO_PLUS_POLICY: lerobot/smolvla_libero_plus
|
||||
LIBERO_PLUS_TASK_IDS: "[0,100,260,500,1000,1500,2000,2400]"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build LIBERO-plus benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.libero_plus
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-libero-plus:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-libero-plus
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max
|
||||
|
||||
- name: Run LIBERO-plus smoke eval (1 episode)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name libero-plus-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e LIBERO_PLUS_SUITE="${LIBERO_PLUS_SUITE}" \
|
||||
-e LIBERO_PLUS_POLICY="${LIBERO_PLUS_POLICY}" \
|
||||
-e LIBERO_PLUS_TASK_IDS="${LIBERO_PLUS_TASK_IDS}" \
|
||||
lerobot-benchmark-libero-plus:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=\"\$LIBERO_PLUS_POLICY\" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=\"\$LIBERO_PLUS_SUITE\" \
|
||||
--env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \
|
||||
--env.max_parallel_tasks=5 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
|
||||
--policy.empty_cameras=1 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env libero_plus --task \"\$LIBERO_PLUS_SUITE\" \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy LIBERO-plus artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/libero-plus-artifacts
|
||||
docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true
|
||||
docker rm -f libero-plus-eval || true
|
||||
|
||||
- name: Parse LIBERO-plus eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/libero-plus-artifacts \
|
||||
--env libero_plus \
|
||||
--task "${LIBERO_PLUS_SUITE}" \
|
||||
--policy "${LIBERO_PLUS_POLICY}"
|
||||
|
||||
- name: Upload LIBERO-plus rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: libero-plus-rollout-video
|
||||
path: /tmp/libero-plus-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload LIBERO-plus eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: libero-plus-metrics
|
||||
path: /tmp/libero-plus-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── VLABENCH ─────────────────────────────────────────────────────────────
|
||||
# Isolated image: lerobot[vlabench] only (VLABench, mujoco==3.2.2, dm-control chain)
|
||||
vlabench-integration-test:
|
||||
name: VLABench — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build VLABench benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.vlabench
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-vlabench:ci
|
||||
build-args: |
|
||||
VLABENCH_ASSETS_REPO=lerobot/vlabench-assets
|
||||
|
||||
- name: Run VLABench smoke eval (10 tasks, 1 episode each)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name vlabench-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e MUJOCO_GL=egl \
|
||||
lerobot-benchmark-vlabench:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
|
||||
--env.episode_length=50 \
|
||||
--env.max_parallel_tasks=5 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.second_image\": \"observation.images.camera2\", \"observation.images.wrist_image\": \"observation.images.camera3\"}' \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env vlabench \
|
||||
--task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy VLABench artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/vlabench-artifacts
|
||||
docker cp vlabench-eval:/tmp/eval-artifacts/. /tmp/vlabench-artifacts/ 2>/dev/null || true
|
||||
docker rm -f vlabench-eval || true
|
||||
|
||||
- name: Parse VLABench eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/vlabench-artifacts \
|
||||
--env vlabench \
|
||||
--task select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \
|
||||
--policy lerobot/smolvla_vlabench
|
||||
|
||||
- name: Upload VLABench rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: vlabench-rollout-video
|
||||
path: /tmp/vlabench-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload VLABench eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: vlabench-metrics
|
||||
path: /tmp/vlabench-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
@@ -33,7 +33,7 @@ jobs:
|
||||
github.event.workflow_run.event == 'pull_request' &&
|
||||
github.event.workflow_run.conclusion == 'success' &&
|
||||
github.repository == 'huggingface/lerobot'
|
||||
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
|
||||
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@9ad2de8582b56c017cb530c1165116d40433f1c6 # main
|
||||
with:
|
||||
package_name: lerobot
|
||||
secrets:
|
||||
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
github.repository == 'huggingface/lerobot'
|
||||
permissions:
|
||||
contents: read
|
||||
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
|
||||
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
|
||||
with:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: lerobot
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@2430c1ec91d04667414e2fa31ecfc36c153ea391 # main
|
||||
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
|
||||
with:
|
||||
commit_sha: ${{ github.event.pull_request.head.sha }}
|
||||
pr_number: ${{ github.event.number }}
|
||||
|
||||
@@ -24,14 +24,14 @@ on:
|
||||
|
||||
env:
|
||||
CLOSE_ISSUE_MESSAGE: >
|
||||
This issue was closed because it has been stalled for 30 days with no activity.
|
||||
This issue was closed because it has been stalled for 14 days with no activity.
|
||||
Feel free to reopen if is still relevant, or to ping a collaborator if you have any questions.
|
||||
CLOSE_PR_MESSAGE: >
|
||||
This PR was closed because it has been stalled for 30 days with no activity.
|
||||
This PR was closed because it has been stalled for 21 days with no activity.
|
||||
Feel free to reopen if is still relevant, or to ping a collaborator if you have any questions.
|
||||
WARN_ISSUE_MESSAGE: >
|
||||
This issue has been automatically marked as stale because it has not had
|
||||
recent activity (1 year). It will be closed if no further activity occurs.
|
||||
recent activity (6 months). It will be closed if no further activity occurs.
|
||||
Any change, comment or update to this issue will reset this count.
|
||||
Thank you for your contributions.
|
||||
WARN_PR_MESSAGE: >
|
||||
@@ -59,10 +59,10 @@ jobs:
|
||||
stale-pr-label: stale
|
||||
exempt-issue-labels: never-stale
|
||||
exempt-pr-labels: never-stale
|
||||
days-before-issue-stale: 365
|
||||
days-before-issue-close: 30
|
||||
days-before-issue-stale: 180
|
||||
days-before-issue-close: 14
|
||||
days-before-pr-stale: 365
|
||||
days-before-pr-close: 30
|
||||
days-before-pr-close: 21
|
||||
delete-branch: true
|
||||
close-issue-message: ${{ env.CLOSE_ISSUE_MESSAGE }}
|
||||
close-pr-message: ${{ env.CLOSE_PR_MESSAGE }}
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
This file provides guidance to AI agents when working with code in this repository.
|
||||
|
||||
> **User-facing help → [`AGENT_GUIDE.md`](./AGENT_GUIDE.md)** (SO-101 setup, recording, picking a policy, training duration, eval — with copy-pasteable commands).
|
||||
|
||||
## Project Overview
|
||||
|
||||
LeRobot is a PyTorch-based library for real-world robotics, providing datasets, pretrained policies, and tools for training, evaluation, data collection, and robot control. It integrates with Hugging Face Hub for model/dataset sharing.
|
||||
|
||||
-410
@@ -1,410 +0,0 @@
|
||||
# AGENT_GUIDE.md — LeRobot Helper for AI Agents & Users
|
||||
|
||||
This file is a practical, copy-paste-friendly companion for any AI agent (Cursor, Claude, ChatGPT, Codex, etc.) helping a user work with LeRobot. It complements [`AGENTS.md`](./AGENTS.md) (dev/contributor context) with **user-facing guidance**: how to start, what to train, how long, how to record, and how to calibrate an SO-101.
|
||||
|
||||
---
|
||||
|
||||
## 1. Start here — ask the user first (MANDATORY)
|
||||
|
||||
Before suggesting any command, an agent MUST ask the user at least these questions and wait for answers:
|
||||
|
||||
1. **What's your goal?** (e.g. "teach my SO-101 to fold a cloth", "train a policy on an existing HF dataset", "contribute a PR", "understand the codebase")
|
||||
2. **What hardware do you have?**
|
||||
- Robot: none / SO-100 / SO-101 / Koch / LeKiwi / Reachy / other
|
||||
- Teleop: leader arm / phone / keyboard / gamepad / none
|
||||
- Cameras: how many, resolution, fixed or moving?
|
||||
3. **What machine will you train on?**
|
||||
- GPU model + VRAM (e.g. "laptop 3060 6 GB", "RTX 4090 24 GB", "A100 80 GB", "CPU only")
|
||||
- OS: macOS / Linux / Windows
|
||||
4. **Skill level & time budget?** First time, some ML, experienced? Hours, days, a weekend?
|
||||
5. **Do you already have a dataset?** Yes (HF repo id?) / no / want to record one
|
||||
6. **How can I help right now?** (pick one concrete next step)
|
||||
|
||||
Only after you have answers, propose a concrete path. If something is ambiguous, ask again rather than guessing. Bias toward **the simplest thing that works** for the user's hardware and goal.
|
||||
|
||||
---
|
||||
|
||||
## 2. LeRobot in 60 seconds
|
||||
|
||||
LeRobot = **datasets + policies + envs + robot control**, unified by a small set of strong abstractions.
|
||||
|
||||
- **`LeRobotDataset`** — episode-aware dataset (video or images + actions + state), loadable from the Hub or disk.
|
||||
- **Policies** (`ACT`, `Diffusion`, `SmolVLA`, `π0`, `π0.5`, `Wall-X`, `X-VLA`, `VQ-BeT`, `TD-MPC`, …) — all inherit `PreTrainedPolicy` and can be pushed/pulled from the Hub.
|
||||
- **Processors** — small composable transforms between dataset → policy → robot.
|
||||
- **Envs** (sim) and **Robots** (real) — same action/observation contract so code swaps cleanly.
|
||||
- **CLI** — `lerobot-record`, `lerobot-train`, `lerobot-eval`, `lerobot-teleoperate`, `lerobot-calibrate`, `lerobot-find-port`, `lerobot-setup-motors`, `lerobot-replay`.
|
||||
|
||||
See [`AGENTS.md`](./AGENTS.md) for repo architecture.
|
||||
|
||||
---
|
||||
|
||||
## 3. Quickstart paths (pick one)
|
||||
|
||||
### Path A — "I have an SO-101 and want my first trained policy"
|
||||
|
||||
Go to §4 (SO-101 end-to-end), then §5 (data tips), then §6 (pick a policy — likely **ACT**), then §7 (how long), then §8 (eval).
|
||||
|
||||
### Path B — "No hardware, I want to train on an existing dataset"
|
||||
|
||||
Skip §4. Pick a policy in §6, pick a duration in §7, then run `lerobot-train` per §4.9 with a Hub `--dataset.repo_id` and an `--env.type` for eval. Finish with §8.
|
||||
|
||||
### Path C — "I just want to understand the codebase"
|
||||
|
||||
Read §2 above, then `AGENTS.md` "Architecture", then open `src/lerobot/policies/act/` and `src/lerobot/datasets/lerobot_dataset.py` as canonical examples.
|
||||
|
||||
---
|
||||
|
||||
## 4. SO-101 end-to-end cheat-sheet
|
||||
|
||||
Full details in [`docs/source/so101.mdx`](./docs/source/so101.mdx) and [`docs/source/il_robots.mdx`](./docs/source/il_robots.mdx). Minimum commands in order. Confirm arms are assembled + powered before issuing.
|
||||
|
||||
**4.1 Install**
|
||||
|
||||
```bash
|
||||
pip install 'lerobot[feetech]' # SO-100/SO-101 motor stack
|
||||
# pip install 'lerobot[all]' # everything
|
||||
# pip install 'lerobot[aloha,pusht]' # specific features
|
||||
# pip install 'lerobot[smolvla]' # add SmolVLA deps
|
||||
git lfs install && git lfs pull
|
||||
hf auth login # required to push datasets/policies
|
||||
```
|
||||
|
||||
Contributors can alternatively use `uv sync --locked --extra feetech` (see `AGENTS.md`).
|
||||
|
||||
**4.2 Find USB ports** — run once per arm, unplug when prompted.
|
||||
|
||||
```bash
|
||||
lerobot-find-port
|
||||
```
|
||||
|
||||
macOS: `/dev/tty.usbmodem...`; Linux: `/dev/ttyACM0` (may need `sudo chmod 666 /dev/ttyACM0`).
|
||||
|
||||
**4.3 Setup motor IDs & baudrate** (one-time, per arm)
|
||||
|
||||
```bash
|
||||
lerobot-setup-motors --robot.type=so101_follower --robot.port=<FOLLOWER_PORT>
|
||||
lerobot-setup-motors --teleop.type=so101_leader --teleop.port=<LEADER_PORT>
|
||||
```
|
||||
|
||||
**4.4 Calibrate** — center all joints, press Enter, sweep each joint through its full range. The `id` is the calibration key — reuse it everywhere.
|
||||
|
||||
```bash
|
||||
lerobot-calibrate --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower
|
||||
lerobot-calibrate --teleop.type=so101_leader --teleop.port=<LEADER_PORT> --teleop.id=my_leader
|
||||
```
|
||||
|
||||
**4.5 Teleoperate** (sanity check, no recording)
|
||||
|
||||
```bash
|
||||
lerobot-teleoperate \
|
||||
--robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
|
||||
--teleop.type=so101_leader --teleop.port=<LEADER_PORT> --teleop.id=my_leader \
|
||||
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
|
||||
--display_data=true
|
||||
```
|
||||
|
||||
> **Feetech timeout / comms error on SO-100 / SO-101?** Before touching software, check the **red motor LEDs** on the daisy chain.
|
||||
>
|
||||
> - **All steady red, gripper → base chain** → wiring OK.
|
||||
> - **One or more motors dark / chain stops mid-way** → wiring issue: reseat the 3-pin cables, check the controller-board power supply, and make sure each motor is fully clicked in.
|
||||
> - **LEDs blinking** → the motor is in an **error state**: usually overload (forcing a joint past its limit) **or wrong power supply voltage**. SO-100 / SO-101 ship in two variants — a **5 V / 7.4 V** build and a **12 V** build — they are NOT interchangeable. Using a 12 V PSU on a 5 V / 7.4 V arm (or vice-versa) will trip this error; confirm your motor variant before powering up.
|
||||
>
|
||||
> Most "timeout" errors are physical, not code.
|
||||
|
||||
**4.6 Record a dataset** — keys: **→** next, **←** redo, **ESC** finish & upload.
|
||||
|
||||
```bash
|
||||
HF_USER=$(NO_COLOR=1 hf auth whoami | awk -F': *' 'NR==1 {print $2}')
|
||||
|
||||
lerobot-record \
|
||||
--robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
|
||||
--teleop.type=so101_leader --teleop.port=<LEADER_PORT> --teleop.id=my_leader \
|
||||
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
|
||||
--dataset.repo_id=${HF_USER}/my_task \
|
||||
--dataset.single_task="<describe the task in one sentence>" \
|
||||
--dataset.num_episodes=50 \
|
||||
--dataset.episode_time_s=30 \
|
||||
--dataset.reset_time_s=10 \
|
||||
--display_data=true
|
||||
```
|
||||
|
||||
**4.7 Visualize** — **always** do this before training. Look for missing frames, camera blur, unreachable targets, inconsistent object positions.
|
||||
After upload: https://huggingface.co/spaces/lerobot/visualize_dataset → paste `${HF_USER}/my_task`. Works for **any LeRobot-formatted Hub dataset** — use it to scout other datasets, inspect episode quality, or debug your own data before retraining.
|
||||
|
||||
**4.8 Replay an episode** (sanity check)
|
||||
|
||||
```bash
|
||||
lerobot-replay --robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
|
||||
--dataset.repo_id=${HF_USER}/my_task --dataset.episode=0
|
||||
```
|
||||
|
||||
**4.9 Train** (default: ACT — fastest, lowest memory). Apple silicon: `--policy.device=mps`. See §6/§7 for policy and duration.
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=${HF_USER}/my_task \
|
||||
--policy.type=act \
|
||||
--policy.device=cuda \
|
||||
--output_dir=outputs/train/act_my_task \
|
||||
--job_name=act_my_task \
|
||||
--batch_size=8 \
|
||||
--wandb.enable=true \
|
||||
--policy.repo_id=${HF_USER}/act_my_task
|
||||
```
|
||||
|
||||
**4.10 Evaluate on the real robot** — compare success rate to a teleoperated baseline.
|
||||
|
||||
```bash
|
||||
lerobot-record \
|
||||
--robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
|
||||
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
|
||||
--dataset.repo_id=${HF_USER}/eval_my_task \
|
||||
--dataset.single_task="<same task description as training>" \
|
||||
--dataset.num_episodes=10 \
|
||||
--policy.path=${HF_USER}/act_my_task
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Data collection tips (beginner → reliable policy)
|
||||
|
||||
Good data beats clever models. Adopt these defaults and deviate only with evidence.
|
||||
|
||||
### 5.1 Setup & ergonomics
|
||||
|
||||
- **Fix the rig and cameras** before touching the software. If the rig vibrates or the operator gets frustrated, fix that first — more bad data won't help.
|
||||
- **Lighting matters more than resolution.** Diffuse, consistent light. Avoid moving shadows.
|
||||
- **"Can you do the task from the camera view alone?"** If no, your cameras are wrong. Fix before recording.
|
||||
- Enable **action interpolation** for rollouts when available for smoother trajectories.
|
||||
|
||||
### 5.2 Practice before you record
|
||||
|
||||
- Do 5–10 demos without recording. Build a deliberate, repeatable strategy.
|
||||
- Hesitant or inconsistent demos teach the model hesitation.
|
||||
|
||||
### 5.3 Quality over speed
|
||||
|
||||
Deliberate, high-quality execution beats fast sloppy runs. Optimize for speed only **after** strategy is dialed in — never trade quality for it.
|
||||
|
||||
### 5.4 Consistency within and across episodes
|
||||
|
||||
Same grasp, approach vector, and timing. Coherent strategies are much easier to learn than wildly varying movements.
|
||||
|
||||
### 5.5 Start small, then extend (the golden rule)
|
||||
|
||||
- **First 50 episodes = constrained version** of the task: one object, fixed position, fixed camera setup, one operator.
|
||||
- Train a quick ACT model. See what fails.
|
||||
- **Then add diversity** along one axis at a time: more positions → more lighting → more objects → more operators.
|
||||
- Don't try to collect the "perfect dataset" on day one. Iterate.
|
||||
|
||||
### 5.6 Policy choice for beginners
|
||||
|
||||
- **Laptop / first time / want results fast → ACT.** Works surprisingly well, trains fast even on a laptop GPU.
|
||||
- **Bigger GPU / language-conditioned / multi-task → SmolVLA.** Unfreezing the vision encoder (see §7) is a big win here.
|
||||
- Defer π0 / π0.5 / Wall-X / X-VLA until you have a proven ACT baseline and a 20+ GB GPU.
|
||||
|
||||
### 5.7 Recommended defaults for your first task
|
||||
|
||||
| Setting | Value |
|
||||
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Episodes | **50** to start, scale to 100–300 after first training |
|
||||
| Episode length | 20–45 s (shorter is fine for grasp/place) |
|
||||
| Reset time | 10 s |
|
||||
| FPS | 30 |
|
||||
| Cameras | **2 cameras recommended**: 1 fixed front + 1 wrist. Multi-view often outperforms single-view. A single fixed camera also works to keep things simple. |
|
||||
| Task description | Short, specific, action-phrased sentence |
|
||||
|
||||
### 5.8 Troubleshooting signal
|
||||
|
||||
- Policy fails at one specific stage → record 10–20 more episodes **targeting that stage**.
|
||||
- Policy flaps / oscillates → likely inconsistent demos, or need more training; re-record worst episodes (use **←** to redo).
|
||||
- Policy ignores the object → camera framing or lighting issue, not a model issue.
|
||||
|
||||
See also: [What makes a good dataset](https://huggingface.co/blog/lerobot-datasets#what-makes-a-good-dataset).
|
||||
|
||||
---
|
||||
|
||||
## 6. Which policy should I train?
|
||||
|
||||
Match the policy to the user's **GPU memory** and **time budget**. Numbers below come from an internal profiling run (one training update per policy). They are **indicative only** — see caveats.
|
||||
|
||||
### 6.1 Profiling snapshot (indicative)
|
||||
|
||||
All policies typically train for **5–10 epochs** (see §7).
|
||||
|
||||
| Policy | Batch | Update (ms) | Peak GPU mem (GB) | Best for |
|
||||
| ----------- | ----: | ----------: | ----------------: | ------------------------------------------------------------------------------------------------ |
|
||||
| `act` | 4 | **83.9** | **0.94** | First-time users, laptops, single-task. Fast and reliable. |
|
||||
| `diffusion` | 4 | 168.6 | 4.94 | Multi-modal action distributions; needs mid-range GPU. |
|
||||
| `smolvla` | 1 | 357.8 | 3.93 | Language-conditioned, multi-task, small VLA. **Unfreeze vision encoder for big gains** (see §7). |
|
||||
| `xvla` | 1 | 731.6 | 15.52 | Large VLA, multi-task. |
|
||||
| `wall_x` | 1 | 716.5 | 15.95 | Large VLA with world-model objective. |
|
||||
| `pi0` | 1 | 940.3 | 15.50 | Strong large VLA baseline (Physical Intelligence). |
|
||||
| `pi05` | 1 | 1055.8 | 16.35 | Newer π policy; similar footprint to `pi0`. |
|
||||
|
||||
**Critical caveats:**
|
||||
|
||||
- **Optimizer:** measured with **SGD**. LeRobot's default is **AdamW**, which keeps extra optimizer state → **peak memory will be noticeably higher** with the default, especially for `pi0`, `pi05`, `wall_x`, `xvla`.
|
||||
- **Batch size:** the large policies were profiled at batch 1. In practice use a **larger batch** for stable training (see §7.4). Memory scales roughly linearly with batch.
|
||||
|
||||
### 6.2 Decision rules
|
||||
|
||||
- **< 8 GB VRAM (laptop, 3060, M-series Mac):** → `act`. Maybe `diffusion` if you have ~6–8 GB free.
|
||||
- **12–16 GB VRAM (4070/4080, A4000):** → `smolvla` with defaults, or `act`/`diffusion` with larger batch. `pi0`/`pi05`/`wall_x`/`xvla` feasible only with small batch + gradient accumulation.
|
||||
- **24+ GB VRAM (3090/4090/A5000):** → any policy. Prefer `smolvla` (unfrozen) for multi-task; `act` for single-task grasp-and-place (still often the best ROI). Could experiment with `pi0` or `pi05` or `xvla`
|
||||
- **80 GB (A100/H100):** → any, with healthy batch. `pi05`, `xvla`, `wall_x` become comfortable.
|
||||
- **CPU only:** → don't train here. Use Google Colab (see [`docs/source/notebooks.mdx`](./docs/source/notebooks.mdx)) or a rented GPU.
|
||||
|
||||
---
|
||||
|
||||
## 7. How long should I train?
|
||||
|
||||
Robotics imitation learning usually converges in a **few epochs over the dataset**, not hundreds of thousands of raw steps. Think **epochs first**, then translate to steps.
|
||||
|
||||
### 7.1 Rule of thumb
|
||||
|
||||
- **Typical total: 5–10 epochs.** Start at 5, eval, then decide if more helps.
|
||||
- Very small datasets (< 30 episodes) may want slightly more epochs — but first, **collect more data**.
|
||||
- VLAs with a pretrained vision backbone typically need **fewer** epochs than training from scratch.
|
||||
|
||||
### 7.2 Steps ↔ epochs conversion
|
||||
|
||||
```
|
||||
total_frames = sum of frames over all episodes # e.g. 50 eps × 30 fps × 30 s ≈ 45,000
|
||||
steps_per_epoch = ceil(total_frames / batch_size)
|
||||
total_steps = epochs × steps_per_epoch
|
||||
```
|
||||
|
||||
Examples for `--batch_size=8`:
|
||||
|
||||
| Dataset size | Frames | Steps / epoch | 5 epochs | 10 epochs |
|
||||
| ----------------------- | ------: | ------------: | -------: | --------: |
|
||||
| 50 eps × 30 s @ 30 fps | 45,000 | ~5,625 | 28k | 56k |
|
||||
| 100 eps × 30 s @ 30 fps | 90,000 | ~11,250 | 56k | 113k |
|
||||
| 300 eps × 30 s @ 30 fps | 270,000 | ~33,750 | 169k | 338k |
|
||||
|
||||
Pass the resulting total with `--steps=<N>`; eval at intermediate checkpoints (`outputs/train/.../checkpoints/`).
|
||||
|
||||
### 7.3 Per-policy starting points (single-task, ~50 episodes)
|
||||
|
||||
| Policy | Batch | Steps (first run) | Notes |
|
||||
| -------------- | ----: | ----------------: | ----------------------------------------------------------------- |
|
||||
| `act` | 8–16 | 30k–80k | Usually converges under 50k for single-task. |
|
||||
| `diffusion` | 8–16 | 80k–150k | Benefits from longer training than ACT. |
|
||||
| `smolvla` | 4–8 | 30k–80k | Pretrained VLM → converges fast. |
|
||||
| `pi0` / `pi05` | 1–4 | 30k–80k | Memory-bound; use gradient accumulation for effective batch ≥ 16! |
|
||||
|
||||
### 7.4 Batch size guidance
|
||||
|
||||
- **Bigger batch is preferable** for stable gradients on teleop data.
|
||||
- If GPU memory is the bottleneck, use **gradient accumulation** to raise _effective_ batch without raising peak memory.
|
||||
- Scale **learning rate** gently with batch; most LeRobot defaults work fine for a 2–4× batch change.
|
||||
|
||||
### 7.5 Scale LR schedule & checkpoints with `--steps`
|
||||
|
||||
LeRobot's default schedulers (e.g. SmolVLA's cosine decay) use `scheduler_decay_steps=30_000`, which is sized for long training runs. When you shorten training (e.g. 5k–10k steps on a small dataset), **scale the scheduler down to match** — otherwise the LR stays near the peak and never decays. Same for checkpoint frequency.
|
||||
|
||||
```bash
|
||||
lerobot-train ... \
|
||||
--steps=5000 \
|
||||
--policy.scheduler_decay_steps=5000 \
|
||||
--save_freq=5000
|
||||
```
|
||||
|
||||
Rule of thumb: set `scheduler_decay_steps ≈ steps`, and `save_freq` to whatever granularity you want for eval (e.g. every 1k–5k steps). Match `scheduler_warmup_steps` proportionally if your run is very short.
|
||||
|
||||
### 7.6 SmolVLA: unfreeze the vision encoder for real gains
|
||||
|
||||
SmolVLA ships with `freeze_vision_encoder=True`. Unfreezing usually **improves performance substantially** on specialized tasks, at the cost of more VRAM and slower steps. Enable with:
|
||||
|
||||
```bash
|
||||
lerobot-train ... --policy.type=smolvla \
|
||||
--policy.freeze_vision_encoder=false \
|
||||
--policy.train_expert_only=false
|
||||
```
|
||||
|
||||
### 7.7 Signals to stop / keep going
|
||||
|
||||
- Train loss plateaus → stop, save a Hub checkpoint.
|
||||
- Train loss still dropping and you're under 10 epochs → keep going.
|
||||
|
||||
---
|
||||
|
||||
## 8. Evaluation & benchmarks
|
||||
|
||||
Two flavors of evaluation:
|
||||
|
||||
### 8.1 Real-robot eval (SO-101, etc.)
|
||||
|
||||
Reuse `lerobot-record` with `--policy.path` to run the trained policy on-robot and save the run as an eval dataset. Convention: prefix the dataset with `eval_`.
|
||||
|
||||
```bash
|
||||
lerobot-record \
|
||||
--robot.type=so101_follower --robot.port=<FOLLOWER_PORT> --robot.id=my_follower \
|
||||
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
|
||||
--dataset.repo_id=${HF_USER}/eval_my_task \
|
||||
--dataset.single_task="<same task description used during training>" \
|
||||
--dataset.num_episodes=10 \
|
||||
--policy.path=${HF_USER}/act_my_task
|
||||
```
|
||||
|
||||
Report success rate across episodes. Compare to a teleoperated baseline and to an earlier checkpoint to catch regressions.
|
||||
|
||||
### 8.2 Sim-benchmark eval
|
||||
|
||||
For policies trained on sim datasets (PushT, Aloha, LIBERO, MetaWorld, RoboCasa, …) use `lerobot-eval` against the matching `env.type`:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=${HF_USER}/diffusion_pusht \
|
||||
--env.type=pusht \
|
||||
--eval.n_episodes=50 \
|
||||
--eval.batch_size=10 \
|
||||
--policy.device=cuda
|
||||
```
|
||||
|
||||
- Use `--policy.path=outputs/train/.../checkpoints/<step>/pretrained_model` for local checkpoints.
|
||||
- `--eval.n_episodes` should be ≥ 50 for a stable success-rate estimate.
|
||||
- Available envs live in `src/lerobot/envs/`. See [`docs/source/libero.mdx`](./docs/source/libero.mdx), [`metaworld.mdx`](./docs/source/metaworld.mdx), [`robocasa.mdx`](./docs/source/robocasa.mdx), [`vlabench.mdx`](./docs/source/vlabench.mdx) for specific benchmarks.
|
||||
- To add a new benchmark, see [`docs/source/adding_benchmarks.mdx`](./docs/source/adding_benchmarks.mdx) and [`envhub.mdx`](./docs/source/envhub.mdx).
|
||||
|
||||
### 8.2b Dockerfiles for benchmark eval
|
||||
|
||||
Benchmark envs have native dependencies that are painful to install locally. The repo ships **pre-baked Dockerfiles** for each supported benchmark — use these to run `lerobot-eval` in a reproducible environment:
|
||||
|
||||
| Benchmark | Dockerfile |
|
||||
| ----------- | -------------------------------------------------------------------------------------- |
|
||||
| LIBERO | [`docker/Dockerfile.benchmark.libero`](./docker/Dockerfile.benchmark.libero) |
|
||||
| LIBERO+ | [`docker/Dockerfile.benchmark.libero_plus`](./docker/Dockerfile.benchmark.libero_plus) |
|
||||
| MetaWorld | [`docker/Dockerfile.benchmark.metaworld`](./docker/Dockerfile.benchmark.metaworld) |
|
||||
| RoboCasa | [`docker/Dockerfile.benchmark.robocasa`](./docker/Dockerfile.benchmark.robocasa) |
|
||||
| RoboCerebra | [`docker/Dockerfile.benchmark.robocerebra`](./docker/Dockerfile.benchmark.robocerebra) |
|
||||
| RoboMME | [`docker/Dockerfile.benchmark.robomme`](./docker/Dockerfile.benchmark.robomme) |
|
||||
| RoboTwin | [`docker/Dockerfile.benchmark.robotwin`](./docker/Dockerfile.benchmark.robotwin) |
|
||||
| VLABench | [`docker/Dockerfile.benchmark.vlabench`](./docker/Dockerfile.benchmark.vlabench) |
|
||||
|
||||
Build and run (adapt to your benchmark):
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-bench-robomme .
|
||||
docker run --gpus all --rm -it \
|
||||
-v $HOME/.cache/huggingface:/root/.cache/huggingface \
|
||||
lerobot-bench-robomme \
|
||||
lerobot-eval --policy.path=<your_policy> --env.type=<env> --eval.n_episodes=50
|
||||
```
|
||||
|
||||
See [`docker/README.md`](./docker/README.md) for base-image details.
|
||||
|
||||
### 8.3 Target success rates
|
||||
|
||||
Single-task grasp-and-place with 50 clean episodes: ACT should reach **> 70% success** on the training configuration. Less → data problem (see §5), not model problem. Expect a drop when generalizing to new positions — scale episodes or diversity to recover.
|
||||
|
||||
---
|
||||
|
||||
## 9. Further reading & resources
|
||||
|
||||
- **Getting started:** [`installation.mdx`](./docs/source/installation.mdx) · [`il_robots.mdx`](./docs/source/il_robots.mdx) · [What makes a good dataset](https://huggingface.co/blog/lerobot-datasets)
|
||||
- **Per-policy docs:** browse [`docs/source/*.mdx`](./docs/source/) (policies, hardware, benchmarks, advanced training).
|
||||
- **Community:** [Discord](https://discord.com/invite/s3KuuzsPFb) · [Hub `LeRobot` tag](https://huggingface.co/datasets?other=LeRobot) · [Dataset visualizer](https://huggingface.co/spaces/lerobot/visualize_dataset)
|
||||
|
||||
> Keep this file current. If you learn a rule that would prevent a class of user mistakes, add it here and in [`AGENTS.md`](./AGENTS.md).
|
||||
@@ -1,4 +1,3 @@
|
||||
include src/lerobot/templates/lerobot_modelcard_template.md
|
||||
include src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
|
||||
include src/lerobot/datasets/card_template.md
|
||||
include src/lerobot/envs/metaworld_config.json
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for LIBERO-plus integration tests.
|
||||
# Extends the nightly GPU image (which has lerobot[all]) with the LIBERO-plus
|
||||
# fork source + its 6.4 GB perturbation assets.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.libero_plus -t lerobot-benchmark-libero-plus .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-libero-plus lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
ENV MUJOCO_GL=egl
|
||||
|
||||
# unzip for the 6.4 GB assets.zip; the rest are LIBERO-plus build-time extras
|
||||
# (wand / ImageMagick / fontconfig) not in the nightly base.
|
||||
USER root
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
USER user_lerobot
|
||||
|
||||
# robosuite==1.4.1 is mandatory (the fork uses `single_arm_env` removed in
|
||||
# v1.5+). The rest are LIBERO-plus runtime deps pulled from its setup.py.
|
||||
# We install these explicitly instead of via the [libero_plus] extra because
|
||||
# the extra's `libero @ git+...` dep installs as a namespace package and then
|
||||
# clone and PYTHONPATH-override it below.
|
||||
RUN uv pip install --no-cache \
|
||||
"robosuite==1.4.1" \
|
||||
"bddl==1.0.1" \
|
||||
"easydict==1.13" \
|
||||
"mujoco==3.7.0" \
|
||||
"matplotlib==3.10.8" \
|
||||
"Wand==0.6.13" \
|
||||
"scikit-image==0.25.2" \
|
||||
"gym==0.26.2"
|
||||
|
||||
# Clone LIBERO-plus and make it importable as `libero`. The nightly base has
|
||||
# hf-libero (10 tasks) preinstalled via lerobot[libero]; uninstall it so
|
||||
# Python resolves `import libero` to the 2402-task LIBERO-plus module instead.
|
||||
# Pinned to the current upstream main SHA so benchmark builds stay reproducible.
|
||||
ARG LIBERO_PLUS_SHA=4976dc3
|
||||
ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
|
||||
RUN git clone https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
|
||||
&& git -C /home/user_lerobot/libero-plus checkout ${LIBERO_PLUS_SHA} \
|
||||
&& cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
|
||||
&& (uv pip uninstall hf-libero 2>/dev/null || true)
|
||||
ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"
|
||||
|
||||
# Perturbation textures/scenes: bddl_base_domain.py resolves XMLs via
|
||||
# DIR_PATH/../assets (package-relative, ignoring ~/.libero/config.yaml). All
|
||||
# 2402 tasks reference files that ship only in Sylvest/LIBERO-plus's
|
||||
# assets.zip (6.4 GB) under a deep author-internal prefix — extract and
|
||||
# flatten it under ${LIBERO_PLUS_ROOT}/assets.
|
||||
RUN python -c "\
|
||||
from huggingface_hub import hf_hub_download; \
|
||||
hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
|
||||
filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
|
||||
&& unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
|
||||
&& ASSETS_DIR=$(find /tmp/libero-plus-dl/extract -type d -name assets | head -1) \
|
||||
&& mv "${ASSETS_DIR}" ${LIBERO_PLUS_ROOT}/assets \
|
||||
&& rm -rf /tmp/libero-plus-dl
|
||||
|
||||
# Point ~/.libero/config.yaml at the clone so LIBERO-plus's imports are
|
||||
# non-interactive (it calls input() when the config is missing).
|
||||
RUN mkdir -p /home/user_lerobot/.libero \
|
||||
&& printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
|
||||
> /home/user_lerobot/.libero/config.yaml
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,71 +0,0 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboCasa365 integration tests.
|
||||
# Extends the nightly GPU image (which already has all extras installed)
|
||||
# with the PR's source code and RoboCasa-specific asset setup.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# Install robocasa + robosuite as editable clones. pip-installing from git
|
||||
# omits data files like robocasa/models/assets/box_links/box_links_assets.json
|
||||
# (not declared in package_data), which download_kitchen_assets needs at import.
|
||||
#
|
||||
# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3`
|
||||
# in install_requires, which would shadow the editable lerobot baked into
|
||||
# this image. We install robocasa's actual runtime deps explicitly instead.
|
||||
# Pinned SHAs for reproducible benchmark runs. Bump when you need an
|
||||
# upstream fix; don't rely on `main`/`master` drift.
|
||||
ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681
|
||||
ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab
|
||||
RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \
|
||||
git -C ~/robocasa checkout ${ROBOCASA_SHA} && \
|
||||
git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \
|
||||
git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \
|
||||
uv pip install --no-cache -e ~/robocasa --no-deps && \
|
||||
uv pip install --no-cache -e ~/robosuite && \
|
||||
uv pip install --no-cache \
|
||||
"numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \
|
||||
"pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \
|
||||
"pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \
|
||||
"imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \
|
||||
"tianshou==0.4.10" "gymnasium==1.2.3"
|
||||
|
||||
# Set up robocasa macros and download kitchen assets. We need:
|
||||
# - tex : base environment textures
|
||||
# - tex_generative : AI-generated textures; kitchen fixture XMLs embed
|
||||
# refs to generative_textures/wall/tex*.png
|
||||
# unconditionally, so MjModel.from_xml_string fails
|
||||
# at reset time without them (even if the env is
|
||||
# constructed with generative_textures=None).
|
||||
# - fixtures_lw : lightwheel kitchen fixtures (fridge, counters...)
|
||||
# - objs_lw : lightwheel object meshes (stools, misc props)
|
||||
# We skip the objaverse/aigen object packs (~30GB combined) by pairing
|
||||
# this with --env.obj_registries=["lightwheel"] on the lerobot side.
|
||||
# The download script prompts interactively, so pipe 'y' to auto-accept.
|
||||
RUN python -m robocasa.scripts.setup_macros && \
|
||||
yes y | python -m robocasa.scripts.download_kitchen_assets \
|
||||
--type tex tex_generative fixtures_lw objs_lw
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
# Re-install lerobot editably so the new source (with RoboCasaEnv registration)
|
||||
# replaces the stale package baked into the nightly image.
|
||||
RUN uv pip install --no-cache --no-deps -e .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,43 +0,0 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboCerebra integration tests.
|
||||
# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different
|
||||
# rename_map, so this image is identical to the LIBERO benchmark image —
|
||||
# extends the nightly GPU base with LIBERO assets + the PR's source code.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
|
||||
# runtime (which times out on CI). Point the libero config at the cached path.
|
||||
# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
|
||||
# so we write the config before any libero import can happen.
|
||||
RUN LIBERO_DIR=$(python -c \
|
||||
"import importlib.util, os; s=importlib.util.find_spec('libero'); \
|
||||
print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
|
||||
mkdir -p /home/user_lerobot/.libero && \
|
||||
python -c "\
|
||||
from huggingface_hub import snapshot_download; \
|
||||
snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
|
||||
local_dir='/home/user_lerobot/.libero/assets')" && \
|
||||
printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
|
||||
> /home/user_lerobot/.libero/config.yaml
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,56 +0,0 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboMME integration tests.
|
||||
# Extends the nightly GPU image (which has lerobot[all]) with Vulkan system
|
||||
# libs for ManiSkill/SAPIEN and the robomme extra. robomme isn't in [all]
|
||||
# because mani-skill hard-pins gymnasium==0.29.1 and numpy<2.0.0 which
|
||||
# conflict with lerobot's defaults; both are safe at runtime:
|
||||
# - gymnasium 0.29.x has the same 5-tuple step() API as 1.x (since 0.26)
|
||||
# - numpy 1.26.4 is API-compatible with lerobot's actual usage.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-benchmark-robomme .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robomme lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# NVIDIA Container Toolkit: expose Vulkan driver capability for headless rendering.
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES=all \
|
||||
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
|
||||
|
||||
# ManiSkill/SAPIEN's renderer needs Vulkan, which isn't in the base image.
|
||||
USER root
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
libvulkan1 libvulkan-dev mesa-vulkan-drivers \
|
||||
&& mkdir -p /usr/share/vulkan/icd.d \
|
||||
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
|
||||
> /usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
USER user_lerobot
|
||||
|
||||
# Install smolvla + av-dep via the PR's pyproject, then layer robomme on top
|
||||
# with gymnasium/numpy overrides. robomme isn't a pyproject extra because its
|
||||
# mani-skill pin conflicts with lerobot's base numpy>=2 (see pyproject.toml).
|
||||
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
|
||||
RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
|
||||
&& uv pip install --no-cache --override /tmp/robomme_override.txt \
|
||||
-e ".[smolvla,av-dep]" \
|
||||
"robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \
|
||||
&& python -c "import robomme; print('robomme import OK')"
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,138 +0,0 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboTwin 2.0 integration tests.
|
||||
# Extends the nightly GPU image with the RoboTwin simulator stack:
|
||||
# sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip
|
||||
# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval).
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robotwin \
|
||||
# lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES=all \
|
||||
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
ROBOTWIN_ROOT=/opt/robotwin
|
||||
|
||||
# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
|
||||
# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
|
||||
USER root
|
||||
# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
|
||||
# an upstream fix; don't rely on `main` drift.
|
||||
ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
cuda-nvcc-12-6 cuda-cudart-dev-12-6 \
|
||||
libvulkan1 vulkan-tools \
|
||||
&& mkdir -p /usr/share/vulkan/icd.d \
|
||||
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
|
||||
> /usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
&& git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
|
||||
&& git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
|
||||
&& chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
USER user_lerobot
|
||||
|
||||
# RoboTwin runtime deps (av is already in the base via [av-dep]).
|
||||
RUN uv pip install --no-cache \
|
||||
"sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \
|
||||
"open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py
|
||||
|
||||
# pytorch3d has no universal wheel; must be built from source (~10 min, cached).
|
||||
RUN uv pip install --no-cache --no-build-isolation \
|
||||
"git+https://github.com/facebookresearch/pytorch3d.git@stable"
|
||||
|
||||
# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
|
||||
# build aborts on an empty arch list. RoboTwin's own installer pins v0.7.8,
|
||||
# which still exposes the v1 API (`curobo.types.math`) that RoboTwin imports.
|
||||
ARG CUROBO_REF=v0.7.8
|
||||
RUN cd ${ROBOTWIN_ROOT}/envs \
|
||||
&& git clone --branch ${CUROBO_REF} --depth 1 https://github.com/NVlabs/curobo.git \
|
||||
&& cd curobo \
|
||||
&& TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
|
||||
uv pip install -e . --no-build-isolation --no-cache
|
||||
|
||||
# Upstream patches (mirror RoboTwin's script/_install.sh).
|
||||
# These patches target the exact versions pinned above; re-check when upgrading.
|
||||
# mplib==0.2.1: drop a broken `or collide` clause in planner.py.
|
||||
# Safe to remove once mplib > 0.2.1 ships with the fix upstream.
|
||||
# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check.
|
||||
# Safe to remove once sapien > 3.0.0b1 ships with the fix upstream.
|
||||
RUN python - <<'EOF'
|
||||
import pathlib, re, site
|
||||
for d in site.getsitepackages():
|
||||
p = pathlib.Path(d) / "mplib" / "planner.py"
|
||||
if p.exists():
|
||||
p.write_text(re.sub(r"\bor collide\b", "", p.read_text(), count=1))
|
||||
print(f"mplib patch applied: {p}")
|
||||
p = pathlib.Path(d) / "sapien" / "wrapper" / "urdf_loader.py"
|
||||
if p.exists():
|
||||
src = p.read_text().replace(
|
||||
"with open(srdf_path) as f:", 'with open(srdf_path, encoding="utf-8") as f:'
|
||||
).replace('"srdf"', '".srdf"')
|
||||
p.write_text(src)
|
||||
print(f"sapien patch applied: {p}")
|
||||
EOF
|
||||
|
||||
# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) +
|
||||
# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped.
|
||||
# The dataset is public — no auth token needed.
|
||||
RUN python - <<'EOF'
|
||||
import os, pathlib, zipfile
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
assets_dir = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
|
||||
assets_dir.mkdir(parents=True, exist_ok=True)
|
||||
for fname in ("embodiments.zip", "objects.zip"):
|
||||
local = hf_hub_download(
|
||||
repo_id="TianxingChen/RoboTwin2.0",
|
||||
repo_type="dataset",
|
||||
filename=fname,
|
||||
local_dir=str(assets_dir),
|
||||
)
|
||||
with zipfile.ZipFile(local, "r") as z:
|
||||
z.extractall(str(assets_dir))
|
||||
pathlib.Path(local).unlink()
|
||||
EOF
|
||||
|
||||
WORKDIR ${ROBOTWIN_ROOT}
|
||||
RUN python script/update_embodiment_config_path.py
|
||||
|
||||
ENV PYTHONPATH="${ROBOTWIN_ROOT}"
|
||||
|
||||
# Fail the image build early if the CuRobo package layout regresses. Importing
|
||||
# RoboTwin's planner here is too eager because CuRobo constructs CUDA-backed
|
||||
# defaults at import time, while Docker builds don't have access to an NVIDIA
|
||||
# driver.
|
||||
RUN python - <<'EOF'
|
||||
from pathlib import Path
|
||||
|
||||
from curobo.types.math import Pose
|
||||
|
||||
planner_src = (Path("/opt/robotwin/envs/robot/planner.py")).read_text()
|
||||
assert "from curobo.types.math import Pose as CuroboPose" in planner_src
|
||||
|
||||
print("CuRobo import OK:", Pose.__name__)
|
||||
print("RoboTwin planner import references curobo.types.math")
|
||||
EOF
|
||||
|
||||
# Return to the lerobot source directory (set by base image) before overlaying.
|
||||
WORKDIR /lerobot
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,99 +0,0 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for VLABench integration tests.
|
||||
# Extends the nightly GPU image with the PR's source code and VLABench setup.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.vlabench -t lerobot-benchmark-vlabench .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-vlabench lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# Install VLABench from GitHub (not on PyPI) and pin MuJoCo/dm-control.
|
||||
# Shallow-clone without submodule recursion (nested SSH-only submodules fail in CI).
|
||||
# Editable install (-e) because VLABench/utils/ has no __init__.py, so
|
||||
# find_packages() omits it from wheels; editable mode uses the source tree directly.
|
||||
# rrt-algorithms has the same packaging issue (rrt/ dir missing __init__.py).
|
||||
# Patch: constant.py calls os.listdir on ~100 asset/obj/meshes/* dirs at import
|
||||
# time. Guard the call so missing dirs return [] instead of crashing (in case
|
||||
# the asset download is partial).
|
||||
#
|
||||
# Pinned upstream SHAs for reproducible benchmark runs. Bump when you need
|
||||
# an upstream fix; don't rely on `main`/`develop` drift.
|
||||
ARG VLABENCH_SHA=cf588fe60c0c7282174fe979f5913170cfe69017
|
||||
ARG RRT_ALGORITHMS_SHA=e51d95ee489a225220d6ae2a764c4111f6ba7d85
|
||||
RUN git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench && \
|
||||
git -C ~/VLABench checkout ${VLABENCH_SHA} && \
|
||||
git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms && \
|
||||
git -C ~/rrt-algorithms checkout ${RRT_ALGORITHMS_SHA} && \
|
||||
python3 -c "\
|
||||
import pathlib; \
|
||||
p = pathlib.Path.home() / 'VLABench/VLABench/configs/constant.py'; \
|
||||
t = p.read_text(); \
|
||||
p.write_text(t.replace( \
|
||||
'subdirs = os.listdir(xml_dir)', \
|
||||
'if not os.path.isdir(xml_dir): return []\n subdirs = os.listdir(xml_dir)'))" && \
|
||||
uv pip install --no-cache -e ~/VLABench -e ~/rrt-algorithms \
|
||||
mujoco==3.2.2 dm-control==1.0.22 \
|
||||
open3d colorlog scikit-learn openai gdown
|
||||
|
||||
# Download VLABench mesh assets. Task configs reference object meshes
|
||||
# (obj/meshes/fruit/, containers/basket/, tablewares/plates/, etc.); without
|
||||
# them the task builder picks from an empty mesh list and crashes with
|
||||
# IndexError at task-build time (random.choice([]) in config_manager.py).
|
||||
#
|
||||
# Preferred source: an HF Hub mirror. Set VLABENCH_ASSETS_REPO at build time
|
||||
# (e.g. --build-arg VLABENCH_ASSETS_REPO=lerobot/vlabench-assets) and we'll
|
||||
# snapshot_download the repo into VLABench's assets dir. This is the reliable
|
||||
# path for CI — Google Drive frequently returns HTTP 429 ("Too many users have
|
||||
# viewed or downloaded this file recently") on shared academic files.
|
||||
#
|
||||
# After download we *validate* that at least one XML exists under each
|
||||
# task-critical subtree and fail the build loudly if not. Silent-empty asset
|
||||
# dirs are the #1 cause of VLABench runtime crashes in CI, so we surface them
|
||||
# here rather than after a 10-minute eval build.
|
||||
#
|
||||
# Fallback: VLABench's own gdown-based script. Best-effort only.
|
||||
ARG VLABENCH_ASSETS_REPO=""
|
||||
RUN ASSETS_DIR="$HOME/VLABench/VLABench/assets" && \
|
||||
if [ -n "${VLABENCH_ASSETS_REPO}" ]; then \
|
||||
echo "Downloading VLABench assets from HF Hub: ${VLABENCH_ASSETS_REPO}" && \
|
||||
uv pip install --no-cache "huggingface_hub[hf_xet]>=0.26" && \
|
||||
python -c "from huggingface_hub import snapshot_download; \
|
||||
p = snapshot_download(repo_id='${VLABENCH_ASSETS_REPO}', repo_type='dataset', \
|
||||
local_dir='${ASSETS_DIR}', allow_patterns=['obj/**', 'scenes/**']); \
|
||||
print('snapshot_download returned:', p)"; \
|
||||
else \
|
||||
echo "No VLABENCH_ASSETS_REPO set — falling back to gdown" && \
|
||||
python ~/VLABench/scripts/download_assets.py --choice all; \
|
||||
fi && \
|
||||
python -c "\
|
||||
from pathlib import Path; \
|
||||
import sys; \
|
||||
root = Path('${ASSETS_DIR}'); \
|
||||
checks = ['obj/meshes/tablewares/plates', 'obj/meshes/containers/basket', 'obj/meshes/fruit', 'obj/meshes/containers/tray']; \
|
||||
failed = []; \
|
||||
print(f'Validating VLABench assets under {root}'); \
|
||||
[print(f' {c}: {len(list((root/c).rglob(\"*.xml\")))} XMLs') for c in checks]; \
|
||||
[failed.append(c) for c in checks if not any((root/c).rglob('*.xml'))]; \
|
||||
sys.exit(f'Empty asset dirs (no *.xml): {failed}') if failed else print('All asset dirs populated.')"
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
# Re-install lerobot editably so the new source (with VLABenchEnv registration
|
||||
# and updated obs handling) replaces the stale package baked into the nightly image.
|
||||
RUN uv pip install --no-cache --no-deps -e .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -18,8 +18,9 @@
|
||||
# docker build -f docker/Dockerfile.internal -t lerobot-internal .
|
||||
|
||||
# Configure the base image for CI with GPU access
|
||||
ARG CUDA_VERSION=12.6.3
|
||||
ARG OS_VERSION=24.04
|
||||
# TODO(Steven): Bump these versions
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
ARG OS_VERSION=22.04
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
|
||||
|
||||
# Define Python version argument
|
||||
@@ -35,13 +36,16 @@ ENV DEBIAN_FRONTEND=noninteractive \
|
||||
|
||||
# Install Python, system dependencies, and uv (as root)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential git curl \
|
||||
libglib2.0-0 libgl1 libegl1 ffmpeg \
|
||||
software-properties-common build-essential git curl \
|
||||
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
|
||||
libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
|
||||
cmake pkg-config ninja-build \
|
||||
python${PYTHON_VERSION} \
|
||||
python${PYTHON_VERSION}-venv \
|
||||
python${PYTHON_VERSION}-dev \
|
||||
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
python${PYTHON_VERSION} \
|
||||
python${PYTHON_VERSION}-venv \
|
||||
python${PYTHON_VERSION}-dev \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||
&& mv /root/.local/bin/uv /usr/local/bin/uv \
|
||||
&& useradd --create-home --shell /bin/bash user_lerobot \
|
||||
|
||||
@@ -31,10 +31,8 @@
|
||||
title: Porting Large Datasets
|
||||
- local: using_dataset_tools
|
||||
title: Using the Dataset Tools
|
||||
- local: language_and_recipes
|
||||
title: Language Columns and Recipes
|
||||
- local: tools
|
||||
title: Tools
|
||||
- local: dataset_subtask
|
||||
title: Using Subtasks in the Dataset
|
||||
- local: streaming_video_encoding
|
||||
title: Streaming Video Encoding
|
||||
title: "Datasets"
|
||||
@@ -49,8 +47,6 @@
|
||||
title: π₀-FAST (Pi0Fast)
|
||||
- local: pi05
|
||||
title: π₀.₅ (Pi05)
|
||||
- local: eo1
|
||||
title: EO-1
|
||||
- local: groot
|
||||
title: NVIDIA GR00T N1.5
|
||||
- local: xvla
|
||||
@@ -83,22 +79,10 @@
|
||||
title: Adding a New Benchmark
|
||||
- local: libero
|
||||
title: LIBERO
|
||||
- local: libero_plus
|
||||
title: LIBERO-plus
|
||||
- local: metaworld
|
||||
title: Meta-World
|
||||
- local: robotwin
|
||||
title: RoboTwin 2.0
|
||||
- local: robocasa
|
||||
title: RoboCasa365
|
||||
- local: robocerebra
|
||||
title: RoboCerebra
|
||||
- local: robomme
|
||||
title: RoboMME
|
||||
- local: envhub_isaaclab_arena
|
||||
title: NVIDIA IsaacLab Arena Environments
|
||||
- local: vlabench
|
||||
title: VLABench
|
||||
title: "Benchmarks"
|
||||
- sections:
|
||||
- local: introduction_processors
|
||||
|
||||
@@ -0,0 +1,277 @@
|
||||
# Using Subtasks in LeRobot Datasets
|
||||
|
||||
Subtask support in robotics datasets has proven effective in improving robot reasoning and understanding. Subtasks are particularly useful for:
|
||||
|
||||
- **Hierarchical policies**: Building policies that include subtask predictions to visualize robot reasoning in real time
|
||||
- **Reward modeling**: Helping reward models understand task progression (e.g., SARM-style stage-aware reward models)
|
||||
- **Task decomposition**: Breaking down complex manipulation tasks into atomic, interpretable steps
|
||||
|
||||
LeRobotDataset now supports subtasks as part of its dataset structure, alongside tasks.
|
||||
|
||||
## What are Subtasks?
|
||||
|
||||
While a **task** describes the overall goal (e.g., "Pick up the apple and place it in the basket"), **subtasks** break down the execution into finer-grained steps:
|
||||
|
||||
1. "Approach the apple"
|
||||
2. "Grasp the apple"
|
||||
3. "Lift the apple"
|
||||
4. "Move to basket"
|
||||
5. "Release the apple"
|
||||
|
||||
Each frame in the dataset can be annotated with its corresponding subtask, enabling models to learn and predict these intermediate stages.
|
||||
|
||||
<img
|
||||
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/subtask-asset.png"
|
||||
alt="An overview of subtask annotation showing how frames are labeled with intermediate subtask stages"
|
||||
width="80%"
|
||||
/>
|
||||
|
||||
<p>
|
||||
<em>Figure: Overview of subtask annotation.</em>
|
||||
</p>
|
||||
|
||||
**Reference:** _Subtask-learning based for robot self-assembly in flexible collaborative assembly in manufacturing_, Original Article, Published: 19 April 2022.
|
||||
|
||||
## Dataset Structure
|
||||
|
||||
Subtask information is stored in the dataset metadata:
|
||||
|
||||
```
|
||||
my-dataset/
|
||||
├── data/
|
||||
│ └── ...
|
||||
├── meta/
|
||||
│ ├── info.json
|
||||
│ ├── stats.json
|
||||
│ ├── tasks.parquet
|
||||
│ ├── subtasks.parquet # Subtask index → subtask string mapping
|
||||
│ └── episodes/
|
||||
│ └── ...
|
||||
└── videos/
|
||||
└── ...
|
||||
```
|
||||
|
||||
### Subtasks Parquet File
|
||||
|
||||
The `meta/subtasks.parquet` file maps subtask indices to their natural language descriptions:
|
||||
|
||||
| subtask_index | subtask (index column) |
|
||||
| ------------- | ---------------------- |
|
||||
| 0 | "Approach the apple" |
|
||||
| 1 | "Grasp the apple" |
|
||||
| 2 | "Lift the apple" |
|
||||
| ... | ... |
|
||||
|
||||
### Frame-Level Annotations
|
||||
|
||||
Each frame in the dataset can include a `subtask_index` field that references the subtasks parquet file:
|
||||
|
||||
```python
|
||||
# Example frame data in the parquet file
|
||||
{
|
||||
"index": 42,
|
||||
"timestamp": 1.4,
|
||||
"episode_index": 0,
|
||||
"task_index": 0,
|
||||
"subtask_index": 2, # References "Lift the apple"
|
||||
"observation.state": [...],
|
||||
"action": [...],
|
||||
}
|
||||
```
|
||||
|
||||
## Annotating Datasets with Subtasks
|
||||
|
||||
We provide a HuggingFace Space for easily annotating any LeRobotDataset with subtasks:
|
||||
|
||||
**[https://huggingface.co/spaces/lerobot/annotate](https://huggingface.co/spaces/lerobot/annotate)**
|
||||
|
||||
After completing your annotation:
|
||||
|
||||
1. Click "Push to Hub" to upload your annotated dataset
|
||||
2. You can also run the annotation space locally by following the instructions at [github.com/huggingface/lerobot-annotate](https://github.com/huggingface/lerobot-annotate)
|
||||
|
||||
## Loading Datasets with Subtasks
|
||||
|
||||
When you load a dataset with subtask annotations, the subtask information is automatically available:
|
||||
|
||||
```python
|
||||
from lerobot.datasets import LeRobotDataset
|
||||
|
||||
# Load a dataset with subtask annotations
|
||||
dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
|
||||
|
||||
# Access a sample
|
||||
sample = dataset[100]
|
||||
|
||||
# The sample includes both task and subtask information
|
||||
print(sample["task"]) # "Collect the fruit"
|
||||
print(sample["subtask"]) # "Grasp the apple"
|
||||
print(sample["task_index"]) # tensor(0)
|
||||
print(sample["subtask_index"]) # tensor(2)
|
||||
```
|
||||
|
||||
### Checking for Subtask Support
|
||||
|
||||
You can check if a dataset has subtask annotations:
|
||||
|
||||
```python
|
||||
# Check if subtasks are available
|
||||
has_subtasks = (
|
||||
"subtask_index" in dataset.features
|
||||
and dataset.meta.subtasks is not None
|
||||
)
|
||||
|
||||
if has_subtasks:
|
||||
print(f"Dataset has {len(dataset.meta.subtasks)} unique subtasks")
|
||||
print("Subtasks:", list(dataset.meta.subtasks.index))
|
||||
```
|
||||
|
||||
## Using Subtasks for Training
|
||||
|
||||
### With the Tokenizer Processor
|
||||
|
||||
The `TokenizerProcessor` automatically handles subtask tokenization for Vision-Language Action (VLA) models:
|
||||
|
||||
```python
|
||||
from lerobot.processor import TokenizerProcessorStep
|
||||
|
||||
# Create a tokenizer processor step
|
||||
tokenizer_processor = TokenizerProcessorStep(
|
||||
tokenizer_name_or_path="google/paligemma-3b-pt-224",
|
||||
padding="max_length",
|
||||
max_length=64,
|
||||
)
|
||||
|
||||
# The processor will automatically tokenize subtasks if present in the batch
|
||||
# and add them to the observation under:
|
||||
# - "observation.subtask.tokens"
|
||||
# - "observation.subtask.attention_mask"
|
||||
```
|
||||
|
||||
When subtasks are available in the batch, the tokenizer processor adds:
|
||||
|
||||
- `observation.subtask.tokens`: Tokenized subtask text
|
||||
- `observation.subtask.attention_mask`: Attention mask for the subtask tokens
|
||||
|
||||
### DataLoader with Subtasks
|
||||
|
||||
```python
|
||||
import torch
|
||||
from lerobot.datasets import LeRobotDataset
|
||||
|
||||
dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
|
||||
|
||||
dataloader = torch.utils.data.DataLoader(
|
||||
dataset,
|
||||
batch_size=16,
|
||||
shuffle=True,
|
||||
)
|
||||
|
||||
for batch in dataloader:
|
||||
# Access subtask information in the batch
|
||||
subtasks = batch["subtask"] # List of subtask strings
|
||||
subtask_indices = batch["subtask_index"] # Tensor of subtask indices
|
||||
|
||||
# Use for training hierarchical policies or reward models
|
||||
print(f"Batch subtasks: {set(subtasks)}")
|
||||
```
|
||||
|
||||
## Example Datasets with Subtask Annotations
|
||||
|
||||
Try loading a dataset with subtask annotations:
|
||||
|
||||
```python
|
||||
from lerobot.datasets import LeRobotDataset
|
||||
|
||||
# Example dataset with subtask annotations
|
||||
dataset = LeRobotDataset("jadechoghari/collect-fruit-annotated")
|
||||
|
||||
# Explore the subtasks
|
||||
print("Available subtasks:")
|
||||
for subtask_name in dataset.meta.subtasks.index:
|
||||
print(f" - {subtask_name}")
|
||||
|
||||
# Get subtask distribution
|
||||
subtask_counts = {}
|
||||
for i in range(len(dataset)):
|
||||
sample = dataset[i]
|
||||
subtask = sample["subtask"]
|
||||
subtask_counts[subtask] = subtask_counts.get(subtask, 0) + 1
|
||||
|
||||
print("\nSubtask distribution:")
|
||||
for subtask, count in sorted(subtask_counts.items(), key=lambda x: -x[1]):
|
||||
print(f" {subtask}: {count} frames")
|
||||
```
|
||||
|
||||
## Use Cases
|
||||
|
||||
### 1. Hierarchical Policy Training
|
||||
|
||||
Train policies that predict both actions and current subtask:
|
||||
|
||||
```python
|
||||
class HierarchicalPolicy(nn.Module):
|
||||
def __init__(self, num_subtasks):
|
||||
super().__init__()
|
||||
self.action_head = nn.Linear(hidden_dim, action_dim)
|
||||
self.subtask_head = nn.Linear(hidden_dim, num_subtasks)
|
||||
|
||||
def forward(self, observations):
|
||||
features = self.encoder(observations)
|
||||
actions = self.action_head(features)
|
||||
subtask_logits = self.subtask_head(features)
|
||||
return actions, subtask_logits
|
||||
```
|
||||
|
||||
### 2. Stage-Aware Reward Modeling (SARM)
|
||||
|
||||
Build reward models that understand task progression:
|
||||
|
||||
```python
|
||||
# SARM predicts:
|
||||
# - Stage: Which subtask is being executed (discrete)
|
||||
# - Progress: How far along the subtask (continuous 0-1)
|
||||
|
||||
class SARMRewardModel(nn.Module):
|
||||
def forward(self, observations):
|
||||
features = self.encoder(observations)
|
||||
stage_logits = self.stage_classifier(features)
|
||||
progress = self.progress_regressor(features)
|
||||
return stage_logits, progress
|
||||
```
|
||||
|
||||
### 3. Progress Visualization
|
||||
|
||||
Monitor robot execution by tracking subtask progression:
|
||||
|
||||
```python
|
||||
def visualize_execution(model, observations):
|
||||
for t, obs in enumerate(observations):
|
||||
action, subtask_logits = model(obs)
|
||||
predicted_subtask = subtask_names[subtask_logits.argmax()]
|
||||
print(f"t={t}: Executing '{predicted_subtask}'")
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
### LeRobotDataset Properties
|
||||
|
||||
| Property | Type | Description |
|
||||
| --------------------------- | ---------------------- | ------------------------------------------ |
|
||||
| `meta.subtasks` | `pd.DataFrame \| None` | DataFrame mapping subtask names to indices |
|
||||
| `features["subtask_index"]` | `dict` | Feature spec for subtask_index if present |
|
||||
|
||||
### Sample Keys
|
||||
|
||||
When subtasks are available, each sample includes:
|
||||
|
||||
| Key | Type | Description |
|
||||
| --------------- | -------------- | ------------------------------------ |
|
||||
| `subtask_index` | `torch.Tensor` | Integer index of the current subtask |
|
||||
| `subtask` | `str` | Natural language subtask description |
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [SARM Paper](https://arxiv.org/pdf/2509.25358) - Stage-Aware Reward Modeling for Long Horizon Robot Manipulation
|
||||
- [LeRobot Annotate Space](https://huggingface.co/spaces/lerobot/annotate) - Interactive annotation tool
|
||||
- [LeRobotDataset v3.0](./lerobot-dataset-v3) - Dataset format documentation
|
||||
@@ -1,168 +0,0 @@
|
||||
# EO-1
|
||||
|
||||
EO-1 is a **Vision-Language-Action policy for robot control**. The LeRobot implementation integrates EO-1 with the standard LeRobot training, evaluation, processor interface.
|
||||
|
||||
## Model Overview
|
||||
|
||||
EO-1 uses a Qwen2.5-VL backbone for vision-language understanding and adds a continuous flow-matching action head for robot control. The policy formats each robot-control sample as a multimodal conversation: camera images are passed to Qwen2.5-VL, the robot state is represented with EO-1 state tokens, and the future action chunk is represented with EO-1 action tokens.
|
||||
|
||||
<img
|
||||
src="https://huggingface.co/datasets/HaomingSong/lerobot-documentation-images/resolve/main/lerobot/eo_pipeline.png"
|
||||
alt="An overview of EO-1"
|
||||
width="85%"
|
||||
/>
|
||||
|
||||
During training, EO-1 learns to denoise continuous action chunks at the action-token positions. During inference, it samples an action chunk, returns continuous actions, and executes `n_action_steps` from the chunk before sampling again.
|
||||
|
||||
### What the LeRobot Integration Covers
|
||||
|
||||
- Standard `policy.type=eo1` configuration through LeRobot
|
||||
- Qwen2.5-VL image and text preprocessing through policy processors
|
||||
- Continuous flow-matching action prediction
|
||||
- Checkpoint save/load through LeRobot policy APIs
|
||||
- Training with `lerobot-train` and evaluation with `lerobot-eval`
|
||||
|
||||
The broader EO-1 project also includes interleaved vision-text-action pretraining and multimodal reasoning workflows. This page focuses on the LeRobot robot-control policy path.
|
||||
|
||||
## Installation Requirements
|
||||
|
||||
1. Install LeRobot by following the [Installation Guide](./installation).
|
||||
2. Install EO-1 dependencies by running:
|
||||
|
||||
```bash
|
||||
pip install -e ".[eo1]"
|
||||
```
|
||||
|
||||
3. If you want to train or evaluate on LIBERO, install the LIBERO dependencies too:
|
||||
|
||||
```bash
|
||||
pip install -e ".[eo1,libero]"
|
||||
```
|
||||
|
||||
EO-1 can use the standard PyTorch scaled-dot-product attention backend through `policy.attn_implementation=sdpa`. If your environment has a compatible `flash_attn` installation, you can request `policy.attn_implementation=flash_attention_2`.
|
||||
|
||||
## Data Requirements
|
||||
|
||||
EO-1 expects a LeRobot dataset with:
|
||||
|
||||
- At least one visual observation, for example `observation.images.image`
|
||||
- `observation.state`
|
||||
- `action`
|
||||
- A language task instruction through the dataset `task` field
|
||||
|
||||
If your dataset uses different observation names, use `rename_map` to align them with the names expected by your training or evaluation setup.
|
||||
|
||||
## Usage
|
||||
|
||||
To use EO-1 in a LeRobot configuration, specify the policy type as:
|
||||
|
||||
```python
|
||||
policy.type=eo1
|
||||
```
|
||||
|
||||
By default, a new EO-1 policy initializes its backbone from:
|
||||
|
||||
```python
|
||||
policy.vlm_base=Qwen/Qwen2.5-VL-3B-Instruct
|
||||
```
|
||||
|
||||
Once a LeRobot-format EO-1 checkpoint is available, load it with:
|
||||
|
||||
```python
|
||||
policy.path=your-org/your-eo1-checkpoint
|
||||
```
|
||||
|
||||
## Training
|
||||
|
||||
### Training Command Example
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=your_org/your_dataset \
|
||||
--policy.type=eo1 \
|
||||
--policy.vlm_base=Qwen/Qwen2.5-VL-3B-Instruct \
|
||||
--policy.dtype=bfloat16 \
|
||||
--policy.attn_implementation=sdpa \
|
||||
--policy.gradient_checkpointing=false \
|
||||
--output_dir=./outputs/eo1_training \
|
||||
--job_name=eo1_training \
|
||||
--steps=300000 \
|
||||
--batch_size=16 \
|
||||
--policy.device=cuda
|
||||
```
|
||||
|
||||
### Key Training Parameters
|
||||
|
||||
| Parameter | Default | Description |
|
||||
| -------------------------------------- | ----------------------------- | ----------------------------------------------------------------------- |
|
||||
| `policy.vlm_base` | `Qwen/Qwen2.5-VL-3B-Instruct` | Qwen2.5-VL checkpoint used to initialize a new policy |
|
||||
| `policy.dtype` | `auto` | Backbone dtype request: `auto`, `bfloat16`, or `float32` |
|
||||
| `policy.attn_implementation` | `None` | Optional Qwen attention backend, such as `sdpa` |
|
||||
| `policy.gradient_checkpointing` | `false` | Reduces memory usage during training |
|
||||
| `policy.chunk_size` | `8` | Number of future actions predicted per chunk |
|
||||
| `policy.n_action_steps` | `8` | Number of actions consumed from a sampled chunk |
|
||||
| `policy.num_denoise_steps` | `10` | Number of flow-matching denoising steps used during sampling |
|
||||
| `policy.max_state_dim` | `32` | State padding dimension |
|
||||
| `policy.max_action_dim` | `32` | Action padding dimension |
|
||||
| `policy.force_fp32_autocast` | `true` | Keeps the flow head in fp32 even when the backbone uses mixed precision |
|
||||
| `policy.supervise_padding_action_dims` | `true` | Controls whether padded action dimensions are supervised |
|
||||
| `policy.supervise_padding_actions` | `true` | Controls whether padded future action rows are supervised |
|
||||
|
||||
## Evaluation
|
||||
|
||||
EO-1 can be evaluated through `lerobot-eval` once you have a LeRobot-format checkpoint:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=your-org/your-eo1-checkpoint \
|
||||
--env.type=libero \
|
||||
--env.task=libero_object \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20
|
||||
```
|
||||
|
||||
For datasets or environments whose camera names differ from the checkpoint configuration, pass a `rename_map`:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=your-org/your-eo1-checkpoint \
|
||||
--env.type=libero \
|
||||
--env.task=libero_object \
|
||||
--rename_map='{"observation.images.image2":"observation.images.wrist_image"}'
|
||||
```
|
||||
|
||||
## Configuration Notes
|
||||
|
||||
### Image Processing
|
||||
|
||||
EO-1 uses the Qwen2.5-VL processor. The `policy.image_min_pixels` and `policy.image_max_pixels` settings control the image resizing bounds before the visual tokens are passed into the backbone.
|
||||
|
||||
### State and Action Dimensions
|
||||
|
||||
The policy pads state and action vectors to `policy.max_state_dim` and `policy.max_action_dim` before the EO-1 flow head. Predictions are cropped back to the original action dimension before being returned by the policy.
|
||||
|
||||
### Attention Backend
|
||||
|
||||
Use `policy.attn_implementation=sdpa` for a portable setup. Use `flash_attention_2` only when `flash_attn` is installed and compatible with your environment.
|
||||
|
||||
## References
|
||||
|
||||
- [EO-1 project](https://github.com/EO-Robotics/EO1)
|
||||
- [EO-1 paper](https://arxiv.org/abs/2508.21112)
|
||||
- [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
|
||||
|
||||
## Citation
|
||||
|
||||
```bibtex
|
||||
@article{eo1,
|
||||
title={EO-1: Interleaved Vision-Text-Action Pretraining for General Robot Control},
|
||||
author={Delin Qu and Haoming Song and Qizhi Chen and Zhaoqing Chen and Xianqiang Gao and Xinyi Ye and Qi Lv and Modi Shi and Guanghui Ren and Cheng Ruan and Maoqing Yao and Haoran Yang and Jiacheng Bao and Bin Zhao and Dong Wang},
|
||||
journal={arXiv preprint},
|
||||
year={2025},
|
||||
url={https://arxiv.org/abs/2508.21112}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
This LeRobot integration follows the **Apache 2.0 License** used by LeRobot. Check the upstream EO-1 model and dataset pages for the licenses of released EO-1 checkpoints and data.
|
||||
@@ -108,7 +108,7 @@ lerobot-rollout --strategy.type=dagger \
|
||||
--teleop.port_left=/dev/ttyACM0 \
|
||||
--teleop.port_right=/dev/ttyACM1 \
|
||||
--policy.path=outputs/pretrain/checkpoints/last/pretrained_model \
|
||||
--dataset.repo_id=your-username/rollout_hil_dataset \
|
||||
--dataset.repo_id=your-username/hil-dataset \
|
||||
--dataset.single_task="Fold the T-shirt properly" \
|
||||
--dataset.fps=30 \
|
||||
--strategy.num_episodes=50 \
|
||||
@@ -135,7 +135,7 @@ lerobot-rollout --strategy.type=dagger \
|
||||
--teleop.port_left=/dev/ttyACM0 \
|
||||
--teleop.port_right=/dev/ttyACM1 \
|
||||
--policy.path=outputs/pretrain/checkpoints/last/pretrained_model \
|
||||
--dataset.repo_id=your-username/rollout_hil_rtc_dataset \
|
||||
--dataset.repo_id=your-username/hil-rtc-dataset \
|
||||
--dataset.single_task="Fold the T-shirt properly" \
|
||||
--dataset.fps=30 \
|
||||
--strategy.num_episodes=50 \
|
||||
|
||||
@@ -59,7 +59,7 @@ lerobot-rollout \
|
||||
--robot.type=so100_follower \
|
||||
--robot.port=/dev/ttyACM0 \
|
||||
--robot.cameras="{ front: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \
|
||||
--dataset.repo_id=${HF_USER}/rollout_eval_data \
|
||||
--dataset.repo_id=${HF_USER}/eval_data \
|
||||
--dataset.single_task="Put lego brick into the box" \
|
||||
--duration=3600
|
||||
```
|
||||
@@ -84,7 +84,7 @@ lerobot-rollout \
|
||||
--policy.path=${HF_USER}/my_policy \
|
||||
--robot.type=koch_follower \
|
||||
--robot.port=/dev/ttyACM0 \
|
||||
--dataset.repo_id=${HF_USER}/rollout_highlight_data \
|
||||
--dataset.repo_id=${HF_USER}/highlight_data \
|
||||
--dataset.single_task="Pick up the red cube"
|
||||
```
|
||||
|
||||
@@ -118,7 +118,7 @@ lerobot-rollout \
|
||||
--policy.path=outputs/pretrain/checkpoints/last/pretrained_model \
|
||||
--robot.type=bi_openarm_follower \
|
||||
--teleop.type=openarm_mini \
|
||||
--dataset.repo_id=${HF_USER}/rollout_hil_data \
|
||||
--dataset.repo_id=${HF_USER}/hil_data \
|
||||
--dataset.single_task="Fold the T-shirt"
|
||||
```
|
||||
|
||||
@@ -134,7 +134,7 @@ lerobot-rollout \
|
||||
--robot.port=/dev/ttyACM0 \
|
||||
--teleop.type=so101_leader \
|
||||
--teleop.port=/dev/ttyACM1 \
|
||||
--dataset.repo_id=${HF_USER}/rollout_dagger_data \
|
||||
--dataset.repo_id=${HF_USER}/dagger_data \
|
||||
--dataset.single_task="Grasp the block"
|
||||
```
|
||||
|
||||
|
||||
@@ -1,147 +0,0 @@
|
||||
# Language columns and recipes
|
||||
|
||||
Most LeRobot datasets ship with a single `task` string per episode — fine for
|
||||
short, single-instruction skills, but not enough for the longer-horizon,
|
||||
multi-modal robot policies the field is moving toward (high-level planning,
|
||||
memory, interjections, VQA, tool use). To support those policies without
|
||||
forking the dataset format, LeRobot extends `LeRobotDataset` with two optional
|
||||
language columns and a small recipe layer that turns those rows into
|
||||
chat-style training samples on the fly.
|
||||
|
||||
The design splits cleanly into three layers:
|
||||
|
||||
1. **Data in the dataset** — language annotations stored next to frames in
|
||||
`data/chunk-*/file-*.parquet` as two optional columns (`language_persistent`
|
||||
and `language_events`). Datasets without these columns keep their existing
|
||||
behavior.
|
||||
2. **Recipe** — a YAML file that declares which annotation rows to bind and
|
||||
how to lay them out as chat turns (`role`, `content`, optional images,
|
||||
optional tool calls). Recipes are pure config; no Python required to add a
|
||||
new one.
|
||||
3. **Training format** — at sample time, `RenderMessagesStep` resolves the
|
||||
recipe against the per-frame annotations and emits HF-style `messages` plus
|
||||
LeRobot-specific sidecars (`message_streams`, `target_message_indices`)
|
||||
that policy processors consume.
|
||||
|
||||
This page describes each layer in turn.
|
||||
|
||||
## Layer 1 — language columns in the dataset
|
||||
|
||||
The two optional columns live next to frame data in
|
||||
`data/chunk-*/file-*.parquet`:
|
||||
|
||||
- `language_persistent`: a list of rows broadcast across every frame in an episode for state that remains active, such as `subtask`, `plan`, and `memory`.
|
||||
- `language_events`: a list of rows only on the exact frame where an event was emitted, such as `interjection`, `vqa`, and speech tool calls.
|
||||
|
||||
Both columns share the same row shape (event rows omit `timestamp` because the
|
||||
frame the row sits on already provides it):
|
||||
|
||||
```text
|
||||
role: string
|
||||
content: string | null
|
||||
style: string | null
|
||||
timestamp: float64 # persistent rows only
|
||||
camera: string | null # observation.images.* feature key, view-dependent rows only
|
||||
tool_calls: list[Json] | null
|
||||
```
|
||||
|
||||
The `camera` field tags rows whose `content` is grounded in a specific camera
|
||||
view. Rows of view-dependent styles (`vqa` and `trace`) MUST set `camera` to
|
||||
the matching `observation.images.*` feature key. Rows of every other style —
|
||||
including `motion`, which describes robot-frame primitives in joint / Cartesian
|
||||
terms — MUST leave `camera` as `null`. Pipeline writers and the validator
|
||||
enforce this via `validate_camera_field(style, camera)`.
|
||||
|
||||
`meta/tasks.parquet` remains the canonical source for the task. The special `${task}` recipe binding always reads that task string and does not depend on language annotations.
|
||||
|
||||
### Architecture
|
||||
|
||||
The language stack itself has three internal modules backing layer 1:
|
||||
|
||||
1. `lerobot.datasets.language` defines the schema, style registry, and `column_for_style`.
|
||||
2. `lerobot.datasets.language_render` resolves rows and renders messages.
|
||||
3. `RenderMessagesStep` turns dataset samples into `messages`, `message_streams`, and `target_message_indices`.
|
||||
|
||||
`LeRobotDataset` stays recipe-agnostic. It passes `language_persistent` and `language_events` through when present, and unannotated datasets keep their existing behavior.
|
||||
|
||||
### Temporal semantics
|
||||
|
||||
Persistent styles are active after emission until replaced:
|
||||
|
||||
- `active_at(t, style=subtask)`
|
||||
- `nth_prev(style=memory, offset=1)`
|
||||
- `nth_next(style=subtask, offset=1)`
|
||||
|
||||
Event styles only exist on their exact timestamp:
|
||||
|
||||
- `emitted_at(t, style=interjection)`
|
||||
- `emitted_at(t, style=vqa, role=user, camera=observation.images.top)`
|
||||
- `emitted_at(t, role=assistant, tool_name=say)`
|
||||
|
||||
Exact event matching has no tolerance window, so writers must stamp event rows with frame timestamps from the parquet data.
|
||||
|
||||
### View-dependent resolution
|
||||
|
||||
For view-dependent styles (`vqa` and `trace`), the resolver gains a
|
||||
`camera=` filter parallel to `role=` and `tool_name=`. Datasets with multiple
|
||||
cameras typically emit one (`vqa`, `user`) + (`vqa`, `assistant`) pair per
|
||||
camera at the same timestamp; without `camera=`, those resolvers see two
|
||||
matches and raise an ambiguity error. Recipes consume each camera through its
|
||||
own binding plus a matching image block, e.g.
|
||||
|
||||
```yaml
|
||||
ask_vqa_top:
|
||||
bindings:
|
||||
vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
|
||||
vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
|
||||
messages:
|
||||
- role: user
|
||||
stream: high_level
|
||||
if_present: vqa_query
|
||||
content:
|
||||
- { type: image, feature: observation.images.top }
|
||||
- { type: text, text: "${vqa_query}" }
|
||||
- {
|
||||
role: assistant,
|
||||
content: "${vqa}",
|
||||
stream: high_level,
|
||||
target: true,
|
||||
if_present: vqa,
|
||||
}
|
||||
```
|
||||
|
||||
Add one such sub-recipe per camera the dataset records.
|
||||
|
||||
## Layer 2 — recipe anatomy
|
||||
|
||||
Recipes are YAML files backed by `TrainingRecipe` and `MessageTurn`. They
|
||||
declare which annotation rows to pull (via `bindings`) and how to compose them
|
||||
into chat turns (`messages`).
|
||||
|
||||
```yaml
|
||||
messages:
|
||||
- { role: user, content: "${task}", stream: high_level }
|
||||
- { role: assistant, content: "${subtask}", stream: low_level, target: true }
|
||||
```
|
||||
|
||||
A recipe can also branch into a weighted **blend** of sub-recipes. At sample
|
||||
time, exactly one branch is selected deterministically from the sample index,
|
||||
so different frames train different objectives (e.g. memory updates vs.
|
||||
low-level execution vs. VQA) without any Python wiring.
|
||||
|
||||
## Layer 3 — training format
|
||||
|
||||
Rendered samples use HF-style chat messages plus LeRobot sidecars:
|
||||
|
||||
```python
|
||||
sample["messages"]
|
||||
sample["message_streams"]
|
||||
sample["target_message_indices"]
|
||||
```
|
||||
|
||||
The renderer does not apply a tokenizer chat template. Policy processors decide how to serialize the messages for their backbone, which keeps the same dataset usable across SmolVLA, Pi0.5, and any future VLM that expects OpenAI-style chat messages.
|
||||
|
||||
## Graceful absence
|
||||
|
||||
If both language columns are missing, `None`, or empty, `RenderMessagesStep` is a no-op.
|
||||
If an event-scoped branch is selected on a frame without the required event row, rendering returns `None`, allowing a loader to retry another sample.
|
||||
@@ -1,188 +0,0 @@
|
||||
# LIBERO-plus
|
||||
|
||||
LIBERO-plus is a **robustness benchmark** for Vision-Language-Action (VLA) models built on top of [LIBERO](./libero). It systematically stress-tests policies by applying **seven independent perturbation dimensions** to the original LIBERO task set, exposing failure modes that standard benchmarks miss.
|
||||
|
||||
- Paper: [In-depth Robustness Analysis of Vision-Language-Action Models](https://arxiv.org/abs/2510.13626)
|
||||
- GitHub: [sylvestf/LIBERO-plus](https://github.com/sylvestf/LIBERO-plus)
|
||||
- Dataset: [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
|
||||
|
||||

|
||||
|
||||
## Perturbation dimensions
|
||||
|
||||
LIBERO-plus creates ~10 000 task variants by perturbing each original LIBERO task along these axes:
|
||||
|
||||
| Dimension | What changes |
|
||||
| --------------------- | ----------------------------------------------------- |
|
||||
| Objects layout | Target position, presence of confounding objects |
|
||||
| Camera viewpoints | Camera position, orientation, field-of-view |
|
||||
| Robot initial states | Manipulator start pose |
|
||||
| Language instructions | LLM-rewritten task description (paraphrase / synonym) |
|
||||
| Light conditions | Intensity, direction, color, shadow |
|
||||
| Background textures | Scene surface and object appearance |
|
||||
| Sensor noise | Photometric distortions and image degradation |
|
||||
|
||||
## Available task suites
|
||||
|
||||
LIBERO-plus covers the same five suites as LIBERO:
|
||||
|
||||
| Suite | CLI name | Tasks | Max steps | Description |
|
||||
| -------------- | ---------------- | ----- | --------- | -------------------------------------------------- |
|
||||
| LIBERO-Spatial | `libero_spatial` | 10 | 280 | Tasks requiring reasoning about spatial relations |
|
||||
| LIBERO-Object | `libero_object` | 10 | 280 | Tasks centered on manipulating different objects |
|
||||
| LIBERO-Goal | `libero_goal` | 10 | 300 | Goal-conditioned tasks with changing targets |
|
||||
| LIBERO-90 | `libero_90` | 90 | 400 | Short-horizon tasks from the LIBERO-100 collection |
|
||||
| LIBERO-Long | `libero_10` | 10 | 520 | Long-horizon tasks from the LIBERO-100 collection |
|
||||
|
||||
<Tip warning={true}>
|
||||
Installing LIBERO-plus **replaces** vanilla LIBERO — it uninstalls `hf-libero`
|
||||
so that `import libero` resolves to the LIBERO-plus fork. You cannot have both
|
||||
installed at the same time. To switch back to vanilla LIBERO, uninstall the
|
||||
fork and reinstall with `pip install -e ".[libero]"`.
|
||||
</Tip>
|
||||
|
||||
## Installation
|
||||
|
||||
### System dependencies (Linux only)
|
||||
|
||||
```bash
|
||||
sudo apt install libexpat1 libfontconfig1-dev libmagickwand-dev
|
||||
```
|
||||
|
||||
### Python package
|
||||
|
||||
```bash
|
||||
pip install -e ".[libero]" "robosuite==1.4.1" bddl easydict mujoco wand scikit-image gym
|
||||
git clone https://github.com/sylvestf/LIBERO-plus.git
|
||||
cd LIBERO-plus && pip install --no-deps -e .
|
||||
pip uninstall -y hf-libero # so `import libero` resolves to the fork
|
||||
```
|
||||
|
||||
LIBERO-plus is installed from its GitHub fork rather than a pyproject extra — the fork ships as a namespace package that pip can't handle, so it must be cloned and added to `PYTHONPATH`. See `docker/Dockerfile.benchmark.libero_plus` for the canonical install. MuJoCo is required, so only Linux is supported.
|
||||
|
||||
<Tip>
|
||||
Set the MuJoCo rendering backend before running evaluation:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # headless / HPC / cloud
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
### Download LIBERO-plus assets
|
||||
|
||||
LIBERO-plus ships its extended asset pack separately. Download `assets.zip` from the [Hugging Face dataset](https://huggingface.co/datasets/Sylvest/LIBERO-plus/tree/main) and extract it into the LIBERO-plus package directory:
|
||||
|
||||
```bash
|
||||
# After installing the package, find where it was installed:
|
||||
python -c "import libero; print(libero.__file__)"
|
||||
# Then extract assets.zip into <package_root>/libero/assets/
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Default evaluation (recommended)
|
||||
|
||||
Evaluate across the four standard suites (10 episodes per task):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial,libero_object,libero_goal,libero_10 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--env.max_parallel_tasks=1
|
||||
```
|
||||
|
||||
### Single-suite evaluation
|
||||
|
||||
Evaluate on one LIBERO-plus suite:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
- `--env.task` picks the suite (`libero_spatial`, `libero_object`, etc.).
|
||||
- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite.
|
||||
- `--eval.batch_size` controls how many environments run in parallel.
|
||||
- `--eval.n_episodes` sets how many episodes to run per task.
|
||||
|
||||
### Multi-suite evaluation
|
||||
|
||||
Benchmark a policy across multiple suites at once by passing a comma-separated list:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial,libero_object \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
### Control mode
|
||||
|
||||
LIBERO-plus supports two control modes — `relative` (default) and `absolute`. Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy:
|
||||
|
||||
```bash
|
||||
--env.control_mode=relative # or "absolute"
|
||||
```
|
||||
|
||||
### Policy inputs and outputs
|
||||
|
||||
**Observations:**
|
||||
|
||||
- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos)
|
||||
- `observation.images.image` — main camera view (`agentview_image`), HWC uint8
|
||||
- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results.
|
||||
|
||||
## Training
|
||||
|
||||
### Dataset
|
||||
|
||||
A LeRobot-format training dataset for LIBERO-plus is available at:
|
||||
|
||||
- [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
|
||||
|
||||
### Example training command
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.type=smolvla \
|
||||
--policy.repo_id=${HF_USER}/smolvla_libero_plus \
|
||||
--policy.load_vlm_weights=true \
|
||||
--dataset.repo_id=lerobot/libero_plus \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial \
|
||||
--output_dir=./outputs/ \
|
||||
--steps=100000 \
|
||||
--batch_size=4 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval_freq=1000
|
||||
```
|
||||
|
||||
## Relationship to LIBERO
|
||||
|
||||
LIBERO-plus is a drop-in extension of LIBERO:
|
||||
|
||||
- Same Python gym interface (`LiberoEnv`, `LiberoProcessorStep`)
|
||||
- Same camera names and observation/action format
|
||||
- Same task suite names
|
||||
- Installs under the same `libero` Python package name (different GitHub repo)
|
||||
|
||||
To use the original LIBERO benchmark, see [LIBERO](./libero) and use `--env.type=libero`.
|
||||
@@ -61,6 +61,17 @@ lerobot-eval \
|
||||
--rename_map='{"observation.images.image": "observation.images.base_0_rgb", "observation.images.image2": "observation.images.left_wrist_0_rgb"}'
|
||||
```
|
||||
|
||||
### Recording
|
||||
|
||||
`lerobot-record` also supports rename maps, nested under the dataset config:
|
||||
|
||||
```bash
|
||||
lerobot-record \ # When running inference
|
||||
--policy.path="<user>/smolVLA_finetuned" \
|
||||
... \
|
||||
--dataset.rename_map='{"observation.images.glove2": "observation.images.image"}'
|
||||
```
|
||||
|
||||
## Alternative: edit the policy config directly
|
||||
|
||||
If you always use the same dataset or environment, you can **edit the policy's `config.json`** so its observation keys match your data source. Then no rename map is needed.
|
||||
@@ -94,10 +105,10 @@ XVLA-base has three visual inputs and `empty_cameras=0` by default. Your dataset
|
||||
|
||||
## Quick reference
|
||||
|
||||
| Goal | What to do |
|
||||
| --------------------------------------- | --------------------------------------------------------------------------- |
|
||||
| Dataset keys ≠ policy keys | `--rename_map='{"dataset_key": "policy_key", ...}'` |
|
||||
| Env keys ≠ policy keys (eval) | `--rename_map='{"env_key": "policy_key", ...}'` |
|
||||
| Rollout with different keys (inference) | `--rename_map='{"source_key": "policy_key", ...}'`. |
|
||||
| Fewer cameras than policy expects | `--policy.empty_cameras=N` (supported by PI0, PI05, PI0Fast, SmolVLA, XVLA) |
|
||||
| Avoid passing a rename map | Edit the policy's `config.json` so its keys match your data source |
|
||||
| Goal | What to do |
|
||||
| ----------------------------------------- | --------------------------------------------------------------------------- |
|
||||
| Dataset keys ≠ policy keys | `--rename_map='{"dataset_key": "policy_key", ...}'` |
|
||||
| Env keys ≠ policy keys (eval) | `--rename_map='{"env_key": "policy_key", ...}'` |
|
||||
| Recording with different keys (inference) | `--dataset.rename_map='{"source_key": "policy_key", ...}'`. |
|
||||
| Fewer cameras than policy expects | `--policy.empty_cameras=N` (supported by PI0, PI05, PI0Fast, SmolVLA, XVLA) |
|
||||
| Avoid passing a rename map | Edit the policy's `config.json` so its keys match your data source |
|
||||
|
||||
@@ -1,188 +0,0 @@
|
||||
# RoboCasa365
|
||||
|
||||
[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base).
|
||||
|
||||
- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523)
|
||||
- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa)
|
||||
- Project website: [robocasa.ai](https://robocasa.ai)
|
||||
- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa)
|
||||
- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge)
|
||||
|
||||
<img
|
||||
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/robocasa-banner.webp"
|
||||
alt="RoboCasa365 benchmark overview"
|
||||
width="85%"
|
||||
/>
|
||||
|
||||
## Available tasks
|
||||
|
||||
RoboCasa365 organizes its 365 tasks into two families and three upstream benchmark groups that LeRobot exposes as first-class `--env.task` shortcuts:
|
||||
|
||||
| Family | Tasks | Description |
|
||||
| --------- | ----- | ------------------------------------------------------------------------------- |
|
||||
| Atomic | ~65 | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control |
|
||||
| Composite | ~300 | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc. |
|
||||
|
||||
**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`.
|
||||
|
||||
**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more.
|
||||
|
||||
`--env.task` accepts three forms:
|
||||
|
||||
- a single task name (`CloseFridge`)
|
||||
- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`)
|
||||
- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`).
|
||||
|
||||
## Installation
|
||||
|
||||
RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its shadowed `lerobot` pin):
|
||||
|
||||
```bash
|
||||
# After following the standard LeRobot installation instructions.
|
||||
|
||||
git clone https://github.com/robocasa/robocasa.git ~/robocasa
|
||||
git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite
|
||||
pip install -e ~/robocasa --no-deps
|
||||
pip install -e ~/robosuite
|
||||
|
||||
# Robocasa's runtime deps (the ones its setup.py would have pulled, minus
|
||||
# the bad lerobot pin).
|
||||
pip install numpy numba scipy mujoco pygame Pillow opencv-python \
|
||||
pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \
|
||||
tianshou gymnasium
|
||||
|
||||
python -m robocasa.scripts.setup_macros
|
||||
# Lightweight assets (lightwheel object meshes + textures). Enough for
|
||||
# the default env out of the box.
|
||||
python -m robocasa.scripts.download_kitchen_assets \
|
||||
--type tex tex_generative fixtures_lw objs_lw
|
||||
# Optional: full objaverse/aigen registries (~30GB) for richer object
|
||||
# variety. Enable at eval time via --env.obj_registries (see below).
|
||||
# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse
|
||||
```
|
||||
|
||||
<Tip>
|
||||
RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
### Object registries
|
||||
|
||||
By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime:
|
||||
|
||||
```bash
|
||||
--env.obj_registries='[objaverse,lightwheel]'
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on.
|
||||
|
||||
### Single-task evaluation (recommended for quick iteration)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Multi-task evaluation
|
||||
|
||||
Pass a comma-separated list of tasks:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Benchmark-group evaluation
|
||||
|
||||
Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=atomic_seen \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results.
|
||||
|
||||
## Policy inputs and outputs
|
||||
|
||||
**Observations** (raw RoboCasa camera names are preserved verbatim):
|
||||
|
||||
- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos)
|
||||
- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8
|
||||
- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8
|
||||
- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D).
|
||||
|
||||
## Training
|
||||
|
||||
### Single-task example
|
||||
|
||||
A ready-to-use single-task dataset is on the Hub:
|
||||
[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge).
|
||||
|
||||
Fine-tune a SmolVLA base on `CloseFridge`:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.type=smolvla \
|
||||
--policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \
|
||||
--policy.load_vlm_weights=true \
|
||||
--policy.push_to_hub=true \
|
||||
--dataset.repo_id=pepijn223/robocasa_CloseFridge \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge \
|
||||
--output_dir=./outputs/smolvla_robocasa_CloseFridge \
|
||||
--steps=100000 \
|
||||
--batch_size=4 \
|
||||
--eval_freq=5000 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=5 \
|
||||
--save_freq=10000
|
||||
```
|
||||
|
||||
Evaluate the resulting checkpoint:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20
|
||||
```
|
||||
|
||||
## Reproducing published results
|
||||
|
||||
The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack.
|
||||
@@ -1,99 +0,0 @@
|
||||
# RoboCerebra
|
||||
|
||||
[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF).
|
||||
|
||||
- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677)
|
||||
- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/)
|
||||
- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks.
|
||||
- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra)
|
||||
|
||||
## Available tasks
|
||||
|
||||
RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite:
|
||||
|
||||
| Suite | CLI name | Tasks | Description |
|
||||
| --------- | ----------- | ----- | ------------------------------------------------------------- |
|
||||
| LIBERO-10 | `libero_10` | 10 | Long-horizon kitchen/living room tasks chaining 3–6 sub-goals |
|
||||
|
||||
Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals.
|
||||
|
||||
## Installation
|
||||
|
||||
RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need:
|
||||
|
||||
```bash
|
||||
pip install -e ".[libero]"
|
||||
```
|
||||
|
||||
<Tip>
|
||||
RoboCerebra requires Linux (MuJoCo / robosuite). Set the rendering backend before training or evaluation:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## Evaluation
|
||||
|
||||
RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocerebra \
|
||||
--env.type=libero \
|
||||
--env.task=libero_10 \
|
||||
--env.fps=20 \
|
||||
--env.obs_type=pixels_agent_pos \
|
||||
--env.observation_height=256 \
|
||||
--env.observation_width=256 \
|
||||
'--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \
|
||||
--policy.empty_cameras=1
|
||||
```
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper.
|
||||
|
||||
## Policy inputs and outputs
|
||||
|
||||
**Observations:**
|
||||
|
||||
- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper)
|
||||
- `observation.images.image` — third-person view, 256×256 HWC uint8
|
||||
- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D)
|
||||
|
||||
## Training
|
||||
|
||||
The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations:
|
||||
|
||||
| Feature | Shape | Description |
|
||||
| -------------------------------- | ------------- | -------------------- |
|
||||
| `observation.images.image` | (256, 256, 3) | Third-person view |
|
||||
| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera |
|
||||
| `observation.state` | (8,) | Joint pos + gripper |
|
||||
| `action` | (7,) | EEF delta + gripper |
|
||||
|
||||
Fine-tune a SmolVLA base on it:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.path=lerobot/smolvla_base \
|
||||
--dataset.repo_id=lerobot/robocerebra_unified \
|
||||
--env.type=libero \
|
||||
--env.task=libero_10 \
|
||||
--output_dir=outputs/smolvla_robocerebra
|
||||
```
|
||||
|
||||
## Reproducing published results
|
||||
|
||||
The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark.
|
||||
@@ -1,130 +0,0 @@
|
||||
# RoboMME
|
||||
|
||||
[RoboMME](https://robomme.github.io) is a memory-augmented manipulation benchmark built on ManiSkill (SAPIEN). It evaluates a robot's ability to retain and use information across an episode — counting, object permanence, reference, and imitation.
|
||||
|
||||
- **16 tasks** across 4 memory-skill suites
|
||||
- **1,600 training demos** (100 per task, 50 val, 50 test)
|
||||
- **Dataset**: [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) — LeRobot v3.0, 768K frames at 10 fps
|
||||
- **Simulator**: ManiSkill / SAPIEN, Panda arm, Linux only
|
||||
|
||||

|
||||
|
||||
## Tasks
|
||||
|
||||
| Suite | Tasks |
|
||||
| --------------------------------- | ------------------------------------------------------------- |
|
||||
| **Counting** (temporal memory) | BinFill, PickXtimes, SwingXtimes, StopCube |
|
||||
| **Permanence** (spatial memory) | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap |
|
||||
| **Reference** (object memory) | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder |
|
||||
| **Imitation** (procedural memory) | MoveCube, InsertPeg, PatternLock, RouteStick |
|
||||
|
||||
## Installation
|
||||
|
||||
> RoboMME requires **Linux** (ManiSkill/SAPIEN uses Vulkan rendering). Docker is recommended to isolate dependency conflicts.
|
||||
|
||||
### Native (Linux)
|
||||
|
||||
```bash
|
||||
pip install --override <(printf 'gymnasium==0.29.1\nnumpy==1.26.4\n') \
|
||||
-e '.[smolvla,av-dep]' \
|
||||
'robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main'
|
||||
```
|
||||
|
||||
> **Dependency note**: `mani-skill` (pulled by `robomme`) pins `gymnasium==0.29.1` and `numpy<2.0.0`, which conflict with lerobot's base `numpy>=2.0.0`. That's why `robomme` is not a pyproject extra — use the override install above, or the Docker approach below to avoid conflicts entirely.
|
||||
|
||||
### Docker (recommended)
|
||||
|
||||
```bash
|
||||
# Build base image first (from repo root)
|
||||
docker build -f docker/Dockerfile.eval-base -t lerobot-eval-base .
|
||||
|
||||
# Build RoboMME eval image (applies gymnasium + numpy pin overrides)
|
||||
docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-robomme .
|
||||
```
|
||||
|
||||
The `docker/Dockerfile.benchmark.robomme` image overrides `gymnasium==0.29.1` and `numpy==1.26.4` after lerobot's install. Both versions are runtime-safe for lerobot's actual API usage.
|
||||
|
||||
## Running Evaluation
|
||||
|
||||
### Default (single task, single episode)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=<your_policy_repo> \
|
||||
--env.type=robomme \
|
||||
--env.task=PickXtimes \
|
||||
--env.dataset_split=test \
|
||||
--env.task_ids=[0] \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1
|
||||
```
|
||||
|
||||
### Multi-task evaluation
|
||||
|
||||
Evaluate multiple tasks in one run by comma-separating task names. Use `task_ids` to control which episodes are evaluated per task. Recommended: 50 episodes per task for the test split.
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=<your_policy_repo> \
|
||||
--env.type=robomme \
|
||||
--env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
|
||||
--env.dataset_split=test \
|
||||
--env.task_ids=[0,1,2,3,4,5,6,7,8,9] \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=50
|
||||
```
|
||||
|
||||
### Key CLI options for `env.type=robomme`
|
||||
|
||||
| Option | Default | Description |
|
||||
| -------------------- | ------------- | -------------------------------------------------- |
|
||||
| `env.task` | `PickXtimes` | Any of the 16 task names above (comma-separated) |
|
||||
| `env.dataset_split` | `test` | `train`, `val`, or `test` |
|
||||
| `env.action_space` | `joint_angle` | `joint_angle` (8-D) or `ee_pose` (7-D) |
|
||||
| `env.episode_length` | `300` | Max steps per episode |
|
||||
| `env.task_ids` | `null` | List of episode indices to evaluate (null = `[0]`) |
|
||||
|
||||
## Dataset
|
||||
|
||||
The dataset [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) is in **LeRobot v3.0 format** and can be loaded directly:
|
||||
|
||||
```python
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
dataset = LeRobotDataset("lerobot/robomme")
|
||||
```
|
||||
|
||||
### Dataset features
|
||||
|
||||
| Feature | Shape | Description |
|
||||
| ------------------ | ------------- | ------------------------------- |
|
||||
| `image` | (256, 256, 3) | Front camera RGB |
|
||||
| `wrist_image` | (256, 256, 3) | Wrist camera RGB |
|
||||
| `actions` | (8,) | Joint angles + gripper |
|
||||
| `state` | (8,) | Joint positions + gripper state |
|
||||
| `simple_subgoal` | str | High-level language annotation |
|
||||
| `grounded_subgoal` | str | Grounded language annotation |
|
||||
| `episode_index` | int | Episode ID |
|
||||
| `frame_index` | int | Frame within episode |
|
||||
|
||||
### Feature key alignment (training)
|
||||
|
||||
The env wrapper exposes `pixels/image` and `pixels/wrist_image` as observation keys. The `features_map` in `RoboMMEEnv` maps these to `observation.images.image` and `observation.images.wrist_image` for the policy. State is exposed as `agent_pos` and maps to `observation.state`.
|
||||
|
||||
The dataset's `image` and `wrist_image` columns already align with the policy input keys, so no renaming is needed when fine-tuning.
|
||||
|
||||
## Action Spaces
|
||||
|
||||
| Type | Dim | Description |
|
||||
| ------------- | --- | --------------------------------------------------------- |
|
||||
| `joint_angle` | 8 | 7 joint angles + 1 gripper (−1 closed, +1 open, absolute) |
|
||||
| `ee_pose` | 7 | xyz + roll/pitch/yaw + gripper |
|
||||
|
||||
Set via `--env.action_space=joint_angle` (default) or `--env.action_space=ee_pose`.
|
||||
|
||||
## Platform Notes
|
||||
|
||||
- **Linux only**: ManiSkill requires SAPIEN/Vulkan. macOS and Windows are not supported.
|
||||
- **GPU recommended**: Rendering is CPU-capable but slow; CUDA + Vulkan gives full speed.
|
||||
- **gymnasium / numpy conflict**: See installation note above. Docker image handles this automatically.
|
||||
- **ManiSkill fork**: `robomme` depends on a specific ManiSkill fork (`YinpeiDai/ManiSkill`), pulled in automatically via the `robomme` package.
|
||||
@@ -1,223 +0,0 @@
|
||||
# RoboTwin 2.0
|
||||
|
||||
RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions).
|
||||
|
||||
- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088)
|
||||
- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)
|
||||
- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard)
|
||||
- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified)
|
||||
|
||||

|
||||
|
||||
## Overview
|
||||
|
||||
| Property | Value |
|
||||
| ------------- | -------------------------------------------------------- |
|
||||
| Tasks | 50 dual-arm manipulation tasks |
|
||||
| Robot | Aloha-AgileX bimanual (14 DOF, 7 per arm) |
|
||||
| Action space | 14-dim joint-space, continuous in `[-1, 1]` |
|
||||
| Cameras | `head_camera`, `left_camera`, `right_camera` |
|
||||
| Simulator | SAPIEN (not MuJoCo) |
|
||||
| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations |
|
||||
| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) |
|
||||
|
||||
## Available tasks
|
||||
|
||||
RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. Example tasks:
|
||||
|
||||
| Task | CLI name | Category |
|
||||
| ------------------------ | ------------------------ | ----------------- |
|
||||
| Beat block with hammer | `beat_block_hammer` | Tool use |
|
||||
| Click bell / alarm clock | `click_bell` | Precision press |
|
||||
| Stack blocks (2 / 3) | `stack_blocks_two/three` | Stacking |
|
||||
| Stack bowls (2 / 3) | `stack_bowls_two/three` | Stacking |
|
||||
| Handover block / mic | `handover_block` | Bimanual coord. |
|
||||
| Lift pot | `lift_pot` | Bimanual lift |
|
||||
| Shake bottle | `shake_bottle` | Continuous motion |
|
||||
| Turn switch | `turn_switch` | Articulated obj |
|
||||
| Stamp seal | `stamp_seal` | Precision place |
|
||||
| Scan object | `scan_object` | Mobile manip. |
|
||||
|
||||
Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep.
|
||||
|
||||
<Tip warning={true}>
|
||||
`open_laptop` is currently broken upstream (its `check_success()` uses
|
||||
`self.arm_tag`, which is only set inside the scripted-expert `play_once()`
|
||||
path and therefore unavailable during normal policy eval). Avoid it until the
|
||||
upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in
|
||||
`load_actors()`.
|
||||
</Tip>
|
||||
|
||||
## Dataset
|
||||
|
||||
The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub:
|
||||
|
||||
```
|
||||
lerobot/robotwin_unified
|
||||
```
|
||||
|
||||
It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels.
|
||||
|
||||
You can load it directly with the HF Datasets library:
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("lerobot/robotwin_unified", split="train")
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes.
|
||||
|
||||
### 1. Create a conda environment
|
||||
|
||||
```bash
|
||||
conda create -n robotwin python=3.10 -y
|
||||
conda activate robotwin
|
||||
```
|
||||
|
||||
### 2. Install LeRobot
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/lerobot.git
|
||||
cd lerobot
|
||||
pip install -e "."
|
||||
```
|
||||
|
||||
### 3. Install RoboTwin 2.0
|
||||
|
||||
```bash
|
||||
git clone https://github.com/RoboTwin-Platform/RoboTwin.git
|
||||
cd RoboTwin
|
||||
bash script/_install.sh
|
||||
bash script/_download_assets.sh
|
||||
```
|
||||
|
||||
The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d.
|
||||
|
||||
<Tip warning={true}>
|
||||
If the automated install fails, install manually:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
|
||||
cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo
|
||||
pip install -e . --no-build-isolation
|
||||
```
|
||||
|
||||
Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional.
|
||||
|
||||
</Tip>
|
||||
|
||||
### 4. Add RoboTwin to PYTHONPATH
|
||||
|
||||
The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory:
|
||||
|
||||
```bash
|
||||
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
|
||||
```
|
||||
|
||||
Add this to your shell profile to make it permanent.
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Standard evaluation (recommended)
|
||||
|
||||
Evaluate a policy on a single task with the official protocol (100 episodes):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
### Single-task quick check
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=5
|
||||
```
|
||||
|
||||
### Multi-task sweep
|
||||
|
||||
Evaluate on several tasks in one run:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
### Full benchmark (all 50 tasks)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
<Tip>
|
||||
`open_laptop` is intentionally omitted above because of the upstream
|
||||
`self.arm_tag` bug (see the **Available tasks** section). Re-add it once the
|
||||
upstream fix lands.
|
||||
</Tip>
|
||||
|
||||
## Camera configuration
|
||||
|
||||
By default, all three cameras are included:
|
||||
|
||||
| Camera key | Description |
|
||||
| -------------- | ------------------------------ |
|
||||
| `head_camera` | Torso-mounted overhead view |
|
||||
| `left_camera` | Left arm wrist-mounted camera |
|
||||
| `right_camera` | Right arm wrist-mounted camera |
|
||||
|
||||
To use a subset of cameras, override `--env.camera_names`:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--env.camera_names="head_camera,left_camera" \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
## Environment config reference
|
||||
|
||||
Key parameters for `RoboTwinEnvConfig`:
|
||||
|
||||
| Parameter | Default | Description |
|
||||
| -------------------- | ---------------------------------------- | ---------------------------------- |
|
||||
| `task` | `"beat_block_hammer"` | Comma-separated task name(s) |
|
||||
| `fps` | `25` | Simulation FPS |
|
||||
| `episode_length` | `300` | Max steps per episode |
|
||||
| `obs_type` | `"pixels_agent_pos"` | `"pixels"` or `"pixels_agent_pos"` |
|
||||
| `camera_names` | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras |
|
||||
| `observation_height` | `240` | Camera pixel height |
|
||||
| `observation_width` | `320` | Camera pixel width |
|
||||
|
||||
## Leaderboard submission
|
||||
|
||||
Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires:
|
||||
|
||||
- Training on 50 `demo_clean` demonstrations per task
|
||||
- Evaluating 100 episodes per task
|
||||
- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings
|
||||
|
||||
For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/).
|
||||
+28
-29
@@ -46,7 +46,7 @@ This ensures identical task states map to consistent progress values, even acros
|
||||
|
||||
## Inputs and Targets (What the new code expects)
|
||||
|
||||
SARM is trained through its processor (`src/lerobot/rewards/sarm/processor_sarm.py`), which:
|
||||
SARM is trained through its processor (`src/lerobot/policies/sarm/processor_sarm.py`), which:
|
||||
|
||||
- **Encodes** images and task text with CLIP (ViT-B/32) into `video_features` and `text_features`
|
||||
- **Pads/truncates** robot state into `state_features` (up to `max_state_dim`)
|
||||
@@ -347,7 +347,7 @@ Use `compute_rabc_weights.py` with `--visualize-only` to visualize model predict
|
||||
<hfoption id="single_stage">
|
||||
|
||||
```bash
|
||||
python -m lerobot.rewards.sarm.compute_rabc_weights \
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \
|
||||
--dataset-repo-id your-username/your-dataset \
|
||||
--reward-model-path your-username/sarm-model \
|
||||
--visualize-only \
|
||||
@@ -360,7 +360,7 @@ python -m lerobot.rewards.sarm.compute_rabc_weights \
|
||||
<hfoption id="dense_only">
|
||||
|
||||
```bash
|
||||
python -m lerobot.rewards.sarm.compute_rabc_weights \
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \
|
||||
--dataset-repo-id your-username/your-dataset \
|
||||
--reward-model-path your-username/sarm-model \
|
||||
--visualize-only \
|
||||
@@ -373,7 +373,7 @@ python -m lerobot.rewards.sarm.compute_rabc_weights \
|
||||
<hfoption id="dual">
|
||||
|
||||
```bash
|
||||
python -m lerobot.rewards.sarm.compute_rabc_weights \
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \
|
||||
--dataset-repo-id your-username/your-dataset \
|
||||
--reward-model-path your-username/sarm-model \
|
||||
--visualize-only \
|
||||
@@ -429,7 +429,7 @@ The weighting follows **Equations 8-9** from the paper:
|
||||
First, run the SARM model on all frames in your dataset to compute progress values:
|
||||
|
||||
```bash
|
||||
python -m lerobot.rewards.sarm.compute_rabc_weights \
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \
|
||||
--dataset-repo-id your-username/your-dataset \
|
||||
--reward-model-path your-username/sarm-model \
|
||||
--head-mode sparse \
|
||||
@@ -465,15 +465,15 @@ This script:
|
||||
|
||||
### Step 5b: Train Policy with RA-BC
|
||||
|
||||
Once you have the progress file, train your policy with RA-BC weighting. The progress file is auto-detected from the dataset path (`sarm_progress.parquet`) if not explicitly provided. Currently PI0, PI0.5 and SmolVLA are supported with RA-BC:
|
||||
Once you have the progress file, train your policy with RA-BC weighting. The progress file is auto-detected from the dataset path (`sarm_progress.parquet`). Currently PI0, PI0.5 and SmolVLA are supported with RA-BC:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--dataset.repo_id=your-username/your-dataset \
|
||||
--policy.type=pi0 \
|
||||
--sample_weighting.type=rabc \
|
||||
--sample_weighting.head_mode=sparse \
|
||||
--sample_weighting.kappa=0.01 \
|
||||
--use_rabc=true \
|
||||
--rabc_head_mode=sparse \
|
||||
--rabc_kappa=0.01 \
|
||||
--output_dir=outputs/train/policy_rabc \
|
||||
--batch_size=32 \
|
||||
--steps=40000
|
||||
@@ -488,13 +488,12 @@ The training script automatically:
|
||||
|
||||
**RA-BC Arguments:**
|
||||
|
||||
| Argument | Description | Default |
|
||||
| ---------------------------------- | ------------------------------------------------------ | ----------------------- |
|
||||
| `--sample_weighting.type` | Weighting strategy type (`rabc` or `uniform`) | `rabc` |
|
||||
| `--sample_weighting.progress_path` | Path to progress parquet file | `sarm_progress.parquet` |
|
||||
| `--sample_weighting.head_mode` | Which SARM head's progress to use: `sparse` or `dense` | `sparse` |
|
||||
| `--sample_weighting.kappa` | Threshold κ for high-quality samples | `0.01` |
|
||||
| `--sample_weighting.epsilon` | Small constant for numerical stability | `1e-6` |
|
||||
| Argument | Description | Default |
|
||||
| ---------------------- | ---------------------------------------------------------- | ---------------------------------- |
|
||||
| `--use_rabc` | Enable RA-BC sample weighting | `false` |
|
||||
| `--rabc_progress_path` | Path to progress parquet file (auto-detected from dataset) | `sarm_progress.parquet` in dataset |
|
||||
| `--rabc_head_mode` | Which SARM head's progress to use: `sparse` or `dense` | `sparse` |
|
||||
| `--rabc_kappa` | Threshold κ for high-quality samples | `0.01` |
|
||||
|
||||
### Tuning RA-BC Kappa
|
||||
|
||||
@@ -512,30 +511,30 @@ The `kappa` parameter is the threshold that determines which samples get full we
|
||||
|
||||
Monitor these WandB metrics during training:
|
||||
|
||||
| Metric | Healthy Range | Problem Indicator |
|
||||
| ----------------------------- | ------------- | ------------------------- |
|
||||
| `sample_weight_mean_weight` | 0.3 - 0.8 | ≈ 1.0 means kappa too low |
|
||||
| `sample_weighting/delta_mean` | > 0 | Should be positive |
|
||||
| `sample_weighting/delta_std` | > 0 | Variance in data quality |
|
||||
| Metric | Healthy Range | Problem Indicator |
|
||||
| ------------------ | ------------- | ------------------------- |
|
||||
| `rabc_mean_weight` | 0.3 - 0.8 | ≈ 1.0 means kappa too low |
|
||||
| `rabc_delta_mean` | > 0 | Should be positive |
|
||||
| `rabc_delta_std` | > 0 | Variance in data quality |
|
||||
|
||||
**If `sample_weight_mean_weight ≈ 1.0`:** Your kappa is too low. Most samples have `delta > kappa` and bypass the soft-weighting entirely. RA-BC becomes equivalent to vanilla BC.
|
||||
**If `rabc_mean_weight ≈ 1.0`:** Your kappa is too low. Most samples have `delta > kappa` and bypass the soft-weighting entirely. RA-BC becomes equivalent to vanilla BC.
|
||||
|
||||
**Setting kappa based on your data:**
|
||||
|
||||
The default `kappa=0.01` was tuned for the paper's T-shirt folding task (~90s episodes at 30fps). For your dataset, check the logged `sample_weighting/delta_mean` and `sample_weighting/delta_std`:
|
||||
The default `kappa=0.01` was tuned for the paper's T-shirt folding task (~90s episodes at 30fps). For your dataset, check the logged `rabc_delta_mean` and `rabc_delta_std`:
|
||||
|
||||
```
|
||||
# If delta_mean ≈ 0.03 and delta_std ≈ 0.02:
|
||||
# Most deltas fall in range [0.01, 0.05]
|
||||
|
||||
# Option 1: Set kappa = delta_mean (medium selectivity)
|
||||
--sample_weighting.kappa=0.03
|
||||
--rabc_kappa=0.03
|
||||
|
||||
# Option 2: Set kappa = delta_mean + delta_std (high selectivity)
|
||||
--sample_weighting.kappa=0.05
|
||||
--rabc_kappa=0.05
|
||||
|
||||
# Option 3: Set kappa = delta_mean + 2*delta_std (very selective)
|
||||
--sample_weighting.kappa=0.07
|
||||
--rabc_kappa=0.07
|
||||
```
|
||||
|
||||
**When RA-BC may not help:**
|
||||
@@ -551,8 +550,8 @@ accelerate launch \
|
||||
src/lerobot/scripts/lerobot_train.py \
|
||||
--dataset.repo_id=your-username/your-dataset \
|
||||
--policy.type=pi0 \
|
||||
--sample_weighting.type=rabc \
|
||||
--sample_weighting.kappa=0.01 \
|
||||
--use_rabc=true \
|
||||
--rabc_kappa=0.01 \
|
||||
--output_dir=outputs/train/policy_rabc \
|
||||
--batch_size=32 \
|
||||
--steps=40000
|
||||
@@ -577,7 +576,7 @@ accelerate launch \
|
||||
### RA-BC
|
||||
|
||||
1. **Train SARM first**: RA-BC quality depends entirely on SARM quality
|
||||
2. **Monitor `sample_weight_mean_weight`**: If it's ≈ 1.0, increase kappa (see [Tuning RA-BC Kappa](#tuning-ra-bc-kappa))
|
||||
2. **Monitor `rabc_mean_weight`**: If it's ≈ 1.0, increase kappa (see [Tuning RA-BC Kappa](#tuning-ra-bc-kappa))
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,210 +0,0 @@
|
||||
# Tools
|
||||
|
||||
LeRobot v3.1 supports **tool calls** in policies — assistant messages can
|
||||
emit structured invocations like `say(text="OK, starting now")` that the
|
||||
runtime dispatches to a real implementation (TTS, controller, logger, …).
|
||||
|
||||
This page covers:
|
||||
|
||||
1. Where the tool catalog lives.
|
||||
2. How the annotation pipeline produces tool-call atoms.
|
||||
3. How to add your own tool.
|
||||
|
||||
## Where tools are declared
|
||||
|
||||
Two layers.
|
||||
|
||||
**The catalog** — a list of OpenAI-style function schemas — lives at
|
||||
`meta/info.json["tools"]` on each dataset. Example:
|
||||
|
||||
```json
|
||||
{
|
||||
"features": { "...": "..." },
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "say",
|
||||
"description": "Speak a short utterance to the user via the TTS executor.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The verbatim text to speak."
|
||||
}
|
||||
},
|
||||
"required": ["text"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Read it via the dataset metadata accessor:
|
||||
|
||||
```python
|
||||
from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata
|
||||
|
||||
meta = LeRobotDatasetMetadata(repo_id="pepijn/super_poulain_final_annotations")
|
||||
tools = meta.tools # list[dict] — OpenAI tool schemas
|
||||
```
|
||||
|
||||
If the dataset's `info.json` doesn't declare any tools, `meta.tools`
|
||||
returns `DEFAULT_TOOLS` from `lerobot.datasets.language` — currently a
|
||||
single-entry list with the canonical `say` schema. So unannotated
|
||||
datasets and chat-template consumers keep working without any
|
||||
configuration:
|
||||
|
||||
```python
|
||||
prompt_str = tokenizer.apply_chat_template(
|
||||
sample["messages"],
|
||||
tools=meta.tools, # works either way
|
||||
add_generation_prompt=False,
|
||||
tokenize=False,
|
||||
)
|
||||
```
|
||||
|
||||
**The implementations** — runnable Python — will live under
|
||||
`src/lerobot/tools/`, one file per tool. The runtime dispatcher and
|
||||
the canonical `say` implementation (wrapping Kyutai's pocket-tts) are
|
||||
not part of the catalog layer described here; today this layer ships
|
||||
only the schema storage and the `DEFAULT_TOOLS` fallback constant.
|
||||
|
||||
## Per-row tool _invocations_
|
||||
|
||||
The catalog above describes _what can be called_. The actual _call_ — the
|
||||
function name plus the argument values — is stored per-row, on the
|
||||
assistant atoms in `language_events`:
|
||||
|
||||
```python
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": null,
|
||||
"style": null,
|
||||
"timestamp": 12.4,
|
||||
"camera": null,
|
||||
"tool_calls": [
|
||||
{ "type": "function",
|
||||
"function": { "name": "say", "arguments": { "text": "On it." } } }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Recipes splice these into rendered messages via `tool_calls_from`:
|
||||
|
||||
```yaml
|
||||
user_interjection_response:
|
||||
bindings:
|
||||
speech: "emitted_at(t, role=assistant, tool_name=say)"
|
||||
messages:
|
||||
- { role: user, content: "${task}", stream: high_level }
|
||||
- {
|
||||
role: assistant,
|
||||
content: "${current_plan}",
|
||||
stream: high_level,
|
||||
target: true,
|
||||
tool_calls_from: speech,
|
||||
}
|
||||
```
|
||||
|
||||
The model's training target is one assistant turn that carries both the
|
||||
plan text _and_ the `say` tool call. At inference, the runtime parses
|
||||
the generated text back into structured `tool_calls` and dispatches to
|
||||
the matching implementation.
|
||||
|
||||
## How to add your own tool
|
||||
|
||||
> **Note:** Steps 2 and 3 below describe the runtime layer
|
||||
> (`src/lerobot/tools/`, the `Tool` protocol, `TOOL_REGISTRY`,
|
||||
> `get_tools(meta)`) which is not part of the catalog layer shipped
|
||||
> today — those modules don't yet exist in the tree. Step 1 alone is
|
||||
> enough to make the tool visible to the chat template via
|
||||
> `meta.tools` so the model can learn to _generate_ the call;
|
||||
> executing the call at inference requires the runtime layer.
|
||||
|
||||
Three steps. Concrete example: a `record_observation` tool the policy
|
||||
can call to capture an extra observation outside the regular control
|
||||
loop.
|
||||
|
||||
### Step 1 — declare the schema
|
||||
|
||||
Add an entry under `meta/info.json["tools"]`. Either edit the file
|
||||
directly on disk _before_ running the annotation pipeline (it'll be
|
||||
preserved) or hand it to `lerobot-annotate` via a config flag.
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": [
|
||||
{ "type": "function", "function": { "name": "say", "...": "..." } },
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "record_observation",
|
||||
"description": "Capture a high-resolution still image for the user.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"label": {
|
||||
"type": "string",
|
||||
"description": "Short label for the saved image."
|
||||
}
|
||||
},
|
||||
"required": ["label"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The schema follows OpenAI's function-calling convention exactly, so the
|
||||
chat template can render it natively.
|
||||
|
||||
### Step 2 — implement the call
|
||||
|
||||
Create `src/lerobot/tools/record_observation.py`:
|
||||
|
||||
```python
|
||||
from .base import Tool
|
||||
from typing import Any
|
||||
|
||||
RECORD_OBSERVATION_SCHEMA: dict[str, Any] = { "...": "..." } # mirrors the JSON above
|
||||
|
||||
|
||||
class RecordObservationTool:
|
||||
name = "record_observation"
|
||||
schema = RECORD_OBSERVATION_SCHEMA
|
||||
|
||||
def __init__(self, schema: dict | None = None, output_dir: str = "."):
|
||||
self.output_dir = output_dir
|
||||
|
||||
def call(self, arguments: dict) -> str:
|
||||
label = arguments["label"]
|
||||
# ... save the latest camera frame to <output_dir>/<label>.png ...
|
||||
return f"saved {label}.png"
|
||||
```
|
||||
|
||||
One file per tool keeps dependencies isolated — `record_observation`
|
||||
might pull `pillow`, while `say` pulls `pocket-tts`. Users installing
|
||||
only the tools they need avoid heavy transitive deps.
|
||||
|
||||
### Step 3 — register it
|
||||
|
||||
Add to `src/lerobot/tools/registry.py`:
|
||||
|
||||
```python
|
||||
from .record_observation import RecordObservationTool
|
||||
|
||||
TOOL_REGISTRY["record_observation"] = RecordObservationTool
|
||||
```
|
||||
|
||||
That's it. At runtime `get_tools(meta)` looks up each schema in
|
||||
`meta.tools`, instantiates the matching registered class, and returns
|
||||
a name → instance dict the dispatcher can route into.
|
||||
|
||||
If you want to use a tool _without_ writing an implementation (e.g. for
|
||||
training-time chat-template formatting only), step 1 alone is enough —
|
||||
the model still learns to _generate_ the call. Steps 2 and 3 are only
|
||||
needed to actually _execute_ it at inference.
|
||||
@@ -274,8 +274,7 @@ python src/lerobot/scripts/lerobot_train.py \
|
||||
Once trained, we recommend deploying policies using inference-time RTC:
|
||||
|
||||
```bash
|
||||
lerobot-rollout \
|
||||
--strategy.type=base \
|
||||
python examples/rtc/eval_with_real_robot.py \
|
||||
--policy.path=your-username/your-repo-id \
|
||||
--policy.device=cuda \
|
||||
--robot.type=unitree_g1 \
|
||||
|
||||
@@ -1,176 +0,0 @@
|
||||
# VLABench
|
||||
|
||||
[VLABench](https://github.com/OpenMOSS/VLABench) is a large-scale benchmark for **language-conditioned robotic manipulation with long-horizon reasoning**. The upstream suite covers 100 task categories across 2,000+ objects and evaluates six dimensions of robot intelligence: mesh & texture understanding, spatial reasoning, world-knowledge transfer, semantic instruction comprehension, physical-law understanding, and long-horizon planning. Built on MuJoCo / dm_control with a Franka Panda 7-DOF arm. LeRobot exposes **43 of these tasks** through `--env.task` (21 primitives + 22 composites, see [Available tasks](#available-tasks) below).
|
||||
|
||||
- Paper: [VLABench: A Large-Scale Benchmark for Language-Conditioned Robotics Manipulation with Long-Horizon Reasoning](https://arxiv.org/abs/2412.18194)
|
||||
- GitHub: [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench)
|
||||
- Project website: [vlabench.github.io](https://vlabench.github.io)
|
||||
- Pretrained policy: [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench)
|
||||
|
||||
<img
|
||||
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/vlabench.png"
|
||||
alt="VLABench benchmark overview"
|
||||
width="85%"
|
||||
/>
|
||||
|
||||
## Available tasks
|
||||
|
||||
VLABench ships two task suites covering **43 task categories** in LeRobot's `--env.task` surface:
|
||||
|
||||
| Suite | CLI name | Tasks | Description |
|
||||
| --------- | ----------- | ----- | ---------------------------------------------------------------- |
|
||||
| Primitive | `primitive` | 21 | Single / few-skill combinations (select, insert, physics QA) |
|
||||
| Composite | `composite` | 22 | Multi-step reasoning and long-horizon planning (cook, rearrange) |
|
||||
|
||||
**Primitive tasks:** `select_fruit`, `select_toy`, `select_chemistry_tube`, `add_condiment`, `select_book`, `select_painting`, `select_drink`, `insert_flower`, `select_billiards`, `select_ingredient`, `select_mahjong`, `select_poker`, and physical-reasoning tasks (`density_qa`, `friction_qa`, `magnetism_qa`, `reflection_qa`, `simple_cuestick_usage`, `simple_seesaw_usage`, `sound_speed_qa`, `thermal_expansion_qa`, `weight_qa`).
|
||||
|
||||
**Composite tasks:** `cluster_billiards`, `cluster_book`, `cluster_drink`, `cluster_toy`, `cook_dishes`, `cool_drink`, `find_unseen_object`, `get_coffee`, `hammer_nail`, `heat_food`, `make_juice`, `play_mahjong`, `play_math_game`, `play_poker`, `play_snooker`, `rearrange_book`, `rearrange_chemistry_tube`, `set_dining_table`, `set_study_table`, `store_food`, `take_chemistry_experiment`, `use_seesaw_complex`.
|
||||
|
||||
`--env.task` accepts three forms:
|
||||
|
||||
- a single task name (`select_fruit`)
|
||||
- a comma-separated list (`select_fruit,heat_food`)
|
||||
- a suite shortcut (`primitive`, `composite`, or `primitive,composite`)
|
||||
|
||||
## Installation
|
||||
|
||||
VLABench is **not on PyPI** — its only distribution is the [OpenMOSS/VLABench](https://github.com/OpenMOSS/VLABench) GitHub repo — so LeRobot does not expose a `vlabench` extra. Install it manually as an editable clone, alongside the MuJoCo / dm_control pins VLABench needs, then fetch the mesh assets:
|
||||
|
||||
```bash
|
||||
# After following the standard LeRobot installation instructions.
|
||||
|
||||
git clone https://github.com/OpenMOSS/VLABench.git ~/VLABench
|
||||
git clone https://github.com/motion-planning/rrt-algorithms.git ~/rrt-algorithms
|
||||
pip install -e ~/VLABench -e ~/rrt-algorithms
|
||||
pip install "mujoco==3.2.2" "dm-control==1.0.22" \
|
||||
open3d colorlog scikit-learn openai gdown
|
||||
|
||||
python ~/VLABench/scripts/download_assets.py
|
||||
```
|
||||
|
||||
<Tip>
|
||||
VLABench requires Linux (`sys_platform == 'linux'`) and Python 3.10+. Set the MuJoCo rendering backend before running:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## Evaluation
|
||||
|
||||
All eval snippets below mirror the command CI runs (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps VLABench's `image` / `second_image` / `wrist_image` camera keys onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_vlabench` policy was trained on.
|
||||
|
||||
### Single-task evaluation (recommended for quick iteration)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=select_fruit \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Multi-task evaluation
|
||||
|
||||
Pass a comma-separated list of tasks:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=select_fruit,select_toy,add_condiment,heat_food \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Suite-wide evaluation
|
||||
|
||||
Run an entire suite (all 21 primitives or all 22 composites):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=primitive \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
--env.max_parallel_tasks=1 \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
Or both suites:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_vlabench \
|
||||
--env.type=vlabench \
|
||||
--env.task=primitive,composite \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
--env.max_parallel_tasks=1 \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.second_image": "observation.images.camera2", "observation.images.wrist_image": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
**10 episodes per task** for reproducible benchmarking (210 total for the full primitive suite, 220 for composite). Matches the protocol in the VLABench paper.
|
||||
|
||||
## Policy inputs and outputs
|
||||
|
||||
**Observations:**
|
||||
|
||||
- `observation.state` — 7-dim end-effector state (position xyz + Euler xyz + gripper)
|
||||
- `observation.images.image` — front camera, 480×480 HWC uint8
|
||||
- `observation.images.second_image` — second camera, 480×480 HWC uint8
|
||||
- `observation.images.wrist_image` — wrist camera, 480×480 HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(7,))` — 3D position + 3D Euler orientation + 1D gripper.
|
||||
|
||||
## Training
|
||||
|
||||
### Datasets
|
||||
|
||||
Pre-collected VLABench datasets in LeRobot format on the Hub:
|
||||
|
||||
- [`VLABench/vlabench_primitive_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_primitive_ft_lerobot_video) — 5,000 episodes, 128 tasks, 480×480 images.
|
||||
- [`VLABench/vlabench_composite_ft_lerobot_video`](https://huggingface.co/datasets/VLABench/vlabench_composite_ft_lerobot_video) — 5,977 episodes, 167 tasks, 224×224 images.
|
||||
|
||||
### Example training command
|
||||
|
||||
Fine-tune a SmolVLA base on the primitive suite:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.type=smolvla \
|
||||
--policy.repo_id=${HF_USER}/smolvla_vlabench_primitive \
|
||||
--policy.load_vlm_weights=true \
|
||||
--policy.push_to_hub=true \
|
||||
--dataset.repo_id=VLABench/vlabench_primitive_ft_lerobot_video \
|
||||
--env.type=vlabench \
|
||||
--env.task=select_fruit \
|
||||
--output_dir=./outputs/smolvla_vlabench_primitive \
|
||||
--steps=100000 \
|
||||
--batch_size=4 \
|
||||
--eval_freq=5000 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--save_freq=10000
|
||||
```
|
||||
|
||||
## Reproducing published results
|
||||
|
||||
The released checkpoint [`lerobot/smolvla_vlabench`](https://huggingface.co/lerobot/smolvla_vlabench) was trained on the primitive-suite dataset above and is evaluated with the [Single-task](#single-task-evaluation-recommended-for-quick-iteration) / [Suite-wide](#suite-wide-evaluation) commands. CI runs a 10-primitive-task smoke eval (one episode each) on every PR touching the benchmark.
|
||||
@@ -220,7 +220,7 @@ REAL_DIM = 12
|
||||
# Postprocessing: Trim 20D predictions to 12D for deployment
|
||||
```
|
||||
|
||||
See the [action_hub.py](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/action_hub.py) implementation for details.
|
||||
See the [action_hub.py](/home/jade_choghari/robot/lerobot/src/lerobot/policies/xvla/action_hub.py) implementation for details.
|
||||
|
||||
#### Auto Action Mode (Recommended)
|
||||
|
||||
@@ -519,9 +519,9 @@ If you use X-VLA in your research, please cite:
|
||||
|
||||
- [X-VLA Paper](https://arxiv.org/pdf/2510.10274)
|
||||
- [LeRobot Documentation](https://github.com/huggingface/lerobot)
|
||||
- [Action Registry Implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/action_hub.py)
|
||||
- [Processor Implementation](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/processor_xvla.py)
|
||||
- [Model Configuration](https://github.com/huggingface/lerobot/blob/main/src/lerobot/policies/xvla/configuration_xvla.py)
|
||||
- [Action Registry Implementation](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/action_hub.py)
|
||||
- [Processor Implementation](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/processor_xvla.py)
|
||||
- [Model Configuration](https://github.com/huggingface/lerobot/src/lerobot/policies/xvla/configuration_xvla.py)
|
||||
|
||||
## Contributing
|
||||
|
||||
|
||||
@@ -69,7 +69,7 @@ class ComputeProgressShards(PipelineStep):
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from lerobot.rewards.sarm.compute_rabc_weights import (
|
||||
from lerobot.policies.sarm.compute_rabc_weights import (
|
||||
generate_all_frame_indices,
|
||||
interpolate_progress,
|
||||
load_sarm_resources,
|
||||
|
||||
@@ -10,7 +10,7 @@ from lerobot.datasets import LeRobotDataset
|
||||
from lerobot.envs.configs import HILSerlProcessorConfig, HILSerlRobotEnvConfig
|
||||
from lerobot.policies import SACConfig
|
||||
from lerobot.policies.sac.modeling_sac import SACPolicy
|
||||
from lerobot.rewards.classifier.modeling_classifier import Classifier
|
||||
from lerobot.policies.sac.reward_model.modeling_classifier import Classifier
|
||||
from lerobot.rl.buffer import ReplayBuffer
|
||||
from lerobot.rl.gym_manipulator import make_robot_env
|
||||
from lerobot.robots.so_follower import SO100FollowerConfig
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import torch
|
||||
|
||||
from lerobot.datasets import LeRobotDataset
|
||||
from lerobot.rewards import RewardClassifierConfig, make_reward_model, make_reward_pre_post_processors
|
||||
from lerobot.policies import RewardClassifierConfig, make_policy, make_pre_post_processors
|
||||
|
||||
|
||||
def main():
|
||||
@@ -22,10 +22,10 @@ def main():
|
||||
model_name="microsoft/resnet-18",
|
||||
)
|
||||
|
||||
# Make reward model, preprocessor, and optimizer
|
||||
reward_model = make_reward_model(config, dataset_stats=dataset.meta.stats)
|
||||
optimizer = config.get_optimizer_preset().build(reward_model.parameters())
|
||||
preprocessor, _ = make_reward_pre_post_processors(config, dataset_stats=dataset.meta.stats)
|
||||
# Make policy, preprocessor, and optimizer
|
||||
policy = make_policy(config, ds_meta=dataset.meta)
|
||||
optimizer = config.get_optimizer_preset().build(policy.parameters())
|
||||
preprocessor, _ = make_pre_post_processors(policy_cfg=config, dataset_stats=dataset.meta.stats)
|
||||
|
||||
classifier_id = "<user>/reward_classifier_hil_serl_example"
|
||||
|
||||
@@ -42,7 +42,7 @@ def main():
|
||||
batch = preprocessor(batch)
|
||||
|
||||
# Forward pass
|
||||
loss, output_dict = reward_model.forward(batch)
|
||||
loss, output_dict = policy.forward(batch)
|
||||
|
||||
# Backward pass and optimization
|
||||
optimizer.zero_grad()
|
||||
@@ -58,8 +58,8 @@ def main():
|
||||
|
||||
print("Training finished!")
|
||||
|
||||
# You can now save the trained reward model.
|
||||
reward_model.push_to_hub(classifier_id)
|
||||
# You can now save the trained policy.
|
||||
policy.push_to_hub(classifier_id)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
+2
-17
@@ -95,7 +95,7 @@ dependencies = [
|
||||
|
||||
# ── Feature-scoped extras ──────────────────────────────────
|
||||
dataset = [
|
||||
"datasets>=4.7.0,<5.0.0",
|
||||
"datasets>=4.0.0,<5.0.0",
|
||||
"pandas>=2.0.0,<3.0.0", # NOTE: Transitive dependency of datasets
|
||||
"pyarrow>=21.0.0,<30.0.0", # NOTE: Transitive dependency of datasets
|
||||
"lerobot[av-dep]",
|
||||
@@ -128,7 +128,7 @@ dataset_viz = ["lerobot[dataset]", "lerobot[viz]"]
|
||||
av-dep = ["av>=15.0.0,<16.0.0"]
|
||||
pygame-dep = ["pygame>=2.5.1,<2.7.0"]
|
||||
placo-dep = ["placo>=0.9.6,<0.9.17"]
|
||||
transformers-dep = ["transformers>=5.4.0,<5.6.0"]
|
||||
transformers-dep = ["transformers==5.3.0"] # TODO(Steven): https://github.com/huggingface/lerobot/pull/3249
|
||||
grpcio-dep = ["grpcio==1.73.1", "protobuf>=6.31.1,<6.32.0"]
|
||||
can-dep = ["python-can>=4.2.0,<5.0.0"]
|
||||
peft-dep = ["peft>=0.18.0,<1.0.0"]
|
||||
@@ -194,7 +194,6 @@ groot = [
|
||||
]
|
||||
sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
xvla = ["lerobot[transformers-dep]"]
|
||||
eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
|
||||
|
||||
# Features
|
||||
@@ -213,20 +212,6 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
|
||||
pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
|
||||
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
|
||||
metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
|
||||
# NOTE: vlabench is NOT exposed as a `lerobot` extra. Its only distribution
|
||||
# is the OpenMOSS/VLABench GitHub repo (package name `VLABench`, no PyPI
|
||||
# release), so any `vlabench>=X` pip spec is unresolvable. Install it
|
||||
# manually alongside MuJoCo / dm-control — see docs/source/vlabench.mdx
|
||||
# for the recipe.
|
||||
# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2
|
||||
# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't
|
||||
# resolve into a single env. Install it only in the RoboMME Docker image
|
||||
# via `uv pip install --override` (see docker/Dockerfile.benchmark.robomme).
|
||||
# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins
|
||||
# `lerobot==0.3.3` in install_requires, which cyclically shadows our own
|
||||
# workspace `lerobot` and makes the graph unsolvable under any resolver
|
||||
# (uv, pip). Install it manually alongside robosuite — see
|
||||
# docs/source/robocasa.mdx for the recipe.
|
||||
|
||||
# All
|
||||
all = [
|
||||
|
||||
@@ -31,23 +31,9 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# LIBERO-plus derives task.language by space-joining the perturbation-variant
|
||||
# filename (grab_language_from_filename in libero/libero/benchmark/__init__.py),
|
||||
# so non-_language_ variants inherit a trailing metadata blob like
|
||||
# "view 0 0 100 0 0 initstate 0 noise 45" or "add 16". Strip those tokens so
|
||||
# the description matches the base instruction used in the training dataset.
|
||||
_LIBERO_PERTURBATION_TAIL_RE = re.compile(
|
||||
r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
|
||||
)
|
||||
|
||||
|
||||
def _strip_libero_perturbation_tail(instruction: str) -> str:
|
||||
return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip()
|
||||
|
||||
|
||||
def _libero_descriptions(task_suite: str) -> dict[str, str]:
|
||||
from libero.libero import benchmark # type: ignore[import-untyped]
|
||||
@@ -61,10 +47,7 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]:
|
||||
)
|
||||
return {}
|
||||
suite = suite_dict[task_suite]()
|
||||
return {
|
||||
f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language)
|
||||
for i in range(suite.n_tasks)
|
||||
}
|
||||
return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
|
||||
|
||||
|
||||
def _metaworld_descriptions(task_name: str) -> dict[str, str]:
|
||||
@@ -74,120 +57,19 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
|
||||
return {f"{task_name}_0": label}
|
||||
|
||||
|
||||
def _robotwin_descriptions(task_names: str) -> dict[str, str]:
|
||||
"""Return descriptions for each requested RoboTwin task. Reads
|
||||
`description/task_instruction/<task>.json` from the RoboTwin clone
|
||||
(cwd is /opt/robotwin in CI). Falls back to the task name if missing."""
|
||||
out: dict[str, str] = {}
|
||||
root = Path("description/task_instruction")
|
||||
for name in (t.strip() for t in task_names.split(",") if t.strip()):
|
||||
desc_file = root / f"{name}.json"
|
||||
desc = name.replace("_", " ")
|
||||
if desc_file.is_file():
|
||||
data = json.loads(desc_file.read_text())
|
||||
full = data.get("full_description") or desc
|
||||
# Strip the schema placeholders ({A}, {a}) — keep the sentence readable.
|
||||
desc = full.replace("<", "").replace(">", "")
|
||||
out[f"{name}_0"] = desc
|
||||
return out
|
||||
|
||||
|
||||
def _robocasa_descriptions(task_spec: str) -> dict[str, str]:
|
||||
"""For each task in the comma-separated list, emit a cleaned-name label.
|
||||
|
||||
RoboCasa episodes carry their language instruction in the env's
|
||||
`ep_meta['lang']`, populated per reset. Pulling it requires spinning
|
||||
up the full kitchen env per task (~seconds each); we use the task
|
||||
name as the key here and let the eval's episode info carry the
|
||||
actual instruction.
|
||||
"""
|
||||
out: dict[str, str] = {}
|
||||
for task in (t.strip() for t in task_spec.split(",") if t.strip()):
|
||||
# Split CamelCase into words: "CloseFridge" → "close fridge".
|
||||
label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
|
||||
out[f"{task}_0"] = label or task
|
||||
return out
|
||||
|
||||
|
||||
_ROBOMME_DESCRIPTIONS = {
|
||||
"BinFill": "Fill the target bin with the correct number of cubes",
|
||||
"PickXtimes": "Pick the indicated cube the specified number of times",
|
||||
"SwingXtimes": "Swing the object the specified number of times",
|
||||
"StopCube": "Grasp and stop the moving cube",
|
||||
"VideoUnmask": "Pick the cube shown in the reference video",
|
||||
"VideoUnmaskSwap": "Pick the cube matching the reference video after a swap",
|
||||
"ButtonUnmask": "Press the button indicated by the reference",
|
||||
"ButtonUnmaskSwap": "Press the correct button after objects are swapped",
|
||||
"PickHighlight": "Pick the highlighted cube",
|
||||
"VideoRepick": "Repick the cube shown in the reference video",
|
||||
"VideoPlaceButton": "Place the cube on the button shown in the video",
|
||||
"VideoPlaceOrder": "Place cubes in the order shown in the video",
|
||||
"MoveCube": "Move the cube to the target location",
|
||||
"InsertPeg": "Insert the peg into the target hole",
|
||||
"PatternLock": "Unlock the pattern by pressing buttons in sequence",
|
||||
"RouteStick": "Route the stick through the required waypoints",
|
||||
}
|
||||
|
||||
|
||||
def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> dict[str, str]:
|
||||
"""Return descriptions for each requested RoboMME task. Keys match the
|
||||
video filename pattern `<task>_<task_id>` used by the eval script."""
|
||||
if task_ids is None:
|
||||
task_ids = [0]
|
||||
out: dict[str, str] = {}
|
||||
for name in (t.strip() for t in task_names.split(",") if t.strip()):
|
||||
desc = _ROBOMME_DESCRIPTIONS.get(name, name)
|
||||
for tid in task_ids:
|
||||
out[f"{name}_{tid}"] = desc
|
||||
return out
|
||||
|
||||
|
||||
def _vlabench_descriptions(task_spec: str) -> dict[str, str]:
|
||||
"""For each task in the comma-separated list, emit a cleaned-name label.
|
||||
|
||||
VLABench tasks carry language instructions on their dm_control task
|
||||
object, but pulling them requires loading the full env per task
|
||||
(~seconds each). The CI smoke-eval already captures the instruction
|
||||
inside its episode info; this mapping is just enough to key
|
||||
`metrics.json` by `<task>_0`.
|
||||
"""
|
||||
out: dict[str, str] = {}
|
||||
for task in (t.strip() for t in task_spec.split(",") if t.strip()):
|
||||
out[f"{task}_0"] = task.replace("_", " ").strip()
|
||||
return out
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
|
||||
parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
|
||||
parser.add_argument(
|
||||
"--task-ids",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Comma-separated task IDs (e.g. '0,1,2'). Default: [0]",
|
||||
)
|
||||
parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
|
||||
args = parser.parse_args()
|
||||
|
||||
task_ids: list[int] | None = None
|
||||
if args.task_ids:
|
||||
task_ids = [int(x.strip()) for x in args.task_ids.split(",")]
|
||||
|
||||
descriptions: dict[str, str] = {}
|
||||
try:
|
||||
if args.env == ("libero", "libero_plus"):
|
||||
if args.env == "libero":
|
||||
descriptions = _libero_descriptions(args.task)
|
||||
elif args.env == "metaworld":
|
||||
descriptions = _metaworld_descriptions(args.task)
|
||||
elif args.env == "robotwin":
|
||||
descriptions = _robotwin_descriptions(args.task)
|
||||
elif args.env == "robocasa":
|
||||
descriptions = _robocasa_descriptions(args.task)
|
||||
elif args.env == "robomme":
|
||||
descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
|
||||
elif args.env == "vlabench":
|
||||
descriptions = _vlabench_descriptions(args.task)
|
||||
else:
|
||||
print(
|
||||
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
|
||||
|
||||
@@ -17,7 +17,6 @@ Provides the RealSenseCamera class for capturing frames from Intel RealSense cam
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from threading import Event, Lock, Thread
|
||||
from typing import TYPE_CHECKING, Any
|
||||
@@ -42,7 +41,6 @@ from ..utils import get_cv2_rotation
|
||||
from .configuration_realsense import RealSenseCameraConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
pkg_name = "pyrealsense2-macosx" if sys.platform == "darwin" else "pyrealsense2"
|
||||
|
||||
|
||||
class RealSenseCamera(Camera):
|
||||
@@ -116,7 +114,7 @@ class RealSenseCamera(Camera):
|
||||
Args:
|
||||
config: The configuration settings for the camera.
|
||||
"""
|
||||
require_package(pkg_name, extra="intelrealsense", import_name="pyrealsense2")
|
||||
require_package("pyrealsense2", extra="intelrealsense")
|
||||
super().__init__(config)
|
||||
|
||||
self.config = config
|
||||
|
||||
@@ -41,12 +41,8 @@ def cfg_to_group(
|
||||
return tag
|
||||
return tag[:max_tag_length]
|
||||
|
||||
if cfg.is_reward_model_training:
|
||||
trainable_tag = f"reward_model:{cfg.reward_model.type}"
|
||||
else:
|
||||
trainable_tag = f"policy:{cfg.policy.type}"
|
||||
lst = [
|
||||
trainable_tag,
|
||||
f"policy:{cfg.policy.type}",
|
||||
f"seed:{cfg.seed}",
|
||||
]
|
||||
if cfg.dataset is not None:
|
||||
|
||||
@@ -24,7 +24,6 @@ Import them directly: ``from lerobot.configs.train import TrainPipelineConfig``
|
||||
from .dataset import DatasetRecordConfig
|
||||
from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig
|
||||
from .policies import PreTrainedConfig
|
||||
from .recipe import MessageTurn, TrainingRecipe, load_recipe
|
||||
from .types import (
|
||||
FeatureType,
|
||||
NormalizationMode,
|
||||
@@ -44,10 +43,7 @@ __all__ = [
|
||||
"DatasetRecordConfig",
|
||||
"DatasetConfig",
|
||||
"EvalConfig",
|
||||
"MessageTurn",
|
||||
"PeftConfig",
|
||||
"PreTrainedConfig",
|
||||
"TrainingRecipe",
|
||||
"WandBConfig",
|
||||
"load_recipe",
|
||||
]
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
"""Shared dataset recording configuration used by both ``lerobot-record`` and ``lerobot-rollout``."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
@@ -68,13 +68,10 @@ class DatasetRecordConfig:
|
||||
# Number of threads per encoder instance. None = auto (codec default).
|
||||
# Lower values reduce CPU usage, maps to 'lp' (via svtav1-params) for libsvtav1 and 'threads' for h264/hevc..
|
||||
encoder_threads: int | None = None
|
||||
# Rename map for the observation to override the image and state keys
|
||||
rename_map: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
def stamp_repo_id(self) -> None:
|
||||
"""Append a date-time tag to ``repo_id`` so each recording session gets a unique name.
|
||||
|
||||
Must be called explicitly at dataset *creation* time — not on resume,
|
||||
where the existing ``repo_id`` (already stamped) must be preserved.
|
||||
"""
|
||||
def __post_init__(self) -> None:
|
||||
if self.repo_id:
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
self.repo_id = f"{self.repo_id}_{timestamp}"
|
||||
|
||||
@@ -1,206 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, get_args
|
||||
|
||||
MessageRole = Literal["user", "assistant", "system", "tool"]
|
||||
MessageStream = Literal["high_level", "low_level"]
|
||||
|
||||
DEFAULT_BINDINGS = {
|
||||
"subtask": "active_at(t, style=subtask)",
|
||||
"memory": "active_at(t, style=memory)",
|
||||
"plan": "active_at(t, style=plan)",
|
||||
"speech": "emitted_at(t, role=assistant, tool_name=say)",
|
||||
"interjection": "emitted_at(t, style=interjection)",
|
||||
"vqa": "emitted_at(t, style=vqa, role=assistant)",
|
||||
"vqa_query": "emitted_at(t, style=vqa, role=user)",
|
||||
}
|
||||
|
||||
PLACEHOLDER_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
||||
"""``${name}`` placeholder pattern used by both recipe binding-reference
|
||||
discovery (here) and rendered-message substitution (in ``language_render``)."""
|
||||
|
||||
_VALID_ROLES = frozenset(get_args(MessageRole))
|
||||
_VALID_STREAMS = frozenset(get_args(MessageStream))
|
||||
|
||||
|
||||
@dataclass
|
||||
class MessageTurn:
|
||||
"""A single chat-style turn in a recipe template.
|
||||
|
||||
``content`` may be a plain string, a list of HF-style multimodal blocks, or
|
||||
``None`` when ``tool_calls_from`` supplies tool-call payloads instead.
|
||||
``stream`` tags the turn for downstream filtering, ``target`` flags it as a
|
||||
training target, and ``if_present`` skips the turn when the named binding
|
||||
resolves to ``None``.
|
||||
"""
|
||||
|
||||
role: MessageRole
|
||||
content: str | list[dict[str, Any]] | None = None
|
||||
stream: MessageStream | None = None
|
||||
target: bool = False
|
||||
if_present: str | None = None
|
||||
tool_calls_from: str | None = None
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Validate role, stream, and content after dataclass construction."""
|
||||
if self.role not in _VALID_ROLES:
|
||||
raise ValueError(f"Unsupported message role: {self.role!r}")
|
||||
# ``stream`` is typed Optional only so the dataclass can keep its
|
||||
# field ordering, but recipes must always tag every turn with a
|
||||
# stream — the renderer's ``_validate_rendered`` would reject
|
||||
# ``None`` later on. Fail at construction so the bad recipe is
|
||||
# caught at YAML load time rather than at the first sample.
|
||||
if self.stream is None:
|
||||
raise ValueError(
|
||||
f"MessageTurn(role={self.role!r}) is missing a stream — "
|
||||
f"every turn must declare one of {sorted(_VALID_STREAMS)}."
|
||||
)
|
||||
if self.stream not in _VALID_STREAMS:
|
||||
raise ValueError(f"Unsupported message stream: {self.stream!r}")
|
||||
if self.content is None and self.tool_calls_from is None:
|
||||
raise ValueError("MessageTurn.content is required unless tool_calls_from is set.")
|
||||
if self.content is not None and not isinstance(self.content, (str, list)):
|
||||
raise TypeError("MessageTurn.content must be a string, a list of HF-style blocks, or None.")
|
||||
if isinstance(self.content, list):
|
||||
for block in self.content:
|
||||
if not isinstance(block, dict) or "type" not in block:
|
||||
raise ValueError(
|
||||
"Multimodal content blocks must be HF-style dictionaries with a type key."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict[str, Any]) -> MessageTurn:
|
||||
"""Construct a :class:`MessageTurn` from a plain dictionary."""
|
||||
return cls(**data)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainingRecipe:
|
||||
"""A recipe describing how to render training samples from language rows.
|
||||
|
||||
A recipe is either a *message recipe* (``messages`` plus optional
|
||||
``bindings``) or a *blend recipe* (``blend`` mapping names to weighted
|
||||
sub-recipes). ``weight`` is only meaningful inside a blend.
|
||||
"""
|
||||
|
||||
messages: list[MessageTurn] | None = None
|
||||
bindings: dict[str, str] | None = None
|
||||
blend: dict[str, TrainingRecipe] | None = None
|
||||
weight: float | None = None
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Validate that exactly one of ``messages`` or ``blend`` is set."""
|
||||
if self.messages is not None and self.blend is not None:
|
||||
raise ValueError("TrainingRecipe must set only one of messages or blend.")
|
||||
if self.messages is None and self.blend is None:
|
||||
raise ValueError("TrainingRecipe must set one of messages or blend.")
|
||||
|
||||
if self.messages is not None:
|
||||
self._validate_message_recipe()
|
||||
if self.blend is not None:
|
||||
self._validate_blend_recipe()
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict[str, Any]) -> TrainingRecipe:
|
||||
"""Construct a :class:`TrainingRecipe` from a nested dictionary."""
|
||||
data = dict(data)
|
||||
if data.get("messages") is not None:
|
||||
data["messages"] = [
|
||||
turn if isinstance(turn, MessageTurn) else MessageTurn.from_dict(turn)
|
||||
for turn in data["messages"]
|
||||
]
|
||||
if data.get("blend") is not None:
|
||||
data["blend"] = {
|
||||
name: recipe if isinstance(recipe, TrainingRecipe) else cls.from_dict(recipe)
|
||||
for name, recipe in data["blend"].items()
|
||||
}
|
||||
return cls(**data)
|
||||
|
||||
@classmethod
|
||||
def from_yaml(cls, path: str | Path) -> TrainingRecipe:
|
||||
"""Load a :class:`TrainingRecipe` from a YAML file at ``path``."""
|
||||
import yaml # type: ignore[import-untyped]
|
||||
|
||||
with open(path) as f:
|
||||
data = yaml.safe_load(f)
|
||||
if not isinstance(data, dict):
|
||||
raise ValueError(f"Recipe YAML must contain a mapping at the top level: {path}")
|
||||
return cls.from_dict(data)
|
||||
|
||||
def _validate_message_recipe(self) -> None:
|
||||
"""Ensure every templated binding is known and at least one turn is a target."""
|
||||
assert self.messages is not None
|
||||
known_bindings = set(DEFAULT_BINDINGS) | set(self.bindings or {}) | {"task"}
|
||||
|
||||
for turn in self.messages:
|
||||
missing = self._referenced_bindings(turn) - known_bindings
|
||||
if missing:
|
||||
raise ValueError(f"MessageTurn references unknown binding(s): {sorted(missing)}")
|
||||
|
||||
if not any(turn.target for turn in self.messages):
|
||||
raise ValueError("Message recipes must contain at least one target turn.")
|
||||
|
||||
def _validate_blend_recipe(self) -> None:
|
||||
"""Ensure each blend component is a non-empty, weighted message recipe."""
|
||||
assert self.blend is not None
|
||||
if not self.blend:
|
||||
raise ValueError("Blend recipes must contain at least one component.")
|
||||
|
||||
for name, recipe in self.blend.items():
|
||||
if recipe.blend is not None:
|
||||
raise ValueError(f"Blend component {name!r} cannot itself define a blend.")
|
||||
if recipe.messages is None:
|
||||
raise ValueError(f"Blend component {name!r} must define messages.")
|
||||
if recipe.weight is None:
|
||||
raise ValueError(f"Blend component {name!r} must define weight.")
|
||||
if recipe.weight <= 0:
|
||||
raise ValueError(f"Blend component {name!r} must have a positive weight.")
|
||||
|
||||
def _referenced_bindings(self, turn: MessageTurn) -> set[str]:
|
||||
"""Return the binding names that ``turn`` references via placeholders or attributes."""
|
||||
names: set[str] = set()
|
||||
if turn.if_present is not None:
|
||||
names.add(turn.if_present)
|
||||
if turn.tool_calls_from is not None:
|
||||
names.add(turn.tool_calls_from)
|
||||
names.update(_placeholders_in_content(turn.content))
|
||||
return names
|
||||
|
||||
|
||||
def _placeholders_in_content(content: str | list[dict[str, Any]] | None) -> set[str]:
|
||||
"""Return the set of ``${name}`` placeholders found anywhere in ``content``."""
|
||||
if content is None:
|
||||
return set()
|
||||
if isinstance(content, str):
|
||||
return set(PLACEHOLDER_RE.findall(content))
|
||||
|
||||
names: set[str] = set()
|
||||
for block in content:
|
||||
for value in block.values():
|
||||
if isinstance(value, str):
|
||||
names.update(PLACEHOLDER_RE.findall(value))
|
||||
return names
|
||||
|
||||
|
||||
def load_recipe(path: str | Path) -> TrainingRecipe:
|
||||
"""Load a :class:`TrainingRecipe` from a YAML file at ``path``."""
|
||||
return TrainingRecipe.from_yaml(path)
|
||||
@@ -1,163 +0,0 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import abc
|
||||
import builtins
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, TypeVar
|
||||
|
||||
import draccus
|
||||
from huggingface_hub import hf_hub_download
|
||||
from huggingface_hub.constants import CONFIG_NAME
|
||||
from huggingface_hub.errors import HfHubHTTPError
|
||||
|
||||
from lerobot.configs.types import PolicyFeature
|
||||
from lerobot.optim.optimizers import OptimizerConfig
|
||||
from lerobot.optim.schedulers import LRSchedulerConfig
|
||||
from lerobot.utils.device_utils import auto_select_torch_device, is_torch_device_available
|
||||
from lerobot.utils.hub import HubMixin
|
||||
|
||||
T = TypeVar("T", bound="RewardModelConfig")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RewardModelConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
|
||||
"""Base configuration for reward models.
|
||||
|
||||
Args:
|
||||
input_features: A dictionary defining the PolicyFeature of the input data for the reward. The key represents
|
||||
the input data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
|
||||
output_features: A dictionary defining the PolicyFeature of the output data for the reward. The key represents
|
||||
the output data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
|
||||
"""
|
||||
|
||||
# Reuses PolicyFeature
|
||||
input_features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||
output_features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||
|
||||
device: str | None = None
|
||||
|
||||
pretrained_path: str | None = None
|
||||
|
||||
push_to_hub: bool = False
|
||||
repo_id: str | None = None
|
||||
|
||||
# Hub metadata
|
||||
license: str | None = None
|
||||
tags: list[str] | None = None
|
||||
private: bool | None = None
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if not self.device or not is_torch_device_available(self.device):
|
||||
auto_device = auto_select_torch_device()
|
||||
logger.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.")
|
||||
self.device = auto_device.type
|
||||
|
||||
@property
|
||||
def type(self) -> str:
|
||||
choice_name = self.get_choice_name(self.__class__)
|
||||
if not isinstance(choice_name, str):
|
||||
raise TypeError(f"Expected string from get_choice_name, got {type(choice_name)}")
|
||||
return choice_name
|
||||
|
||||
@property
|
||||
def observation_delta_indices(self) -> list | None: # type: ignore[type-arg]
|
||||
return None
|
||||
|
||||
@property
|
||||
def action_delta_indices(self) -> list | None: # type: ignore[type-arg]
|
||||
return None
|
||||
|
||||
@property
|
||||
def reward_delta_indices(self) -> list | None: # type: ignore[type-arg]
|
||||
return None
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_optimizer_preset(self) -> OptimizerConfig:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_scheduler_preset(self) -> LRSchedulerConfig | None:
|
||||
return None
|
||||
|
||||
def validate_features(self) -> None:
|
||||
pass
|
||||
|
||||
def _save_pretrained(self, save_directory: Path) -> None:
|
||||
with open(save_directory / CONFIG_NAME, "w") as f, draccus.config_type("json"):
|
||||
draccus.dump(self, f, indent=4)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls: builtins.type[T],
|
||||
pretrained_name_or_path: str | Path,
|
||||
*,
|
||||
force_download: bool = False,
|
||||
resume_download: bool | None = None,
|
||||
proxies: dict[Any, Any] | None = None,
|
||||
token: str | bool | None = None,
|
||||
cache_dir: str | Path | None = None,
|
||||
local_files_only: bool = False,
|
||||
revision: str | None = None,
|
||||
**reward_kwargs: Any,
|
||||
) -> T:
|
||||
model_id = str(pretrained_name_or_path)
|
||||
config_file: str | None = None
|
||||
if Path(model_id).is_dir():
|
||||
if CONFIG_NAME in os.listdir(model_id):
|
||||
config_file = os.path.join(model_id, CONFIG_NAME)
|
||||
else:
|
||||
logger.error(f"{CONFIG_NAME} not found in {Path(model_id).resolve()}")
|
||||
else:
|
||||
try:
|
||||
config_file = hf_hub_download(
|
||||
repo_id=model_id,
|
||||
filename=CONFIG_NAME,
|
||||
revision=revision,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
resume_download=resume_download,
|
||||
token=token,
|
||||
local_files_only=local_files_only,
|
||||
)
|
||||
except HfHubHTTPError as e:
|
||||
raise FileNotFoundError(
|
||||
f"{CONFIG_NAME} not found on the HuggingFace Hub in {model_id}"
|
||||
) from e
|
||||
|
||||
if config_file is None:
|
||||
raise FileNotFoundError(f"{CONFIG_NAME} not found in {model_id}")
|
||||
|
||||
# HACK: Parse the original config to get the config subclass, so that we can
|
||||
# apply cli overrides.
|
||||
with draccus.config_type("json"):
|
||||
orig_config = draccus.parse(cls, config_file, args=[])
|
||||
|
||||
with open(config_file) as f:
|
||||
config = json.load(f)
|
||||
|
||||
config.pop("type", None)
|
||||
with tempfile.NamedTemporaryFile("w+", delete=False, suffix=".json") as f:
|
||||
json.dump(config, f)
|
||||
config_file = f.name
|
||||
|
||||
cli_overrides = reward_kwargs.pop("cli_overrides", [])
|
||||
with draccus.config_type("json"):
|
||||
return draccus.parse(orig_config.__class__, config_file, args=cli_overrides)
|
||||
@@ -13,9 +13,7 @@
|
||||
# limitations under the License.
|
||||
import builtins
|
||||
import datetime as dt
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -28,57 +26,18 @@ from lerobot import envs
|
||||
from lerobot.configs import parser
|
||||
from lerobot.optim import LRSchedulerConfig, OptimizerConfig
|
||||
from lerobot.utils.hub import HubMixin
|
||||
from lerobot.utils.sample_weighting import SampleWeightingConfig
|
||||
|
||||
from .default import DatasetConfig, EvalConfig, PeftConfig, WandBConfig
|
||||
from .policies import PreTrainedConfig
|
||||
from .rewards import RewardModelConfig
|
||||
|
||||
TRAIN_CONFIG_NAME = "train_config.json"
|
||||
|
||||
|
||||
def _migrate_legacy_rabc_fields(config: dict[str, Any]) -> dict[str, Any] | None:
|
||||
"""Return migrated payload for legacy RA-BC fields, or None when no migration is needed."""
|
||||
legacy_fields = (
|
||||
"use_rabc",
|
||||
"rabc_progress_path",
|
||||
"rabc_kappa",
|
||||
"rabc_epsilon",
|
||||
"rabc_head_mode",
|
||||
)
|
||||
if not any(key in config for key in legacy_fields):
|
||||
return None
|
||||
|
||||
migrated_config = dict(config)
|
||||
use_rabc = bool(migrated_config.pop("use_rabc", False))
|
||||
rabc_progress_path = migrated_config.pop("rabc_progress_path", None)
|
||||
rabc_kappa = migrated_config.pop("rabc_kappa", None)
|
||||
rabc_epsilon = migrated_config.pop("rabc_epsilon", None)
|
||||
rabc_head_mode = migrated_config.pop("rabc_head_mode", None)
|
||||
|
||||
# New configs may already define sample_weighting explicitly. In that case,
|
||||
# legacy fields are ignored after being stripped from the payload.
|
||||
if migrated_config.get("sample_weighting") is None and use_rabc:
|
||||
sample_weighting: dict[str, Any] = {"type": "rabc"}
|
||||
if rabc_progress_path is not None:
|
||||
sample_weighting["progress_path"] = rabc_progress_path
|
||||
if rabc_kappa is not None:
|
||||
sample_weighting["kappa"] = rabc_kappa
|
||||
if rabc_epsilon is not None:
|
||||
sample_weighting["epsilon"] = rabc_epsilon
|
||||
if rabc_head_mode is not None:
|
||||
sample_weighting["head_mode"] = rabc_head_mode
|
||||
migrated_config["sample_weighting"] = sample_weighting
|
||||
|
||||
return migrated_config
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainPipelineConfig(HubMixin):
|
||||
dataset: DatasetConfig
|
||||
env: envs.EnvConfig | None = None
|
||||
policy: PreTrainedConfig | None = None
|
||||
reward_model: RewardModelConfig | None = None
|
||||
# Set `dir` to where you would like to save all of the run outputs. If you run another training session
|
||||
# with the same value for `dir` its contents will be overwritten unless you set `resume` to true.
|
||||
output_dir: Path | None = None
|
||||
@@ -113,41 +72,27 @@ class TrainPipelineConfig(HubMixin):
|
||||
wandb: WandBConfig = field(default_factory=WandBConfig)
|
||||
peft: PeftConfig | None = None
|
||||
|
||||
# Sample weighting configuration (e.g., for RA-BC training)
|
||||
sample_weighting: SampleWeightingConfig | None = None
|
||||
# RA-BC (Reward-Aligned Behavior Cloning) parameters
|
||||
use_rabc: bool = False # Enable reward-weighted training
|
||||
rabc_progress_path: str | None = None # Path to precomputed SARM progress parquet file
|
||||
rabc_kappa: float = 0.01 # Hard threshold for high-quality samples
|
||||
rabc_epsilon: float = 1e-6 # Small constant for numerical stability
|
||||
rabc_head_mode: str | None = "sparse" # For dual-head models: "sparse" or "dense"
|
||||
|
||||
# Rename map for the observation to override the image and state keys
|
||||
rename_map: dict[str, str] = field(default_factory=dict)
|
||||
checkpoint_path: Path | None = field(init=False, default=None)
|
||||
|
||||
@property
|
||||
def is_reward_model_training(self) -> bool:
|
||||
"""True when the config targets a reward model rather than a policy."""
|
||||
return self.reward_model is not None
|
||||
|
||||
@property
|
||||
def trainable_config(self) -> PreTrainedConfig | RewardModelConfig:
|
||||
"""Return whichever config (policy or reward_model) is active."""
|
||||
if self.is_reward_model_training:
|
||||
return self.reward_model # type: ignore[return-value]
|
||||
return self.policy # type: ignore[return-value]
|
||||
|
||||
def validate(self) -> None:
|
||||
# HACK: We parse again the cli args here to get the pretrained paths if there was some.
|
||||
policy_path = parser.get_path_arg("policy")
|
||||
reward_model_path = parser.get_path_arg("reward_model")
|
||||
|
||||
if reward_model_path:
|
||||
cli_overrides = parser.get_cli_overrides("reward_model")
|
||||
self.reward_model = RewardModelConfig.from_pretrained(
|
||||
reward_model_path, cli_overrides=cli_overrides
|
||||
)
|
||||
self.reward_model.pretrained_path = str(Path(reward_model_path))
|
||||
elif policy_path:
|
||||
if policy_path:
|
||||
# Only load the policy config
|
||||
cli_overrides = parser.get_cli_overrides("policy")
|
||||
self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
|
||||
self.policy.pretrained_path = Path(policy_path)
|
||||
elif self.resume:
|
||||
# The entire train config is already loaded, we just need to get the checkpoint dir
|
||||
config_path = parser.parse_arg("config_path")
|
||||
if not config_path:
|
||||
raise ValueError(
|
||||
@@ -163,22 +108,18 @@ class TrainPipelineConfig(HubMixin):
|
||||
policy_dir = Path(config_path).parent
|
||||
if self.policy is not None:
|
||||
self.policy.pretrained_path = policy_dir
|
||||
if self.reward_model is not None:
|
||||
self.reward_model.pretrained_path = str(policy_dir)
|
||||
self.checkpoint_path = policy_dir.parent
|
||||
|
||||
if self.policy is None and self.reward_model is None:
|
||||
if self.policy is None:
|
||||
raise ValueError(
|
||||
"Neither policy nor reward_model is configured. "
|
||||
"Please specify one with `--policy.path` or `--reward_model.path`."
|
||||
"Policy is not configured. Please specify a pretrained policy with `--policy.path`."
|
||||
)
|
||||
|
||||
active_cfg = self.trainable_config
|
||||
if not self.job_name:
|
||||
if self.env is None:
|
||||
self.job_name = f"{active_cfg.type}"
|
||||
self.job_name = f"{self.policy.type}"
|
||||
else:
|
||||
self.job_name = f"{self.env.type}_{active_cfg.type}"
|
||||
self.job_name = f"{self.env.type}_{self.policy.type}"
|
||||
|
||||
if not self.resume and isinstance(self.output_dir, Path) and self.output_dir.is_dir():
|
||||
raise FileExistsError(
|
||||
@@ -196,16 +137,26 @@ class TrainPipelineConfig(HubMixin):
|
||||
if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
|
||||
raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
|
||||
elif self.use_policy_training_preset and not self.resume:
|
||||
self.optimizer = active_cfg.get_optimizer_preset()
|
||||
self.scheduler = active_cfg.get_scheduler_preset()
|
||||
self.optimizer = self.policy.get_optimizer_preset()
|
||||
self.scheduler = self.policy.get_scheduler_preset()
|
||||
|
||||
if hasattr(active_cfg, "push_to_hub") and active_cfg.push_to_hub and not active_cfg.repo_id:
|
||||
raise ValueError("'repo_id' argument missing. Please specify it to push the model to the hub.")
|
||||
if self.policy.push_to_hub and not self.policy.repo_id:
|
||||
raise ValueError(
|
||||
"'policy.repo_id' argument missing. Please specify it to push the model to the hub."
|
||||
)
|
||||
|
||||
if self.use_rabc and not self.rabc_progress_path:
|
||||
# Auto-detect from dataset path
|
||||
repo_id = self.dataset.repo_id
|
||||
if self.dataset.root:
|
||||
self.rabc_progress_path = str(Path(self.dataset.root) / "sarm_progress.parquet")
|
||||
else:
|
||||
self.rabc_progress_path = f"hf://datasets/{repo_id}/sarm_progress.parquet"
|
||||
|
||||
@classmethod
|
||||
def __get_path_fields__(cls) -> list[str]:
|
||||
"""Keys for draccus pretrained-path loading."""
|
||||
return ["policy", "reward_model"]
|
||||
"""This enables the parser to load config from the policy using `--policy.path=local/dir`"""
|
||||
return ["policy"]
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return draccus.encode(self) # type: ignore[no-any-return] # because of the third-party library draccus uses Any as the return type
|
||||
@@ -256,15 +207,6 @@ class TrainPipelineConfig(HubMixin):
|
||||
) from e
|
||||
|
||||
cli_args = kwargs.pop("cli_args", [])
|
||||
if config_file is not None:
|
||||
with open(config_file) as f:
|
||||
config = json.load(f)
|
||||
migrated_config = _migrate_legacy_rabc_fields(config)
|
||||
if migrated_config is not None:
|
||||
with tempfile.NamedTemporaryFile("w+", delete=False, suffix=".json") as f:
|
||||
json.dump(migrated_config, f)
|
||||
config_file = f.name
|
||||
|
||||
with draccus.config_type("json"):
|
||||
return draccus.parse(cls, config_file, args=cli_args)
|
||||
|
||||
|
||||
@@ -37,14 +37,6 @@ from .dataset_tools import (
|
||||
from .factory import make_dataset, resolve_delta_timestamps
|
||||
from .image_writer import safe_stop_image_writer
|
||||
from .io_utils import load_episodes, write_stats
|
||||
from .language import (
|
||||
EVENT_ONLY_STYLES,
|
||||
LANGUAGE_EVENTS,
|
||||
LANGUAGE_PERSISTENT,
|
||||
PERSISTENT_STYLES,
|
||||
STYLE_REGISTRY,
|
||||
column_for_style,
|
||||
)
|
||||
from .lerobot_dataset import LeRobotDataset
|
||||
from .multi_dataset import MultiLeRobotDataset
|
||||
from .pipeline_features import aggregate_pipeline_dataset_features, create_initial_features
|
||||
@@ -61,15 +53,10 @@ __all__ = [
|
||||
"CODEBASE_VERSION",
|
||||
"DEFAULT_EPISODES_PATH",
|
||||
"DEFAULT_QUANTILES",
|
||||
"EVENT_ONLY_STYLES",
|
||||
"EpisodeAwareSampler",
|
||||
"LANGUAGE_EVENTS",
|
||||
"LANGUAGE_PERSISTENT",
|
||||
"LeRobotDataset",
|
||||
"LeRobotDatasetMetadata",
|
||||
"MultiLeRobotDataset",
|
||||
"PERSISTENT_STYLES",
|
||||
"STYLE_REGISTRY",
|
||||
"StreamingLeRobotDataset",
|
||||
"VideoEncodingManager",
|
||||
"add_features",
|
||||
@@ -79,7 +66,6 @@ __all__ = [
|
||||
"convert_image_to_video_dataset",
|
||||
"create_initial_features",
|
||||
"create_lerobot_dataset_card",
|
||||
"column_for_style",
|
||||
"delete_episodes",
|
||||
"get_feature_stats",
|
||||
"load_episodes",
|
||||
|
||||
@@ -97,8 +97,8 @@ def update_data_df(df, src_meta, dst_meta):
|
||||
pd.DataFrame: Updated DataFrame with adjusted indices.
|
||||
"""
|
||||
|
||||
df["episode_index"] = df["episode_index"] + dst_meta.info.total_episodes
|
||||
df["index"] = df["index"] + dst_meta.info.total_frames
|
||||
df["episode_index"] = df["episode_index"] + dst_meta.info["total_episodes"]
|
||||
df["index"] = df["index"] + dst_meta.info["total_frames"]
|
||||
|
||||
src_task_names = src_meta.tasks.index.take(df["task_index"].to_numpy())
|
||||
df["task_index"] = dst_meta.tasks.loc[src_task_names, "task_index"].to_numpy()
|
||||
@@ -225,9 +225,9 @@ def update_meta_data(
|
||||
# Clean up temporary columns
|
||||
df = df.drop(columns=["_orig_chunk", "_orig_file"])
|
||||
|
||||
df["dataset_from_index"] = df["dataset_from_index"] + dst_meta.info.total_frames
|
||||
df["dataset_to_index"] = df["dataset_to_index"] + dst_meta.info.total_frames
|
||||
df["episode_index"] = df["episode_index"] + dst_meta.info.total_episodes
|
||||
df["dataset_from_index"] = df["dataset_from_index"] + dst_meta.info["total_frames"]
|
||||
df["dataset_to_index"] = df["dataset_to_index"] + dst_meta.info["total_frames"]
|
||||
df["episode_index"] = df["episode_index"] + dst_meta.info["total_episodes"]
|
||||
|
||||
return df
|
||||
|
||||
@@ -237,8 +237,8 @@ def aggregate_datasets(
|
||||
aggr_repo_id: str,
|
||||
roots: list[Path] | None = None,
|
||||
aggr_root: Path | None = None,
|
||||
data_files_size_in_mb: int | None = None,
|
||||
video_files_size_in_mb: int | None = None,
|
||||
data_files_size_in_mb: float | None = None,
|
||||
video_files_size_in_mb: float | None = None,
|
||||
chunk_size: int | None = None,
|
||||
):
|
||||
"""Aggregates multiple LeRobot datasets into a single unified dataset.
|
||||
@@ -313,8 +313,8 @@ def aggregate_datasets(
|
||||
# to avoid interference between different source datasets
|
||||
data_idx.pop("src_to_dst", None)
|
||||
|
||||
dst_meta.info.total_episodes += src_meta.total_episodes
|
||||
dst_meta.info.total_frames += src_meta.total_frames
|
||||
dst_meta.info["total_episodes"] += src_meta.total_episodes
|
||||
dst_meta.info["total_frames"] += src_meta.total_frames
|
||||
|
||||
finalize_aggregation(dst_meta, all_metadata)
|
||||
logging.info("Aggregation complete.")
|
||||
@@ -640,10 +640,14 @@ def finalize_aggregation(aggr_meta, all_metadata):
|
||||
write_tasks(aggr_meta.tasks, aggr_meta.root)
|
||||
|
||||
logging.info("write info")
|
||||
aggr_meta.info.total_tasks = len(aggr_meta.tasks)
|
||||
aggr_meta.info.total_episodes = sum(m.total_episodes for m in all_metadata)
|
||||
aggr_meta.info.total_frames = sum(m.total_frames for m in all_metadata)
|
||||
aggr_meta.info.splits = {"train": f"0:{sum(m.total_episodes for m in all_metadata)}"}
|
||||
aggr_meta.info.update(
|
||||
{
|
||||
"total_tasks": len(aggr_meta.tasks),
|
||||
"total_episodes": sum(m.total_episodes for m in all_metadata),
|
||||
"total_frames": sum(m.total_frames for m in all_metadata),
|
||||
"splits": {"train": f"0:{sum(m.total_episodes for m in all_metadata)}"},
|
||||
}
|
||||
)
|
||||
write_info(aggr_meta.info, aggr_meta.root)
|
||||
|
||||
logging.info("write stats")
|
||||
|
||||
@@ -512,7 +512,7 @@ def compute_episode_stats(
|
||||
|
||||
ep_stats = {}
|
||||
for key, data in episode_data.items():
|
||||
if features[key]["dtype"] in {"string", "language"}:
|
||||
if features[key]["dtype"] == "string":
|
||||
continue
|
||||
|
||||
if features[key]["dtype"] in ["image", "video"]:
|
||||
|
||||
@@ -34,13 +34,16 @@ from .io_utils import (
|
||||
load_episodes,
|
||||
load_info,
|
||||
load_stats,
|
||||
load_subtasks,
|
||||
load_tasks,
|
||||
write_info,
|
||||
write_json,
|
||||
write_stats,
|
||||
write_tasks,
|
||||
)
|
||||
from .utils import (
|
||||
DEFAULT_EPISODES_PATH,
|
||||
INFO_PATH,
|
||||
check_version_compatibility,
|
||||
get_safe_version,
|
||||
has_legacy_hub_download_metadata,
|
||||
@@ -174,6 +177,7 @@ class LeRobotDatasetMetadata:
|
||||
self.info = load_info(self.root)
|
||||
check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
|
||||
self.tasks = load_tasks(self.root)
|
||||
self.subtasks = load_subtasks(self.root)
|
||||
self.episodes = load_episodes(self.root)
|
||||
self.stats = load_stats(self.root)
|
||||
|
||||
@@ -224,7 +228,7 @@ class LeRobotDatasetMetadata:
|
||||
@property
|
||||
def _version(self) -> packaging.version.Version:
|
||||
"""Codebase version used to create this dataset."""
|
||||
return packaging.version.parse(self.info.codebase_version)
|
||||
return packaging.version.parse(self.info["codebase_version"])
|
||||
|
||||
def get_data_file_path(self, ep_index: int) -> Path:
|
||||
"""Return the relative parquet file path for the given episode index.
|
||||
@@ -279,27 +283,27 @@ class LeRobotDatasetMetadata:
|
||||
@property
|
||||
def data_path(self) -> str:
|
||||
"""Formattable string for the parquet files."""
|
||||
return self.info.data_path
|
||||
return self.info["data_path"]
|
||||
|
||||
@property
|
||||
def video_path(self) -> str | None:
|
||||
"""Formattable string for the video files."""
|
||||
return self.info.video_path
|
||||
return self.info["video_path"]
|
||||
|
||||
@property
|
||||
def robot_type(self) -> str | None:
|
||||
"""Robot type used in recording this dataset."""
|
||||
return self.info.robot_type
|
||||
return self.info["robot_type"]
|
||||
|
||||
@property
|
||||
def fps(self) -> int:
|
||||
"""Frames per second used during data collection."""
|
||||
return self.info.fps
|
||||
return self.info["fps"]
|
||||
|
||||
@property
|
||||
def features(self) -> dict[str, dict]:
|
||||
"""All features contained in the dataset."""
|
||||
return self.info.features
|
||||
return self.info["features"]
|
||||
|
||||
@property
|
||||
def image_keys(self) -> list[str]:
|
||||
@@ -316,39 +320,6 @@ class LeRobotDatasetMetadata:
|
||||
"""Keys to access visual modalities (regardless of their storage method)."""
|
||||
return [key for key, ft in self.features.items() if ft["dtype"] in ["video", "image"]]
|
||||
|
||||
@property
|
||||
def has_language_columns(self) -> bool:
|
||||
"""Return ``True`` if the dataset declares any language column.
|
||||
|
||||
Used to gate language-aware code paths (collate, render step) so
|
||||
unannotated datasets keep PyTorch's default collate behavior.
|
||||
"""
|
||||
from .language import LANGUAGE_COLUMNS # noqa: PLC0415 (avoid circular import)
|
||||
|
||||
return any(col in self.features for col in LANGUAGE_COLUMNS)
|
||||
|
||||
@property
|
||||
def tools(self) -> list[dict]:
|
||||
"""OpenAI-style tool schemas declared by this dataset.
|
||||
|
||||
Read from ``meta/info.json["tools"]``. Returns a copy, so callers
|
||||
can mutate the result safely. Falls back to
|
||||
:data:`lerobot.datasets.language.DEFAULT_TOOLS` (the canonical
|
||||
``say`` schema) when the dataset doesn't declare any — that way
|
||||
unannotated datasets and chat-template consumers
|
||||
(``apply_chat_template(messages, tools=meta.tools)``) keep
|
||||
working out of the box.
|
||||
|
||||
Implementations live under :mod:`lerobot.tools` (one file per
|
||||
tool); see ``docs/source/tools.mdx`` for the authoring guide.
|
||||
"""
|
||||
from .language import DEFAULT_TOOLS # noqa: PLC0415 (avoid circular import)
|
||||
|
||||
declared = self.info.tools
|
||||
if declared:
|
||||
return [dict(t) for t in declared]
|
||||
return [dict(t) for t in DEFAULT_TOOLS]
|
||||
|
||||
@property
|
||||
def names(self) -> dict[str, list | dict]:
|
||||
"""Names of the various dimensions of vector modalities."""
|
||||
@@ -362,32 +333,32 @@ class LeRobotDatasetMetadata:
|
||||
@property
|
||||
def total_episodes(self) -> int:
|
||||
"""Total number of episodes available."""
|
||||
return self.info.total_episodes
|
||||
return self.info["total_episodes"]
|
||||
|
||||
@property
|
||||
def total_frames(self) -> int:
|
||||
"""Total number of frames saved in this dataset."""
|
||||
return self.info.total_frames
|
||||
return self.info["total_frames"]
|
||||
|
||||
@property
|
||||
def total_tasks(self) -> int:
|
||||
"""Total number of different tasks performed in this dataset."""
|
||||
return self.info.total_tasks
|
||||
return self.info["total_tasks"]
|
||||
|
||||
@property
|
||||
def chunks_size(self) -> int:
|
||||
"""Max number of files per chunk."""
|
||||
return self.info.chunks_size
|
||||
return self.info["chunks_size"]
|
||||
|
||||
@property
|
||||
def data_files_size_in_mb(self) -> int:
|
||||
"""Max size of data file in mega bytes."""
|
||||
return self.info.data_files_size_in_mb
|
||||
return self.info["data_files_size_in_mb"]
|
||||
|
||||
@property
|
||||
def video_files_size_in_mb(self) -> int:
|
||||
"""Max size of video file in mega bytes."""
|
||||
return self.info.video_files_size_in_mb
|
||||
return self.info["video_files_size_in_mb"]
|
||||
|
||||
def get_task_index(self, task: str) -> int | None:
|
||||
"""
|
||||
@@ -531,10 +502,10 @@ class LeRobotDatasetMetadata:
|
||||
self._save_episode_metadata(episode_dict)
|
||||
|
||||
# Update info
|
||||
self.info.total_episodes += 1
|
||||
self.info.total_frames += episode_length
|
||||
self.info.total_tasks = len(self.tasks)
|
||||
self.info.splits = {"train": f"0:{self.info.total_episodes}"}
|
||||
self.info["total_episodes"] += 1
|
||||
self.info["total_frames"] += episode_length
|
||||
self.info["total_tasks"] = len(self.tasks)
|
||||
self.info["splits"] = {"train": f"0:{self.info['total_episodes']}"}
|
||||
|
||||
write_info(self.info, self.root)
|
||||
|
||||
@@ -553,7 +524,7 @@ class LeRobotDatasetMetadata:
|
||||
for key in video_keys:
|
||||
if not self.features[key].get("info", None):
|
||||
video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0)
|
||||
self.info.features[key]["info"] = get_video_info(video_path)
|
||||
self.info["features"][key]["info"] = get_video_info(video_path)
|
||||
|
||||
def update_chunk_settings(
|
||||
self,
|
||||
@@ -575,17 +546,17 @@ class LeRobotDatasetMetadata:
|
||||
if chunks_size is not None:
|
||||
if chunks_size <= 0:
|
||||
raise ValueError(f"chunks_size must be positive, got {chunks_size}")
|
||||
self.info.chunks_size = chunks_size
|
||||
self.info["chunks_size"] = chunks_size
|
||||
|
||||
if data_files_size_in_mb is not None:
|
||||
if data_files_size_in_mb <= 0:
|
||||
raise ValueError(f"data_files_size_in_mb must be positive, got {data_files_size_in_mb}")
|
||||
self.info.data_files_size_in_mb = data_files_size_in_mb
|
||||
self.info["data_files_size_in_mb"] = data_files_size_in_mb
|
||||
|
||||
if video_files_size_in_mb is not None:
|
||||
if video_files_size_in_mb <= 0:
|
||||
raise ValueError(f"video_files_size_in_mb must be positive, got {video_files_size_in_mb}")
|
||||
self.info.video_files_size_in_mb = video_files_size_in_mb
|
||||
self.info["video_files_size_in_mb"] = video_files_size_in_mb
|
||||
|
||||
# Update the info file on disk
|
||||
write_info(self.info, self.root)
|
||||
@@ -664,6 +635,7 @@ class LeRobotDatasetMetadata:
|
||||
_validate_feature_names(features)
|
||||
|
||||
obj.tasks = None
|
||||
obj.subtasks = None
|
||||
obj.episodes = None
|
||||
obj.stats = None
|
||||
obj.info = create_empty_dataset_info(
|
||||
@@ -681,7 +653,7 @@ class LeRobotDatasetMetadata:
|
||||
f"Features contain video keys {obj.video_keys}, but 'use_videos' is set to False. "
|
||||
"Either remove video features from the features dict, or set 'use_videos=True'."
|
||||
)
|
||||
write_info(obj.info, obj.root)
|
||||
write_json(obj.info, obj.root / INFO_PATH)
|
||||
obj.revision = None
|
||||
obj._pq_writer = None
|
||||
obj.latest_episode = None
|
||||
|
||||
@@ -295,4 +295,9 @@ class DatasetReader:
|
||||
task_idx = item["task_index"].item()
|
||||
item["task"] = self._meta.tasks.iloc[task_idx].name
|
||||
|
||||
# add subtask information if available
|
||||
if "subtask_index" in self._meta.features and self._meta.subtasks is not None:
|
||||
subtask_idx = item["subtask_index"].item()
|
||||
item["subtask"] = self._meta.subtasks.iloc[subtask_idx].name
|
||||
|
||||
return item
|
||||
|
||||
@@ -897,10 +897,14 @@ def _copy_and_reindex_episodes_metadata(
|
||||
|
||||
dst_meta.finalize()
|
||||
|
||||
dst_meta.info.total_episodes = len(episode_mapping)
|
||||
dst_meta.info.total_frames = total_frames
|
||||
dst_meta.info.total_tasks = len(dst_meta.tasks) if dst_meta.tasks is not None else 0
|
||||
dst_meta.info.splits = {"train": f"0:{len(episode_mapping)}"}
|
||||
dst_meta.info.update(
|
||||
{
|
||||
"total_episodes": len(episode_mapping),
|
||||
"total_frames": total_frames,
|
||||
"total_tasks": len(dst_meta.tasks) if dst_meta.tasks is not None else 0,
|
||||
"splits": {"train": f"0:{len(episode_mapping)}"},
|
||||
}
|
||||
)
|
||||
write_info(dst_meta.info, dst_meta.root)
|
||||
|
||||
if not all_stats:
|
||||
@@ -1065,20 +1069,21 @@ def _copy_episodes_metadata_and_stats(
|
||||
if episodes_dir.exists():
|
||||
shutil.copytree(episodes_dir, dst_episodes_dir, dirs_exist_ok=True)
|
||||
|
||||
dst_meta.info.total_episodes = src_dataset.meta.total_episodes
|
||||
dst_meta.info.total_frames = src_dataset.meta.total_frames
|
||||
dst_meta.info.total_tasks = src_dataset.meta.total_tasks
|
||||
# Preserve original splits if available, otherwise create default
|
||||
dst_meta.info.splits = (
|
||||
src_dataset.meta.info.splits
|
||||
if src_dataset.meta.info.splits
|
||||
else {"train": f"0:{src_dataset.meta.total_episodes}"}
|
||||
dst_meta.info.update(
|
||||
{
|
||||
"total_episodes": src_dataset.meta.total_episodes,
|
||||
"total_frames": src_dataset.meta.total_frames,
|
||||
"total_tasks": src_dataset.meta.total_tasks,
|
||||
"splits": src_dataset.meta.info.get("splits", {"train": f"0:{src_dataset.meta.total_episodes}"}),
|
||||
}
|
||||
)
|
||||
|
||||
if dst_meta.video_keys and src_dataset.meta.video_keys:
|
||||
for key in dst_meta.video_keys:
|
||||
if key in src_dataset.meta.features:
|
||||
dst_meta.info.features[key]["info"] = src_dataset.meta.info.features[key].get("info", {})
|
||||
dst_meta.info["features"][key]["info"] = src_dataset.meta.info["features"][key].get(
|
||||
"info", {}
|
||||
)
|
||||
|
||||
write_info(dst_meta.info, dst_meta.root)
|
||||
|
||||
@@ -1520,7 +1525,7 @@ def modify_tasks(
|
||||
write_tasks(new_task_df, root)
|
||||
|
||||
# Update info.json
|
||||
dataset.meta.info.total_tasks = len(unique_tasks)
|
||||
dataset.meta.info["total_tasks"] = len(unique_tasks)
|
||||
write_info(dataset.meta.info, root)
|
||||
|
||||
# Reload metadata to reflect changes
|
||||
@@ -1853,10 +1858,10 @@ def convert_image_to_video_dataset(
|
||||
episodes_df.to_parquet(episodes_path, index=False)
|
||||
|
||||
# Update metadata info
|
||||
new_meta.info.total_episodes = len(episode_indices)
|
||||
new_meta.info.total_frames = sum(ep["length"] for ep in all_episode_metadata.values())
|
||||
new_meta.info.total_tasks = dataset.meta.total_tasks
|
||||
new_meta.info.splits = {"train": f"0:{len(episode_indices)}"}
|
||||
new_meta.info["total_episodes"] = len(episode_indices)
|
||||
new_meta.info["total_frames"] = sum(ep["length"] for ep in all_episode_metadata.values())
|
||||
new_meta.info["total_tasks"] = dataset.meta.total_tasks
|
||||
new_meta.info["splits"] = {"train": f"0:{len(episode_indices)}"}
|
||||
|
||||
# Update video info for all image keys (now videos)
|
||||
# We need to manually set video info since update_video_info() checks video_keys first
|
||||
@@ -1865,7 +1870,7 @@ def convert_image_to_video_dataset(
|
||||
video_path = new_meta.root / new_meta.video_path.format(
|
||||
video_key=img_key, chunk_index=0, file_index=0
|
||||
)
|
||||
new_meta.info.features[img_key]["info"] = get_video_info(video_path)
|
||||
new_meta.info["features"][img_key]["info"] = get_video_info(video_path)
|
||||
|
||||
write_info(new_meta.info, new_meta.root)
|
||||
|
||||
|
||||
@@ -19,7 +19,6 @@ from pprint import pformat
|
||||
import torch
|
||||
|
||||
from lerobot.configs import PreTrainedConfig
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.configs.train import TrainPipelineConfig
|
||||
from lerobot.transforms import ImageTransforms
|
||||
from lerobot.utils.constants import ACTION, IMAGENET_STATS, OBS_PREFIX, REWARD
|
||||
@@ -31,14 +30,12 @@ from .streaming_dataset import StreamingLeRobotDataset
|
||||
|
||||
|
||||
def resolve_delta_timestamps(
|
||||
cfg: PreTrainedConfig | RewardModelConfig, ds_meta: LeRobotDatasetMetadata
|
||||
cfg: PreTrainedConfig, ds_meta: LeRobotDatasetMetadata
|
||||
) -> dict[str, list] | None:
|
||||
"""Resolves delta_timestamps by reading from the 'delta_indices' properties of the config.
|
||||
"""Resolves delta_timestamps by reading from the 'delta_indices' properties of the PreTrainedConfig.
|
||||
|
||||
Args:
|
||||
cfg (PreTrainedConfig | RewardModelConfig): The config to read delta_indices from. Both
|
||||
``PreTrainedConfig`` and concrete ``RewardModelConfig`` subclasses expose the
|
||||
``{observation,action,reward}_delta_indices`` properties used below.
|
||||
cfg (PreTrainedConfig): The PreTrainedConfig to read delta_indices from.
|
||||
ds_meta (LeRobotDatasetMetadata): The dataset from which features and fps are used to build
|
||||
delta_timestamps against.
|
||||
|
||||
@@ -85,7 +82,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
|
||||
ds_meta = LeRobotDatasetMetadata(
|
||||
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
|
||||
)
|
||||
delta_timestamps = resolve_delta_timestamps(cfg.trainable_config, ds_meta)
|
||||
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
|
||||
if not cfg.dataset.streaming:
|
||||
dataset = LeRobotDataset(
|
||||
cfg.dataset.repo_id,
|
||||
|
||||
@@ -22,19 +22,12 @@ from PIL import Image as PILImage
|
||||
from lerobot.utils.constants import DEFAULT_FEATURES
|
||||
from lerobot.utils.utils import is_valid_numpy_dtype_string
|
||||
|
||||
from .language import (
|
||||
LANGUAGE_PERSISTENT,
|
||||
is_language_column,
|
||||
language_events_column_feature,
|
||||
language_persistent_column_feature,
|
||||
)
|
||||
from .utils import (
|
||||
DEFAULT_CHUNK_SIZE,
|
||||
DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||
DEFAULT_DATA_PATH,
|
||||
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||
DEFAULT_VIDEO_PATH,
|
||||
DatasetInfo,
|
||||
)
|
||||
|
||||
|
||||
@@ -52,13 +45,7 @@ def get_hf_features_from_features(features: dict) -> datasets.Features:
|
||||
"""
|
||||
hf_features = {}
|
||||
for key, ft in features.items():
|
||||
if is_language_column(key):
|
||||
hf_features[key] = (
|
||||
language_persistent_column_feature()
|
||||
if key == LANGUAGE_PERSISTENT
|
||||
else language_events_column_feature()
|
||||
)
|
||||
elif ft["dtype"] == "video":
|
||||
if ft["dtype"] == "video":
|
||||
continue
|
||||
elif ft["dtype"] == "image":
|
||||
hf_features[key] = datasets.Image()
|
||||
@@ -91,8 +78,8 @@ def create_empty_dataset_info(
|
||||
chunks_size: int | None = None,
|
||||
data_files_size_in_mb: int | None = None,
|
||||
video_files_size_in_mb: int | None = None,
|
||||
) -> DatasetInfo:
|
||||
"""Create a template ``DatasetInfo`` object for a new dataset's ``meta/info.json``.
|
||||
) -> dict:
|
||||
"""Create a template dictionary for a new dataset's `info.json`.
|
||||
|
||||
Args:
|
||||
codebase_version (str): The version of the LeRobot codebase.
|
||||
@@ -100,24 +87,25 @@ def create_empty_dataset_info(
|
||||
features (dict): The LeRobot features dictionary for the dataset.
|
||||
use_videos (bool): Whether the dataset will store videos.
|
||||
robot_type (str | None): The type of robot used, if any.
|
||||
chunks_size (int | None): Max files per chunk directory. Defaults to ``DEFAULT_CHUNK_SIZE``.
|
||||
data_files_size_in_mb (int | None): Max parquet file size in MB. Defaults to ``DEFAULT_DATA_FILE_SIZE_IN_MB``.
|
||||
video_files_size_in_mb (int | None): Max video file size in MB. Defaults to ``DEFAULT_VIDEO_FILE_SIZE_IN_MB``.
|
||||
|
||||
Returns:
|
||||
DatasetInfo: A typed dataset information object with initial metadata.
|
||||
dict: A dictionary with the initial dataset metadata.
|
||||
"""
|
||||
return DatasetInfo(
|
||||
codebase_version=codebase_version,
|
||||
fps=fps,
|
||||
features=features,
|
||||
robot_type=robot_type,
|
||||
chunks_size=chunks_size or DEFAULT_CHUNK_SIZE,
|
||||
data_files_size_in_mb=data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||
video_files_size_in_mb=video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||
data_path=DEFAULT_DATA_PATH,
|
||||
video_path=DEFAULT_VIDEO_PATH if use_videos else None,
|
||||
)
|
||||
return {
|
||||
"codebase_version": codebase_version,
|
||||
"robot_type": robot_type,
|
||||
"total_episodes": 0,
|
||||
"total_frames": 0,
|
||||
"total_tasks": 0,
|
||||
"chunks_size": chunks_size or DEFAULT_CHUNK_SIZE,
|
||||
"data_files_size_in_mb": data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||
"video_files_size_in_mb": video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||
"fps": fps,
|
||||
"splits": {},
|
||||
"data_path": DEFAULT_DATA_PATH,
|
||||
"video_path": DEFAULT_VIDEO_PATH if use_videos else None,
|
||||
"features": features,
|
||||
}
|
||||
|
||||
|
||||
def check_delta_timestamps(
|
||||
@@ -254,8 +242,6 @@ def validate_feature_dtype_and_shape(
|
||||
return validate_feature_image_or_video(name, expected_shape, value)
|
||||
elif expected_dtype == "string":
|
||||
return validate_feature_string(name, value)
|
||||
elif expected_dtype == "language":
|
||||
return ""
|
||||
else:
|
||||
raise NotImplementedError(f"The feature dtype '{expected_dtype}' is not implemented yet.")
|
||||
|
||||
|
||||
@@ -31,15 +31,14 @@ from torchvision import transforms
|
||||
from lerobot.utils.io_utils import load_json, write_json
|
||||
from lerobot.utils.utils import SuppressProgressBars, flatten_dict, unflatten_dict
|
||||
|
||||
from .language import LANGUAGE_COLUMNS
|
||||
from .utils import (
|
||||
DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||
DEFAULT_EPISODES_PATH,
|
||||
DEFAULT_SUBTASKS_PATH,
|
||||
DEFAULT_TASKS_PATH,
|
||||
EPISODES_DIR,
|
||||
INFO_PATH,
|
||||
STATS_PATH,
|
||||
DatasetInfo,
|
||||
serialize_dict,
|
||||
)
|
||||
|
||||
@@ -116,21 +115,25 @@ def embed_images(dataset: datasets.Dataset) -> datasets.Dataset:
|
||||
return dataset
|
||||
|
||||
|
||||
def write_info(info: DatasetInfo, local_dir: Path) -> None:
|
||||
write_json(info.to_dict(), local_dir / INFO_PATH)
|
||||
def write_info(info: dict, local_dir: Path) -> None:
|
||||
write_json(info, local_dir / INFO_PATH)
|
||||
|
||||
|
||||
def load_info(local_dir: Path) -> DatasetInfo:
|
||||
def load_info(local_dir: Path) -> dict:
|
||||
"""Load dataset info metadata from its standard file path.
|
||||
|
||||
Also converts shape lists to tuples for consistency.
|
||||
|
||||
Args:
|
||||
local_dir (Path): The root directory of the dataset.
|
||||
|
||||
Returns:
|
||||
DatasetInfo: The typed dataset information object.
|
||||
dict: The dataset information dictionary.
|
||||
"""
|
||||
raw = load_json(local_dir / INFO_PATH)
|
||||
return DatasetInfo.from_dict(raw)
|
||||
info = load_json(local_dir / INFO_PATH)
|
||||
for ft in info["features"].values():
|
||||
ft["shape"] = tuple(ft["shape"])
|
||||
return info
|
||||
|
||||
|
||||
def write_stats(stats: dict, local_dir: Path) -> None:
|
||||
@@ -186,6 +189,14 @@ def load_tasks(local_dir: Path) -> pandas.DataFrame:
|
||||
return tasks
|
||||
|
||||
|
||||
def load_subtasks(local_dir: Path) -> pandas.DataFrame | None:
|
||||
"""Load subtasks from subtasks.parquet if it exists."""
|
||||
subtasks_path = local_dir / DEFAULT_SUBTASKS_PATH
|
||||
if subtasks_path.exists():
|
||||
return pd.read_parquet(subtasks_path)
|
||||
return None
|
||||
|
||||
|
||||
def write_episodes(episodes: Dataset, local_dir: Path) -> None:
|
||||
"""Write episode metadata to a parquet file in the LeRobot v3.0 format.
|
||||
This function writes episode-level metadata to a single parquet file.
|
||||
@@ -257,13 +268,11 @@ def hf_transform_to_torch(items_dict: dict[str, list[Any]]) -> dict[str, list[to
|
||||
dict: The batch with items converted to torch tensors.
|
||||
"""
|
||||
for key in items_dict:
|
||||
if key in LANGUAGE_COLUMNS:
|
||||
continue
|
||||
first_item = items_dict[key][0]
|
||||
if isinstance(first_item, PILImage.Image):
|
||||
to_tensor = transforms.ToTensor()
|
||||
items_dict[key] = [to_tensor(img) for img in items_dict[key]]
|
||||
elif first_item is None or isinstance(first_item, dict):
|
||||
elif first_item is None:
|
||||
pass
|
||||
else:
|
||||
items_dict[key] = [x if isinstance(x, str) else torch.tensor(x) for x in items_dict[key]]
|
||||
@@ -298,9 +307,8 @@ def item_to_torch(item: dict) -> dict:
|
||||
Returns:
|
||||
dict: Dictionary with all tensor-like items converted to torch.Tensor.
|
||||
"""
|
||||
skip_keys = {"task", *LANGUAGE_COLUMNS}
|
||||
for key, val in item.items():
|
||||
if isinstance(val, (np.ndarray | list)) and key not in skip_keys:
|
||||
if isinstance(val, (np.ndarray | list)) and key not in ["task"]:
|
||||
# Convert numpy arrays and lists to torch tensors
|
||||
item[key] = torch.tensor(val)
|
||||
return item
|
||||
|
||||
@@ -1,240 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Literal
|
||||
|
||||
import datasets
|
||||
import pyarrow as pa
|
||||
|
||||
LANGUAGE_PERSISTENT = "language_persistent"
|
||||
LANGUAGE_EVENTS = "language_events"
|
||||
LANGUAGE_COLUMNS = (LANGUAGE_PERSISTENT, LANGUAGE_EVENTS)
|
||||
PERSISTENT_ROW_FIELDS = ("role", "content", "style", "timestamp", "camera", "tool_calls")
|
||||
EVENT_ROW_FIELDS = ("role", "content", "style", "camera", "tool_calls")
|
||||
|
||||
CORE_STYLES = {
|
||||
"subtask",
|
||||
"plan",
|
||||
"memory",
|
||||
"motion",
|
||||
"interjection",
|
||||
"vqa",
|
||||
"trace",
|
||||
"task_aug",
|
||||
}
|
||||
# Project-local styles can be registered at import time by appending to
|
||||
# ``EXTENDED_STYLES`` before ``column_for_style`` is called. Anything added
|
||||
# here is treated as a known style alongside ``CORE_STYLES`` for resolver
|
||||
# validation. Empty by default — populate from a downstream module that
|
||||
# also extends ``PERSISTENT_STYLES`` or ``EVENT_ONLY_STYLES`` to declare
|
||||
# the new style's column.
|
||||
EXTENDED_STYLES: set[str] = set()
|
||||
STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES
|
||||
|
||||
PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug"}
|
||||
EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"}
|
||||
|
||||
# Styles whose ``content`` is grounded in a specific camera view. Rows of these
|
||||
# styles MUST carry a non-null ``camera`` referencing an ``observation.images.*``
|
||||
# feature key. Rows of every other style MUST have ``camera=None``. ``motion``
|
||||
# is intentionally NOT in this set: motion primitives are described in
|
||||
# robot-frame (joint / Cartesian) terms, not pixel space, so they are
|
||||
# camera-agnostic. ``trace`` is the pixel-trajectory event style and IS
|
||||
# view-dependent. The ``camera`` field nevertheless lives on
|
||||
# ``PERSISTENT_ROW_FIELDS`` too so the schema, validator, and resolver
|
||||
# behave symmetrically across the two columns; persistent rows simply
|
||||
# always have ``camera=None`` in practice today.
|
||||
VIEW_DEPENDENT_STYLES = {"vqa", "trace"}
|
||||
|
||||
LanguageColumn = Literal["language_persistent", "language_events"]
|
||||
|
||||
|
||||
def _json_arrow_type() -> pa.DataType:
|
||||
"""Return the Arrow JSON type, falling back to ``string`` on older pyarrow."""
|
||||
return pa.json_() if hasattr(pa, "json_") else pa.string()
|
||||
|
||||
|
||||
def _json_feature() -> object:
|
||||
"""Return the HF ``datasets`` JSON feature, falling back to a string value."""
|
||||
return datasets.Json() if hasattr(datasets, "Json") else datasets.Value("string")
|
||||
|
||||
|
||||
def language_persistent_row_arrow_type() -> pa.StructType:
|
||||
"""Return the Arrow struct type for a single persistent language row.
|
||||
|
||||
Persistent rows carry their own ``timestamp`` because they represent a state
|
||||
that became active at a specific moment and remains active until superseded.
|
||||
"""
|
||||
return pa.struct(
|
||||
[
|
||||
pa.field("role", pa.string(), nullable=False),
|
||||
pa.field("content", pa.string(), nullable=True),
|
||||
pa.field("style", pa.string(), nullable=True),
|
||||
pa.field("timestamp", pa.float64(), nullable=False),
|
||||
pa.field("camera", pa.string(), nullable=True),
|
||||
pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def language_event_row_arrow_type() -> pa.StructType:
|
||||
"""Return the Arrow struct type for a single event language row.
|
||||
|
||||
Event rows have no ``timestamp`` field: each event is stored on the dataset
|
||||
row whose frame timestamp is the event's firing time.
|
||||
"""
|
||||
return pa.struct(
|
||||
[
|
||||
pa.field("role", pa.string(), nullable=False),
|
||||
pa.field("content", pa.string(), nullable=True),
|
||||
pa.field("style", pa.string(), nullable=True),
|
||||
pa.field("camera", pa.string(), nullable=True),
|
||||
pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def language_persistent_arrow_type() -> pa.ListType:
|
||||
"""Return the Arrow list type for the ``language_persistent`` column."""
|
||||
return pa.list_(language_persistent_row_arrow_type())
|
||||
|
||||
|
||||
def language_events_arrow_type() -> pa.ListType:
|
||||
"""Return the Arrow list type for the ``language_events`` column."""
|
||||
return pa.list_(language_event_row_arrow_type())
|
||||
|
||||
|
||||
def language_persistent_row_feature() -> dict[str, object]:
|
||||
"""Return the HF ``datasets`` feature mapping for a persistent language row."""
|
||||
return {
|
||||
"role": datasets.Value("string"),
|
||||
"content": datasets.Value("string"),
|
||||
"style": datasets.Value("string"),
|
||||
"timestamp": datasets.Value("float64"),
|
||||
"camera": datasets.Value("string"),
|
||||
"tool_calls": datasets.List(_json_feature()),
|
||||
}
|
||||
|
||||
|
||||
def language_event_row_feature() -> dict[str, object]:
|
||||
"""Return the HF ``datasets`` feature mapping for an event language row."""
|
||||
return {
|
||||
"role": datasets.Value("string"),
|
||||
"content": datasets.Value("string"),
|
||||
"style": datasets.Value("string"),
|
||||
"camera": datasets.Value("string"),
|
||||
"tool_calls": datasets.List(_json_feature()),
|
||||
}
|
||||
|
||||
|
||||
def language_persistent_column_feature() -> datasets.List:
|
||||
"""Return the HF ``datasets`` feature for the ``language_persistent`` column."""
|
||||
return datasets.List(language_persistent_row_feature())
|
||||
|
||||
|
||||
def language_events_column_feature() -> datasets.List:
|
||||
"""Return the HF ``datasets`` feature for the ``language_events`` column."""
|
||||
return datasets.List(language_event_row_feature())
|
||||
|
||||
|
||||
def language_feature_info() -> dict[str, dict]:
|
||||
"""Return the ``info["features"]`` entries for both language columns."""
|
||||
return {
|
||||
LANGUAGE_PERSISTENT: {"dtype": "language", "shape": (1,), "names": None},
|
||||
LANGUAGE_EVENTS: {"dtype": "language", "shape": (1,), "names": None},
|
||||
}
|
||||
|
||||
|
||||
def is_language_column(key: str) -> bool:
|
||||
"""Return ``True`` if ``key`` is one of the dataset's language column names."""
|
||||
return key in LANGUAGE_COLUMNS
|
||||
|
||||
|
||||
def is_view_dependent_style(style: str | None) -> bool:
|
||||
"""Return ``True`` if rows of ``style`` must be tagged with a ``camera`` key."""
|
||||
return style in VIEW_DEPENDENT_STYLES
|
||||
|
||||
|
||||
def validate_camera_field(style: str | None, camera: str | None) -> None:
|
||||
"""Enforce the ``camera`` invariant: required iff ``style`` is view-dependent.
|
||||
|
||||
Raises ``ValueError`` if a view-dependent style is missing ``camera`` or if
|
||||
a non-view-dependent style carries one. Pipeline writers and the validator
|
||||
should call this on every emitted row.
|
||||
"""
|
||||
if is_view_dependent_style(style):
|
||||
if not camera:
|
||||
raise ValueError(
|
||||
f"Rows of view-dependent style {style!r} require a non-empty 'camera' "
|
||||
f"field referencing an 'observation.images.*' feature key."
|
||||
)
|
||||
elif camera is not None:
|
||||
raise ValueError(f"Rows of style {style!r} must have camera=None; got camera={camera!r}.")
|
||||
|
||||
|
||||
# --- Tool registry --------------------------------------------------------
|
||||
# Tools declared on a dataset live in ``meta/info.json["tools"]`` as a list
|
||||
# of OpenAI-style function schemas. The runtime / training stack reads them
|
||||
# through :class:`LeRobotDatasetMetadata.tools` (with these constants as
|
||||
# fallback when the dataset doesn't declare any). Implementations live
|
||||
# under :mod:`lerobot.tools` (one file per tool); see
|
||||
# ``docs/source/tools.mdx`` for the authoring guide.
|
||||
|
||||
SAY_TOOL_SCHEMA: dict = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "say",
|
||||
"description": "Speak a short utterance to the user via the TTS executor.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The verbatim text to speak.",
|
||||
}
|
||||
},
|
||||
"required": ["text"],
|
||||
},
|
||||
},
|
||||
}
|
||||
"""Canonical schema for the ``say`` tool emitted by the steerable
|
||||
annotation pipeline (PR 2 Module 2). Single source of truth — PR 2's
|
||||
writer, PR 3's runtime tool registry, and the dataset visualizer all
|
||||
import this constant rather than duplicating the dict."""
|
||||
|
||||
DEFAULT_TOOLS: list[dict] = [SAY_TOOL_SCHEMA]
|
||||
"""Fallback tools list. Returned by ``LeRobotDatasetMetadata.tools``
|
||||
when ``meta/info.json["tools"]`` is unset, so unannotated datasets and
|
||||
chat-template consumers (``apply_chat_template(messages, tools=...)``)
|
||||
keep working out of the box."""
|
||||
|
||||
|
||||
def column_for_style(style: str | None) -> LanguageColumn:
|
||||
"""Map a language style to the column where rows of that style are stored.
|
||||
|
||||
Styles in :data:`PERSISTENT_STYLES` route to :data:`LANGUAGE_PERSISTENT`.
|
||||
Styles in :data:`EVENT_ONLY_STYLES` and the implicit ``None`` style route
|
||||
to :data:`LANGUAGE_EVENTS`.
|
||||
"""
|
||||
if style is None:
|
||||
return LANGUAGE_EVENTS
|
||||
if style in PERSISTENT_STYLES:
|
||||
return LANGUAGE_PERSISTENT
|
||||
if style in EVENT_ONLY_STYLES:
|
||||
return LANGUAGE_EVENTS
|
||||
raise ValueError(f"Unknown language style: {style!r}")
|
||||
@@ -1,543 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import hashlib
|
||||
import re
|
||||
from collections.abc import Sequence
|
||||
from typing import Any
|
||||
|
||||
from lerobot.configs.recipe import DEFAULT_BINDINGS, PLACEHOLDER_RE, TrainingRecipe
|
||||
|
||||
from .language import LANGUAGE_PERSISTENT, column_for_style
|
||||
|
||||
LanguageRow = dict[str, Any]
|
||||
RenderedMessages = dict[str, list[Any]]
|
||||
|
||||
_RESOLVER_RE = re.compile(r"^(?P<name>[A-Za-z_][A-Za-z0-9_]*)\((?P<args>.*)\)$")
|
||||
|
||||
|
||||
def active_at(
|
||||
t: float,
|
||||
*,
|
||||
persistent: Sequence[LanguageRow],
|
||||
style: str | None = None,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the persistent row of ``style`` that is active at time ``t``.
|
||||
|
||||
A persistent row is "active" at ``t`` when its own ``timestamp`` is the
|
||||
most recent one ``<= t`` for the given ``style``/``role``/``tool_name``/
|
||||
``camera`` selector. Only valid for persistent styles.
|
||||
"""
|
||||
_validate_persistent_resolver("active_at", style)
|
||||
matches = [
|
||||
row
|
||||
for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera)
|
||||
if _timestamp(row) <= t
|
||||
]
|
||||
if not matches:
|
||||
return None
|
||||
latest_ts = max(_timestamp(row) for row in matches)
|
||||
return _select_one(
|
||||
[row for row in matches if _timestamp(row) == latest_ts],
|
||||
style=style,
|
||||
role=role,
|
||||
tool_name=tool_name,
|
||||
camera=camera,
|
||||
)
|
||||
|
||||
|
||||
EMITTED_AT_TOLERANCE_S = 0.1
|
||||
"""Half-window for matching persistent rows to a frame timestamp in
|
||||
``emitted_at``. Persistent timestamps come from parquet (float64) and ``t``
|
||||
is also a float64 from parquet, so in the ideal hot path an exact match
|
||||
would suffice — but any caller that derives ``t`` arithmetically (e.g.
|
||||
``frame_idx / fps``) breaks bit-equality. A 0.1 s tolerance covers
|
||||
common arithmetic drift without admitting frames that are visibly far
|
||||
apart at typical control rates (30–100 Hz)."""
|
||||
|
||||
|
||||
def emitted_at(
|
||||
t: float,
|
||||
*,
|
||||
persistent: Sequence[LanguageRow],
|
||||
events: Sequence[LanguageRow],
|
||||
style: str | None = None,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the row of ``style`` emitted at exactly time ``t``.
|
||||
|
||||
For persistent styles, this matches persistent rows whose own ``timestamp``
|
||||
is within ``EMITTED_AT_TOLERANCE_S`` of ``t`` (see that constant for why
|
||||
we use a tolerance instead of bit-equality). For event styles, the
|
||||
``events`` list is assumed to come from the dataset row at frame ``t``
|
||||
(event rows carry no timestamp of their own), so all matching event rows
|
||||
are considered emitted at ``t``. ``camera`` filters by the row's
|
||||
``camera`` field — required to disambiguate when multiple view-dependent
|
||||
rows share ``(t, role)`` across cameras.
|
||||
"""
|
||||
if column_for_style(style) == LANGUAGE_PERSISTENT:
|
||||
matches = [
|
||||
row
|
||||
for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera)
|
||||
if abs(_timestamp(row) - t) <= EMITTED_AT_TOLERANCE_S
|
||||
]
|
||||
else:
|
||||
matches = _matching_rows(events, style=style, role=role, tool_name=tool_name, camera=camera)
|
||||
return _select_one(matches, style=style, role=role, tool_name=tool_name, camera=camera)
|
||||
|
||||
|
||||
def nth_prev(
|
||||
t: float,
|
||||
*,
|
||||
persistent: Sequence[LanguageRow],
|
||||
style: str | None = None,
|
||||
offset: int = 1,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the persistent row that was active ``offset`` steps before ``t``.
|
||||
|
||||
Walks back through chronologically sorted persistent rows of ``style``
|
||||
(filtered by optional ``role``/``tool_name``/``camera``) and returns the
|
||||
one ``offset`` positions before the row active at ``t``. Only valid for
|
||||
persistent styles.
|
||||
"""
|
||||
return _nth_relative("nth_prev", t, persistent, style, -offset, role, tool_name, camera)
|
||||
|
||||
|
||||
def nth_next(
|
||||
t: float,
|
||||
*,
|
||||
persistent: Sequence[LanguageRow],
|
||||
style: str | None = None,
|
||||
offset: int = 1,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the persistent row that becomes active ``offset`` steps after ``t``.
|
||||
|
||||
Walks forward through chronologically sorted persistent rows of ``style``
|
||||
(filtered by optional ``role``/``tool_name``/``camera``) and returns the
|
||||
one ``offset`` positions after the row active at ``t``. Only valid for
|
||||
persistent styles.
|
||||
"""
|
||||
return _nth_relative("nth_next", t, persistent, style, offset, role, tool_name, camera)
|
||||
|
||||
|
||||
def render_sample(
|
||||
*,
|
||||
recipe: TrainingRecipe,
|
||||
persistent: Sequence[LanguageRow] | None,
|
||||
events: Sequence[LanguageRow] | None,
|
||||
t: float,
|
||||
sample_idx: int,
|
||||
task: str | None = None,
|
||||
dataset_ctx: Any | None = None,
|
||||
) -> RenderedMessages | None:
|
||||
"""Render the chat-style messages for a single dataset sample.
|
||||
|
||||
Resolves the recipe's bindings against ``persistent`` and ``events`` rows
|
||||
at frame timestamp ``t``, then expands the recipe's message templates.
|
||||
Returns ``None`` if the resolved sample contains no target message.
|
||||
"""
|
||||
persistent_rows = _normalize_rows(persistent or [])
|
||||
event_rows = _normalize_rows(events or [])
|
||||
selected_recipe = _select_recipe(recipe, sample_idx)
|
||||
bindings = _resolve_bindings(
|
||||
selected_recipe,
|
||||
persistent=persistent_rows,
|
||||
events=event_rows,
|
||||
t=t,
|
||||
sample_idx=sample_idx,
|
||||
task=task,
|
||||
dataset_ctx=dataset_ctx,
|
||||
)
|
||||
return _render_message_recipe(selected_recipe, bindings)
|
||||
|
||||
|
||||
def _select_recipe(recipe: TrainingRecipe, sample_idx: int) -> TrainingRecipe:
|
||||
"""Pick a deterministic blend component for ``sample_idx`` (or return ``recipe``)."""
|
||||
if recipe.blend is None:
|
||||
return recipe
|
||||
|
||||
total_weight = sum(component.weight or 0.0 for component in recipe.blend.values())
|
||||
if total_weight <= 0:
|
||||
raise ValueError("Blend weights must sum to a positive value.")
|
||||
|
||||
digest = hashlib.blake2b(str(sample_idx).encode(), digest_size=8).digest()
|
||||
draw = int.from_bytes(digest, "big") / 2**64 * total_weight
|
||||
cumulative = 0.0
|
||||
last_component: TrainingRecipe | None = None
|
||||
for component in recipe.blend.values():
|
||||
last_component = component
|
||||
cumulative += component.weight or 0.0
|
||||
if draw < cumulative:
|
||||
return component
|
||||
assert last_component is not None
|
||||
return last_component
|
||||
|
||||
|
||||
def _resolve_bindings(
|
||||
recipe: TrainingRecipe,
|
||||
*,
|
||||
persistent: Sequence[LanguageRow],
|
||||
events: Sequence[LanguageRow],
|
||||
t: float,
|
||||
sample_idx: int,
|
||||
task: str | None,
|
||||
dataset_ctx: Any | None,
|
||||
) -> dict[str, LanguageRow | str | None]:
|
||||
"""Resolve every binding in ``recipe`` (plus ``task``) at time ``t``."""
|
||||
bindings: dict[str, LanguageRow | str | None] = {
|
||||
"task": _resolve_task(task, dataset_ctx, persistent=persistent, sample_idx=sample_idx),
|
||||
}
|
||||
specs = {**DEFAULT_BINDINGS, **(recipe.bindings or {})}
|
||||
for name, spec in specs.items():
|
||||
bindings[name] = _resolve_spec(spec, persistent=persistent, events=events, t=t)
|
||||
return bindings
|
||||
|
||||
|
||||
def _resolve_task(
|
||||
task: str | None,
|
||||
dataset_ctx: Any | None,
|
||||
*,
|
||||
persistent: Sequence[LanguageRow] = (),
|
||||
sample_idx: int = 0,
|
||||
) -> str | None:
|
||||
"""Return the task string for ``sample_idx``.
|
||||
|
||||
Resolution order:
|
||||
|
||||
1. Explicit ``task`` override (caller-supplied) wins.
|
||||
2. If ``persistent`` contains rows of style ``task_aug`` (role=user),
|
||||
deterministically pick one by ``sample_idx`` so each frame of an
|
||||
episode rotates through the available rephrasings across an epoch.
|
||||
This realizes Xiao 2022 / CAST-style task-prompt diversity without
|
||||
changing ``meta/tasks.parquet`` and without forcing recipes to opt
|
||||
in: ``${task}`` automatically picks a rephrasing when one exists,
|
||||
and falls back to the canonical task otherwise. Recipes that want
|
||||
the literal canonical task can override the binding.
|
||||
3. Otherwise read the canonical task from ``dataset_ctx`` (which is
|
||||
backed by ``meta/tasks.parquet``).
|
||||
"""
|
||||
if task is not None:
|
||||
return task
|
||||
|
||||
aug_rows = [r for r in persistent if r.get("style") == "task_aug" and r.get("role") == "user"]
|
||||
if aug_rows:
|
||||
# Deterministic, blake2b-based pick keyed on sample_idx so the
|
||||
# rotation is reproducible across runs (Python's built-in ``hash``
|
||||
# is process-randomized).
|
||||
digest = hashlib.blake2b(f"task_aug:{sample_idx}".encode(), digest_size=8).digest()
|
||||
idx = int.from_bytes(digest, "big") % len(aug_rows)
|
||||
chosen = aug_rows[idx].get("content")
|
||||
if chosen:
|
||||
return str(chosen)
|
||||
|
||||
if dataset_ctx is None:
|
||||
return None
|
||||
if isinstance(dataset_ctx, dict):
|
||||
return dataset_ctx.get("task")
|
||||
return getattr(dataset_ctx, "task", None)
|
||||
|
||||
|
||||
def _resolve_spec(
|
||||
spec: str,
|
||||
*,
|
||||
persistent: Sequence[LanguageRow],
|
||||
events: Sequence[LanguageRow],
|
||||
t: float,
|
||||
) -> LanguageRow | None:
|
||||
"""Parse a single binding's resolver expression and dispatch to its function."""
|
||||
match = _RESOLVER_RE.match(spec.strip())
|
||||
if match is None:
|
||||
raise ValueError(f"Invalid resolver expression: {spec!r}")
|
||||
name = match.group("name")
|
||||
kwargs = _parse_resolver_args(match.group("args"))
|
||||
kwargs.pop("t_arg", None)
|
||||
|
||||
if name == "emitted_at":
|
||||
return emitted_at(t, persistent=persistent, events=events, **kwargs)
|
||||
if name == "active_at":
|
||||
return active_at(t, persistent=persistent, **kwargs)
|
||||
if name == "nth_prev":
|
||||
return nth_prev(t, persistent=persistent, **kwargs)
|
||||
if name == "nth_next":
|
||||
return nth_next(t, persistent=persistent, **kwargs)
|
||||
raise ValueError(f"Unknown language resolver: {name!r}")
|
||||
|
||||
|
||||
def _parse_resolver_args(args: str) -> dict[str, Any]:
|
||||
"""Parse a comma-separated resolver argument list into a kwargs dict."""
|
||||
kwargs: dict[str, Any] = {}
|
||||
if not args.strip():
|
||||
return kwargs
|
||||
|
||||
parts = [part.strip() for part in args.split(",") if part.strip()]
|
||||
for part in parts:
|
||||
if part == "t":
|
||||
kwargs["t_arg"] = True
|
||||
continue
|
||||
if "=" not in part:
|
||||
raise ValueError(f"Invalid resolver argument: {part!r}")
|
||||
key, value = (item.strip() for item in part.split("=", 1))
|
||||
if key == "offset":
|
||||
kwargs[key] = int(value)
|
||||
else:
|
||||
kwargs[key] = value.strip("\"'")
|
||||
return kwargs
|
||||
|
||||
|
||||
def _render_message_recipe(
|
||||
recipe: TrainingRecipe,
|
||||
bindings: dict[str, LanguageRow | str | None],
|
||||
) -> RenderedMessages | None:
|
||||
"""Expand ``recipe.messages`` into rendered chat messages using ``bindings``."""
|
||||
assert recipe.messages is not None
|
||||
messages: list[dict[str, Any]] = []
|
||||
streams: list[str | None] = []
|
||||
target_indices: list[int] = []
|
||||
|
||||
for turn in recipe.messages:
|
||||
if turn.if_present is not None and bindings.get(turn.if_present) is None:
|
||||
continue
|
||||
|
||||
message = {"role": turn.role}
|
||||
if turn.content is not None:
|
||||
message["content"] = _render_content(turn.content, bindings)
|
||||
|
||||
if turn.tool_calls_from is not None:
|
||||
row = bindings.get(turn.tool_calls_from)
|
||||
tool_calls = row.get("tool_calls") if isinstance(row, dict) else None
|
||||
if tool_calls:
|
||||
message["tool_calls"] = copy.deepcopy(tool_calls)
|
||||
|
||||
message_idx = len(messages)
|
||||
messages.append(message)
|
||||
streams.append(turn.stream)
|
||||
if turn.target:
|
||||
target_indices.append(message_idx)
|
||||
|
||||
if not target_indices:
|
||||
return None
|
||||
|
||||
rendered = {
|
||||
"messages": messages,
|
||||
"message_streams": streams,
|
||||
"target_message_indices": target_indices,
|
||||
}
|
||||
_validate_rendered(rendered)
|
||||
return rendered
|
||||
|
||||
|
||||
def _render_content(
|
||||
content: str | list[dict[str, Any]],
|
||||
bindings: dict[str, LanguageRow | str | None],
|
||||
) -> str | list[dict[str, Any]]:
|
||||
"""Substitute bindings into a string or each string field of multimodal blocks."""
|
||||
if isinstance(content, str):
|
||||
return _substitute(content, bindings)
|
||||
|
||||
rendered_blocks = []
|
||||
for block in content:
|
||||
rendered_block = copy.deepcopy(block)
|
||||
for key, value in rendered_block.items():
|
||||
if isinstance(value, str):
|
||||
rendered_block[key] = _substitute(value, bindings)
|
||||
rendered_blocks.append(rendered_block)
|
||||
return rendered_blocks
|
||||
|
||||
|
||||
def _substitute(template: str, bindings: dict[str, LanguageRow | str | None]) -> str:
|
||||
"""Replace ``${name}`` placeholders in ``template`` with their bound values."""
|
||||
|
||||
def replace(match: re.Match[str]) -> str:
|
||||
"""Resolve a single ``${name}`` match to its bound string value."""
|
||||
name = match.group(1)
|
||||
if name not in bindings:
|
||||
raise ValueError(f"Unknown template binding: {name!r}")
|
||||
value = bindings[name]
|
||||
if value is None:
|
||||
return ""
|
||||
if isinstance(value, dict):
|
||||
content = value.get("content")
|
||||
return "" if content is None else str(content)
|
||||
return str(value)
|
||||
|
||||
return PLACEHOLDER_RE.sub(replace, template)
|
||||
|
||||
|
||||
def _validate_rendered(rendered: RenderedMessages) -> None:
|
||||
"""Sanity-check the rendered output for stream/target alignment."""
|
||||
messages = rendered["messages"]
|
||||
streams = rendered["message_streams"]
|
||||
target_indices = rendered["target_message_indices"]
|
||||
|
||||
if len(streams) != len(messages):
|
||||
raise ValueError("message_streams must be aligned with messages.")
|
||||
if not target_indices:
|
||||
raise ValueError("Rendered samples must contain at least one target message.")
|
||||
for idx in target_indices:
|
||||
if idx < 0 or idx >= len(messages):
|
||||
raise ValueError(f"Target message index {idx} is out of bounds.")
|
||||
# ``stream`` is enforced non-None at MessageTurn construction time
|
||||
# (see ``MessageTurn.__post_init__``), so a missing stream here would
|
||||
# mean the dataclass invariant was bypassed; no need to re-check.
|
||||
|
||||
|
||||
def _nth_relative(
|
||||
name: str,
|
||||
t: float,
|
||||
persistent: Sequence[LanguageRow],
|
||||
style: str | None,
|
||||
offset: int,
|
||||
role: str | None,
|
||||
tool_name: str | None,
|
||||
camera: str | None,
|
||||
) -> LanguageRow | None:
|
||||
"""Shared body for ``nth_prev`` / ``nth_next`` with signed ``offset``."""
|
||||
_validate_persistent_resolver(name, style)
|
||||
if abs(offset) < 1:
|
||||
raise ValueError(f"{name} offset must be non-zero.")
|
||||
|
||||
rows = sorted(
|
||||
_matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera),
|
||||
key=_row_sort_key,
|
||||
)
|
||||
if not rows:
|
||||
return None
|
||||
|
||||
anchor_idx = None
|
||||
for idx, row in enumerate(rows):
|
||||
if _timestamp(row) <= t:
|
||||
anchor_idx = idx
|
||||
else:
|
||||
break
|
||||
|
||||
target_idx = (offset - 1 if offset > 0 else None) if anchor_idx is None else anchor_idx + offset
|
||||
|
||||
if target_idx is None or target_idx < 0 or target_idx >= len(rows):
|
||||
return None
|
||||
return rows[target_idx]
|
||||
|
||||
|
||||
def _validate_persistent_resolver(name: str, style: str | None) -> None:
|
||||
"""Reject calls with missing or event-only ``style`` for persistent resolvers."""
|
||||
if style is None:
|
||||
raise ValueError(f"{name} requires a persistent style.")
|
||||
if column_for_style(style) != LANGUAGE_PERSISTENT:
|
||||
raise ValueError(f"{name} cannot be used with event-only style {style!r}.")
|
||||
|
||||
|
||||
def _matching_rows(
|
||||
rows: Sequence[LanguageRow],
|
||||
*,
|
||||
style: str | None,
|
||||
role: str | None,
|
||||
tool_name: str | None,
|
||||
camera: str | None,
|
||||
) -> list[LanguageRow]:
|
||||
"""Return ``rows`` filtered by optional ``style``/``role``/``tool_name``/``camera`` selectors."""
|
||||
return [
|
||||
row
|
||||
for row in rows
|
||||
if (style is None or row.get("style") == style)
|
||||
and (role is None or row.get("role") == role)
|
||||
and (tool_name is None or _row_has_tool_name(row, tool_name))
|
||||
and (camera is None or row.get("camera") == camera)
|
||||
]
|
||||
|
||||
|
||||
def _select_one(
|
||||
rows: Sequence[LanguageRow],
|
||||
*,
|
||||
style: str | None,
|
||||
role: str | None,
|
||||
tool_name: str | None,
|
||||
camera: str | None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the single matching row, or raise if the resolver is ambiguous.
|
||||
|
||||
Multiple matches always raise — even when the caller already passed
|
||||
some selectors — because remaining ambiguity means the data has
|
||||
several rows that look identical to the resolver and the caller
|
||||
needs to pin down a specific one (e.g. add ``camera=...`` for VQA
|
||||
rows shared across cameras).
|
||||
"""
|
||||
if not rows:
|
||||
return None
|
||||
if len(rows) > 1:
|
||||
raise ValueError(
|
||||
f"Ambiguous resolver for style={style!r} role={role!r} "
|
||||
f"tool_name={tool_name!r} camera={camera!r}: {len(rows)} matching rows. "
|
||||
f"Add a selector that distinguishes them."
|
||||
)
|
||||
return rows[0]
|
||||
|
||||
|
||||
def _row_sort_key(row: LanguageRow) -> tuple[float, str, str]:
|
||||
"""Stable sort key for both persistent and event rows.
|
||||
|
||||
Event rows lack ``timestamp`` (it is implicit in the frame), so default
|
||||
to ``0.0`` — within a single frame all event rows share the same sort
|
||||
bucket and are tiebroken by ``(style, role)``.
|
||||
"""
|
||||
timestamp = row.get("timestamp")
|
||||
ts = (
|
||||
float(timestamp.item() if hasattr(timestamp, "item") else timestamp) if timestamp is not None else 0.0
|
||||
)
|
||||
return (ts, row.get("style") or "", row.get("role") or "")
|
||||
|
||||
|
||||
def _timestamp(row: LanguageRow) -> float:
|
||||
"""Extract a row's ``timestamp`` as a Python float (unwrapping numpy scalars)."""
|
||||
value = row["timestamp"]
|
||||
return float(value.item() if hasattr(value, "item") else value)
|
||||
|
||||
|
||||
def _row_has_tool_name(row: LanguageRow, tool_name: str) -> bool:
|
||||
"""Return ``True`` if any of the row's tool calls invokes ``tool_name``."""
|
||||
for tool_call in row.get("tool_calls") or []:
|
||||
if isinstance(tool_call, str):
|
||||
continue
|
||||
function = tool_call.get("function") if isinstance(tool_call, dict) else None
|
||||
if isinstance(function, dict) and function.get("name") == tool_name:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _normalize_rows(rows: Sequence[Any]) -> list[LanguageRow]:
|
||||
"""Convert pyarrow scalars / mappings into a fresh list of plain dict rows."""
|
||||
normalized = []
|
||||
for row in rows:
|
||||
if row is None:
|
||||
continue
|
||||
if hasattr(row, "as_py"):
|
||||
row = row.as_py()
|
||||
if not isinstance(row, dict):
|
||||
raise TypeError(f"Language rows must be dictionaries, got {type(row).__name__}.")
|
||||
normalized.append(dict(row))
|
||||
return normalized
|
||||
@@ -630,8 +630,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
streaming_encoding: bool = False,
|
||||
encoder_queue_maxsize: int = 30,
|
||||
encoder_threads: int | None = None,
|
||||
video_files_size_in_mb: int | None = None,
|
||||
data_files_size_in_mb: int | None = None,
|
||||
) -> "LeRobotDataset":
|
||||
"""Create a new LeRobotDataset from scratch for recording data.
|
||||
|
||||
@@ -679,8 +677,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
root=root,
|
||||
use_videos=use_videos,
|
||||
metadata_buffer_size=metadata_buffer_size,
|
||||
video_files_size_in_mb=video_files_size_in_mb,
|
||||
data_files_size_in_mb=data_files_size_in_mb,
|
||||
)
|
||||
obj.repo_id = obj.meta.repo_id
|
||||
obj._requested_root = obj.meta.root
|
||||
|
||||
@@ -123,7 +123,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
|
||||
|
||||
NOTE: Fow now, this relies on a check in __init__ to make sure all sub-datasets have the same info.
|
||||
"""
|
||||
return self._datasets[0].meta.info.fps
|
||||
return self._datasets[0].meta.info["fps"]
|
||||
|
||||
@property
|
||||
def video(self) -> bool:
|
||||
@@ -133,7 +133,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
|
||||
|
||||
NOTE: Fow now, this relies on a check in __init__ to make sure all sub-datasets have the same info.
|
||||
"""
|
||||
return len(self._datasets[0].meta.video_keys) > 0
|
||||
return self._datasets[0].meta.info.get("video", False)
|
||||
|
||||
@property
|
||||
def features(self) -> datasets.Features:
|
||||
|
||||
@@ -434,7 +434,7 @@ class StreamingLeRobotDataset(torch.utils.data.IterableDataset):
|
||||
|
||||
def _make_padding_camera_frame(self, camera_key: str):
|
||||
"""Variable-shape padding frame for given camera keys, given in (H, W, C)"""
|
||||
return torch.zeros(self.meta.info.features[camera_key]["shape"]).permute(-1, 0, 1)
|
||||
return torch.zeros(self.meta.info["features"][camera_key]["shape"]).permute(-1, 0, 1)
|
||||
|
||||
def _get_video_frame_padding_mask(
|
||||
self,
|
||||
|
||||
@@ -14,11 +14,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import importlib.resources
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
@@ -72,12 +70,9 @@ class ForwardCompatibilityError(CompatibilityError):
|
||||
super().__init__(message)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
DEFAULT_CHUNK_SIZE = 1000 # Max number of files per chunk
|
||||
DEFAULT_DATA_FILE_SIZE_IN_MB = 100 # Max size per file
|
||||
DEFAULT_VIDEO_FILE_SIZE_IN_MB = 200 # Max size per file
|
||||
DEFAULT_DATA_FILE_SIZE_IN_MB = 50 # Max size per file
|
||||
DEFAULT_VIDEO_FILE_SIZE_IN_MB = 100 # Max size per file
|
||||
|
||||
INFO_PATH = "meta/info.json"
|
||||
STATS_PATH = "meta/stats.json"
|
||||
@@ -88,6 +83,7 @@ VIDEO_DIR = "videos"
|
||||
|
||||
CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
|
||||
DEFAULT_TASKS_PATH = "meta/tasks.parquet"
|
||||
DEFAULT_SUBTASKS_PATH = "meta/subtasks.parquet"
|
||||
DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
|
||||
DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
|
||||
DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
|
||||
@@ -98,130 +94,6 @@ LEGACY_EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
|
||||
LEGACY_TASKS_PATH = "meta/tasks.jsonl"
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatasetInfo:
|
||||
"""Typed representation of the ``meta/info.json`` file for a LeRobot dataset.
|
||||
|
||||
Replaces the previously untyped ``dict`` returned by ``load_info()`` and
|
||||
created by ``create_empty_dataset_info()``. Using a dataclass provides
|
||||
explicit field definitions, IDE auto-completion, and validation at
|
||||
construction time.
|
||||
"""
|
||||
|
||||
codebase_version: str
|
||||
fps: int
|
||||
features: dict[str, dict]
|
||||
|
||||
# Episode / frame counters — start at zero for new datasets
|
||||
total_episodes: int = 0
|
||||
total_frames: int = 0
|
||||
total_tasks: int = 0
|
||||
|
||||
# Storage settings
|
||||
chunks_size: int = field(default=DEFAULT_CHUNK_SIZE)
|
||||
data_files_size_in_mb: int = field(default=DEFAULT_DATA_FILE_SIZE_IN_MB)
|
||||
video_files_size_in_mb: int = field(default=DEFAULT_VIDEO_FILE_SIZE_IN_MB)
|
||||
|
||||
# File path templates
|
||||
data_path: str = field(default=DEFAULT_DATA_PATH)
|
||||
video_path: str | None = field(default=DEFAULT_VIDEO_PATH)
|
||||
|
||||
# Optional metadata
|
||||
robot_type: str | None = None
|
||||
splits: dict[str, str] = field(default_factory=dict)
|
||||
# OpenAI-style tool schemas declared by the dataset. ``None`` means the
|
||||
# dataset doesn't declare any — readers fall back to ``DEFAULT_TOOLS``.
|
||||
tools: list[dict] | None = None
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
# Coerce feature shapes from list to tuple — JSON deserialisation
|
||||
# returns lists, but the rest of the codebase expects tuples.
|
||||
for ft in self.features.values():
|
||||
if isinstance(ft.get("shape"), list):
|
||||
ft["shape"] = tuple(ft["shape"])
|
||||
|
||||
if self.fps <= 0:
|
||||
raise ValueError(f"fps must be positive, got {self.fps}")
|
||||
if self.chunks_size <= 0:
|
||||
raise ValueError(f"chunks_size must be positive, got {self.chunks_size}")
|
||||
if self.data_files_size_in_mb <= 0:
|
||||
raise ValueError(f"data_files_size_in_mb must be positive, got {self.data_files_size_in_mb}")
|
||||
if self.video_files_size_in_mb <= 0:
|
||||
raise ValueError(f"video_files_size_in_mb must be positive, got {self.video_files_size_in_mb}")
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Return a JSON-serialisable dict.
|
||||
|
||||
Converts tuple shapes back to lists so ``json.dump`` can handle them.
|
||||
Drops ``tools`` when unset so existing datasets keep a clean
|
||||
``info.json``.
|
||||
"""
|
||||
d = dataclasses.asdict(self)
|
||||
for ft in d["features"].values():
|
||||
if isinstance(ft.get("shape"), tuple):
|
||||
ft["shape"] = list(ft["shape"])
|
||||
if d.get("tools") is None:
|
||||
d.pop("tools", None)
|
||||
return d
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> "DatasetInfo":
|
||||
"""Construct from a raw dict (e.g. loaded directly from JSON).
|
||||
|
||||
Unknown keys are ignored for forward compatibility with datasets that
|
||||
carry additional fields (e.g. ``total_videos`` from v2.x). A warning is
|
||||
logged when such fields are present.
|
||||
"""
|
||||
known = {f.name for f in dataclasses.fields(cls)}
|
||||
unknown = sorted(k for k in data if k not in known)
|
||||
if unknown:
|
||||
logger.warning(f"Unknown fields in DatasetInfo: {unknown}. These will be ignored.")
|
||||
return cls(**{k: v for k, v in data.items() if k in known})
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Temporary dict-style compatibility layer
|
||||
# Allows existing ``info["key"]`` call-sites to keep working without changes.
|
||||
# Once all callers have been migrated to attribute access, remove these.
|
||||
# ---------------------------------------------------------------------------
|
||||
def __getitem__(self, key: str):
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
f"Accessing DatasetInfo with dict-style syntax info['{key}'] is deprecated. "
|
||||
f"Use attribute access info.{key} instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
try:
|
||||
return getattr(self, key)
|
||||
except AttributeError as err:
|
||||
raise KeyError(key) from err
|
||||
|
||||
def __setitem__(self, key: str, value) -> None:
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
f"Setting DatasetInfo with dict-style syntax info['{key}'] = ... is deprecated. "
|
||||
f"Use attribute assignment info.{key} = ... instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
if not hasattr(self, key):
|
||||
raise KeyError(f"DatasetInfo has no field '{key}'")
|
||||
setattr(self, key, value)
|
||||
|
||||
def __contains__(self, key: str) -> bool:
|
||||
"""Check if a field exists (dict-like interface)."""
|
||||
return hasattr(self, key)
|
||||
|
||||
def get(self, key: str, default=None):
|
||||
"""Get attribute value with default fallback (dict-like interface)."""
|
||||
try:
|
||||
return getattr(self, key)
|
||||
except AttributeError:
|
||||
return default
|
||||
|
||||
|
||||
def has_legacy_hub_download_metadata(root: Path) -> bool:
|
||||
"""Return ``True`` when *root* looks like a legacy Hub ``local_dir`` mirror.
|
||||
|
||||
@@ -422,7 +294,7 @@ def create_branch(repo_id: str, *, branch: str, repo_type: str | None = None) ->
|
||||
|
||||
def create_lerobot_dataset_card(
|
||||
tags: list | None = None,
|
||||
dataset_info: DatasetInfo | None = None,
|
||||
dataset_info: dict | None = None,
|
||||
**kwargs,
|
||||
) -> DatasetCard:
|
||||
"""Create a `DatasetCard` for a LeRobot dataset.
|
||||
@@ -433,7 +305,7 @@ def create_lerobot_dataset_card(
|
||||
|
||||
Args:
|
||||
tags (list | None): A list of tags to add to the dataset card.
|
||||
dataset_info (DatasetInfo | None): The dataset's info object, which will
|
||||
dataset_info (dict | None): The dataset's info dictionary, which will
|
||||
be displayed on the card.
|
||||
**kwargs: Additional keyword arguments to populate the card template.
|
||||
|
||||
@@ -446,7 +318,7 @@ def create_lerobot_dataset_card(
|
||||
card_tags += tags
|
||||
if dataset_info:
|
||||
dataset_structure = "[meta/info.json](meta/info.json):\n"
|
||||
dataset_structure += f"```json\n{json.dumps(dataset_info.to_dict(), indent=4)}\n```\n"
|
||||
dataset_structure += f"```json\n{json.dumps(dataset_info, indent=4)}\n```\n"
|
||||
kwargs = {**kwargs, "dataset_structure": dataset_structure}
|
||||
card_data = DatasetCardData(
|
||||
license=kwargs.get("license"),
|
||||
|
||||
@@ -331,7 +331,6 @@ class LiberoEnv(EnvConfig):
|
||||
camera_name_mapping: dict[str, str] | None = None
|
||||
observation_height: int = 360
|
||||
observation_width: int = 360
|
||||
is_libero_plus: bool = False
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
|
||||
@@ -433,7 +432,6 @@ class LiberoEnv(EnvConfig):
|
||||
control_mode=self.control_mode,
|
||||
episode_length=self.episode_length,
|
||||
camera_name_mapping=self.camera_name_mapping,
|
||||
is_libero_plus=self.is_libero_plus,
|
||||
)
|
||||
|
||||
def get_env_processors(self):
|
||||
@@ -498,146 +496,6 @@ class MetaworldEnv(EnvConfig):
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robocasa")
|
||||
@dataclass
|
||||
class RoboCasaEnv(EnvConfig):
|
||||
task: str = "CloseFridge"
|
||||
fps: int = 20
|
||||
episode_length: int = 1000
|
||||
obs_type: str = "pixels_agent_pos"
|
||||
render_mode: str = "rgb_array"
|
||||
camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
|
||||
observation_height: int = 256
|
||||
observation_width: int = 256
|
||||
visualization_height: int = 512
|
||||
visualization_width: int = 512
|
||||
split: str | None = None
|
||||
# Object-mesh registries to sample from. Upstream default is
|
||||
# ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image
|
||||
# only ships the lightwheel pack. Override to include objaverse once
|
||||
# you've run `python -m robocasa.scripts.download_kitchen_assets
|
||||
# --type objaverse` locally.
|
||||
obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
|
||||
)
|
||||
features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE})
|
||||
|
||||
def __post_init__(self):
|
||||
if self.obs_type not in ("pixels", "pixels_agent_pos"):
|
||||
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
|
||||
|
||||
# Preserve raw RoboCasa camera names end-to-end (e.g.
|
||||
# `observation.images.robot0_agentview_left`). This matches the
|
||||
# naming convention used by the RoboCasa datasets on the Hub, so
|
||||
# trained policies don't need a `--rename_map` at eval time.
|
||||
cams = [c.strip() for c in self.camera_name.split(",") if c.strip()]
|
||||
for cam in cams:
|
||||
self.features[f"pixels/{cam}"] = PolicyFeature(
|
||||
type=FeatureType.VISUAL,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
)
|
||||
self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}"
|
||||
|
||||
if self.obs_type == "pixels_agent_pos":
|
||||
self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
kwargs: dict[str, Any] = {
|
||||
"obs_type": self.obs_type,
|
||||
"render_mode": self.render_mode,
|
||||
"observation_height": self.observation_height,
|
||||
"observation_width": self.observation_width,
|
||||
"visualization_height": self.visualization_height,
|
||||
"visualization_width": self.visualization_width,
|
||||
}
|
||||
if self.split is not None:
|
||||
kwargs["split"] = self.split
|
||||
return kwargs
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = False):
|
||||
from .robocasa import create_robocasa_envs
|
||||
|
||||
if self.task is None:
|
||||
raise ValueError("RoboCasaEnv requires a task to be specified")
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
return create_robocasa_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
camera_name=self.camera_name,
|
||||
gym_kwargs=self.gym_kwargs,
|
||||
env_cls=env_cls,
|
||||
episode_length=self.episode_length,
|
||||
obj_registries=tuple(self.obj_registries),
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("vlabench")
|
||||
@dataclass
|
||||
class VLABenchEnv(EnvConfig):
|
||||
task: str = "select_fruit"
|
||||
fps: int = 10
|
||||
episode_length: int = 500
|
||||
obs_type: str = "pixels_agent_pos"
|
||||
render_mode: str = "rgb_array"
|
||||
render_resolution: tuple[int, int] = (480, 480)
|
||||
robot: str = "franka"
|
||||
action_mode: str = "eef"
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
|
||||
}
|
||||
)
|
||||
features_map: dict[str, str] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"agent_pos": OBS_STATE,
|
||||
"pixels/image": f"{OBS_IMAGES}.image",
|
||||
"pixels/second_image": f"{OBS_IMAGES}.second_image",
|
||||
"pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
|
||||
}
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
h, w = self.render_resolution
|
||||
if self.obs_type == "pixels":
|
||||
self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
|
||||
self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
|
||||
self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
|
||||
elif self.obs_type == "pixels_agent_pos":
|
||||
self.features["pixels/image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
|
||||
self.features["pixels/second_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
|
||||
self.features["pixels/wrist_image"] = PolicyFeature(type=FeatureType.VISUAL, shape=(h, w, 3))
|
||||
self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(7,))
|
||||
else:
|
||||
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {
|
||||
"obs_type": self.obs_type,
|
||||
"render_mode": self.render_mode,
|
||||
"render_resolution": self.render_resolution,
|
||||
"robot": self.robot,
|
||||
"max_episode_steps": self.episode_length,
|
||||
"action_mode": self.action_mode,
|
||||
}
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = False):
|
||||
from .vlabench import create_vlabench_envs
|
||||
|
||||
if self.task is None:
|
||||
raise ValueError("VLABenchEnv requires a task to be specified")
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
return create_vlabench_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
gym_kwargs=self.gym_kwargs,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("isaaclab_arena")
|
||||
@dataclass
|
||||
class IsaaclabArenaEnv(HubEnvConfig):
|
||||
@@ -716,171 +574,3 @@ class IsaaclabArenaEnv(HubEnvConfig):
|
||||
),
|
||||
PolicyProcessorPipeline(steps=[]),
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("libero_plus")
|
||||
@dataclass
|
||||
class LiberoPlusEnv(LiberoEnv):
|
||||
"""Config for LIBERO-plus robustness benchmark evaluation.
|
||||
|
||||
LIBERO-plus extends LIBERO with 7 perturbation dimensions (camera viewpoints,
|
||||
object layouts, robot initial states, language instructions, lighting, background
|
||||
textures, sensor noise) producing ~10k task variants.
|
||||
|
||||
The gym interface is identical to LIBERO so this class reuses ``LiberoEnv``
|
||||
entirely — only the registered name and default task suite differ.
|
||||
|
||||
Install: see docker/Dockerfile.benchmark.libero_plus — LIBERO-plus ships
|
||||
as a namespace package from a git fork and must be cloned + PYTHONPATH'd
|
||||
rather than installed as a pyproject extra.
|
||||
|
||||
See Also:
|
||||
https://github.com/sylvestf/LIBERO-plus
|
||||
"""
|
||||
|
||||
task: str = "libero_spatial"
|
||||
is_libero_plus: bool = True
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robotwin")
|
||||
@dataclass
|
||||
class RoboTwinEnvConfig(EnvConfig):
|
||||
"""Configuration for RoboTwin 2.0 benchmark environments.
|
||||
|
||||
RoboTwin 2.0 is a dual-arm manipulation benchmark with 50 tasks built on the
|
||||
SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF
|
||||
(7 per arm). All three cameras are enabled by default.
|
||||
|
||||
See: https://robotwin-platform.github.io
|
||||
Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified
|
||||
"""
|
||||
|
||||
task: str = "beat_block_hammer" # single task or comma-separated list
|
||||
fps: int = 25
|
||||
episode_length: int = 300
|
||||
obs_type: str = "pixels_agent_pos"
|
||||
render_mode: str = "rgb_array"
|
||||
# Available cameras from RoboTwin's aloha-agilex embodiment: head_camera
|
||||
# (torso-mounted) + left_camera / right_camera (wrists).
|
||||
camera_names: str = "head_camera,left_camera,right_camera"
|
||||
# Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml).
|
||||
# Gym's vector-env concatenate pre-allocates buffers of this shape, so it
|
||||
# must equal what SAPIEN actually renders.
|
||||
observation_height: int = 240
|
||||
observation_width: int = 320
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
|
||||
}
|
||||
)
|
||||
features_map: dict[str, str] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"pixels/head_camera": f"{OBS_IMAGES}.head_camera",
|
||||
"pixels/left_camera": f"{OBS_IMAGES}.left_camera",
|
||||
"pixels/right_camera": f"{OBS_IMAGES}.right_camera",
|
||||
"agent_pos": OBS_STATE,
|
||||
}
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
|
||||
for cam in cam_list:
|
||||
self.features[f"pixels/{cam}"] = PolicyFeature(
|
||||
type=FeatureType.VISUAL,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
)
|
||||
# Keep features_map entry if already set (default_factory); add if missing.
|
||||
key = f"pixels/{cam}"
|
||||
if key not in self.features_map:
|
||||
self.features_map[key] = f"{OBS_IMAGES}.{cam}"
|
||||
|
||||
if self.obs_type == "pixels_agent_pos":
|
||||
self.features["agent_pos"] = PolicyFeature(
|
||||
type=FeatureType.STATE,
|
||||
shape=(14,), # 14 DOF: 7 per arm
|
||||
)
|
||||
elif self.obs_type != "pixels":
|
||||
raise ValueError(
|
||||
f"Unsupported obs_type '{self.obs_type}'. "
|
||||
"RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'."
|
||||
)
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {}
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||
from lerobot.envs.robotwin import create_robotwin_envs
|
||||
|
||||
if not self.task:
|
||||
raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
|
||||
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
|
||||
return create_robotwin_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
env_cls=env_cls,
|
||||
camera_names=cam_list,
|
||||
observation_height=self.observation_height,
|
||||
observation_width=self.observation_width,
|
||||
episode_length=self.episode_length,
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robomme")
|
||||
@dataclass
|
||||
class RoboMMEEnv(EnvConfig):
|
||||
"""RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN).
|
||||
|
||||
16 tasks across 4 suites: Counting, Permanence, Reference, Imitation.
|
||||
Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes).
|
||||
Benchmark: https://github.com/RoboMME/robomme_benchmark
|
||||
|
||||
Requires the `robomme` git package installed separately (Linux only);
|
||||
see docker/Dockerfile.benchmark.robomme for the canonical install.
|
||||
"""
|
||||
|
||||
task: str = "PickXtimes"
|
||||
fps: int = 10
|
||||
episode_length: int = 300
|
||||
action_space: str = "joint_angle" # or "ee_pose" (7-D)
|
||||
dataset_split: str = "test" # "train" | "val" | "test"
|
||||
task_ids: list[int] | None = None
|
||||
features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||
features_map: dict[str, str] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"pixels/image": f"{OBS_IMAGES}.image",
|
||||
"pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
|
||||
"agent_pos": OBS_STATE,
|
||||
}
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
action_dim = 8 if self.action_space == "joint_angle" else 7
|
||||
self.features = {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,)),
|
||||
"pixels/image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
|
||||
"pixels/wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
|
||||
"agent_pos": PolicyFeature(type=FeatureType.STATE, shape=(8,)),
|
||||
}
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {}
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||
from lerobot.envs.robomme import create_robomme_envs
|
||||
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
return create_robomme_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
action_space_type=self.action_space,
|
||||
dataset=self.dataset_split,
|
||||
episode_length=self.episode_length,
|
||||
task_ids=self.task_ids,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
|
||||
+26
-46
@@ -16,7 +16,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Iterable, Mapping, Sequence
|
||||
from functools import partial
|
||||
@@ -32,7 +31,20 @@ from libero.libero.envs import OffScreenRenderEnv
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv, parse_camera_names
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
|
||||
def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
|
||||
"""Normalize camera_name into a non-empty list of strings."""
|
||||
if isinstance(camera_name, str):
|
||||
cams = [c.strip() for c in camera_name.split(",") if c.strip()]
|
||||
elif isinstance(camera_name, (list | tuple)):
|
||||
cams = [str(c).strip() for c in camera_name if str(c).strip()]
|
||||
else:
|
||||
raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
|
||||
if not cams:
|
||||
raise ValueError("camera_name resolved to an empty list.")
|
||||
return cams
|
||||
|
||||
|
||||
def _get_suite(name: str) -> benchmark.Benchmark:
|
||||
@@ -57,34 +69,14 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
|
||||
return ids
|
||||
|
||||
|
||||
# LIBERO-plus perturbation variants encode the perturbation in the filename
|
||||
# but on disk only the base `.pruned_init` exists — strip the suffix to match
|
||||
# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we
|
||||
# can pass weights_only=False for PyTorch 2.6+ numpy pickles).
|
||||
_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
|
||||
|
||||
|
||||
def get_task_init_states(task_suite: Any, i: int, is_libero_plus: bool = False) -> np.ndarray:
|
||||
task = task_suite.tasks[i]
|
||||
filename = Path(task.init_states_file)
|
||||
root = Path(get_libero_path("init_states"))
|
||||
|
||||
if not is_libero_plus:
|
||||
init_states_path = root / task.problem_folder / filename.name
|
||||
return torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
|
||||
# LIBERO-plus: `_add_` / `_level` variants store extra-object layouts under
|
||||
# libero_newobj/ as a flat array that must be reshaped to (1, -1).
|
||||
if "_add_" in filename.name or "_level" in filename.name:
|
||||
init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
|
||||
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
return init_states.reshape(1, -1)
|
||||
|
||||
# LIBERO-plus perturbation variants encode the perturbation in the filename
|
||||
# but on disk only the base `.pruned_init` exists — strip the suffix to match.
|
||||
stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
|
||||
init_states_path = root / task.problem_folder / stripped
|
||||
return torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
|
||||
init_states_path = (
|
||||
Path(get_libero_path("init_states"))
|
||||
/ task_suite.tasks[i].problem_folder
|
||||
/ task_suite.tasks[i].init_states_file
|
||||
)
|
||||
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
return init_states
|
||||
|
||||
|
||||
def get_libero_dummy_action():
|
||||
@@ -126,11 +118,9 @@ class LiberoEnv(gym.Env):
|
||||
camera_name_mapping: dict[str, str] | None = None,
|
||||
num_steps_wait: int = 10,
|
||||
control_mode: str = "relative",
|
||||
is_libero_plus: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.task_id = task_id
|
||||
self.is_libero_plus = is_libero_plus
|
||||
self.obs_type = obs_type
|
||||
self.render_mode = render_mode
|
||||
self.observation_width = observation_width
|
||||
@@ -138,7 +128,7 @@ class LiberoEnv(gym.Env):
|
||||
self.visualization_width = visualization_width
|
||||
self.visualization_height = visualization_height
|
||||
self.init_states = init_states
|
||||
self.camera_name = parse_camera_names(
|
||||
self.camera_name = _parse_camera_names(
|
||||
camera_name
|
||||
) # agentview_image (main) or robot0_eye_in_hand_image (wrist)
|
||||
|
||||
@@ -157,11 +147,7 @@ class LiberoEnv(gym.Env):
|
||||
self.episode_index = episode_index
|
||||
self.episode_length = episode_length
|
||||
# Load once and keep
|
||||
self._init_states = (
|
||||
get_task_init_states(task_suite, self.task_id, is_libero_plus=self.is_libero_plus)
|
||||
if self.init_states
|
||||
else None
|
||||
)
|
||||
self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None
|
||||
self._reset_stride = n_envs # when performing a reset, append `_reset_stride` to `init_state_id`.
|
||||
|
||||
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state
|
||||
@@ -394,7 +380,6 @@ def _make_env_fns(
|
||||
gym_kwargs: Mapping[str, Any],
|
||||
control_mode: str,
|
||||
camera_name_mapping: dict[str, str] | None = None,
|
||||
is_libero_plus: bool = False,
|
||||
) -> list[Callable[[], LiberoEnv]]:
|
||||
"""Build n_envs factory callables for a single (suite, task_id)."""
|
||||
|
||||
@@ -411,7 +396,6 @@ def _make_env_fns(
|
||||
n_envs=n_envs,
|
||||
control_mode=control_mode,
|
||||
camera_name_mapping=camera_name_mapping,
|
||||
is_libero_plus=is_libero_plus,
|
||||
**local_kwargs,
|
||||
)
|
||||
|
||||
@@ -434,7 +418,6 @@ def create_libero_envs(
|
||||
control_mode: str = "relative",
|
||||
episode_length: int | None = None,
|
||||
camera_name_mapping: dict[str, str] | None = None,
|
||||
is_libero_plus: bool = False,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""
|
||||
Create vectorized LIBERO environments with a consistent return shape.
|
||||
@@ -454,7 +437,7 @@ def create_libero_envs(
|
||||
gym_kwargs = dict(gym_kwargs or {})
|
||||
task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks
|
||||
|
||||
camera_names = parse_camera_names(camera_name)
|
||||
camera_names = _parse_camera_names(camera_name)
|
||||
suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
|
||||
if not suite_names:
|
||||
raise ValueError("`task` must contain at least one LIBERO suite name.")
|
||||
@@ -479,7 +462,6 @@ def create_libero_envs(
|
||||
# Probe once and reuse to avoid creating a temp env per task.
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
|
||||
for tid in selected:
|
||||
fns = _make_env_fns(
|
||||
@@ -493,14 +475,12 @@ def create_libero_envs(
|
||||
gym_kwargs=gym_kwargs,
|
||||
control_mode=control_mode,
|
||||
camera_name_mapping=camera_name_mapping,
|
||||
is_libero_plus=is_libero_plus,
|
||||
)
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[suite_name][tid] = lazy
|
||||
else:
|
||||
out[suite_name][tid] = env_cls(fns)
|
||||
|
||||
@@ -311,7 +311,6 @@ def create_metaworld_envs(
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space = None
|
||||
cached_act_space = None
|
||||
cached_metadata = None
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
|
||||
for group in task_groups:
|
||||
@@ -325,11 +324,10 @@ def create_metaworld_envs(
|
||||
fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
|
||||
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[group][tid] = lazy
|
||||
else:
|
||||
out[group][tid] = env_cls(fns)
|
||||
|
||||
@@ -1,425 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv, parse_camera_names
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
|
||||
# These correspond to the PandaOmron robot in RoboCasa365.
|
||||
OBS_STATE_DIM = 16 # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
|
||||
ACTION_DIM = 12 # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
|
||||
ACTION_LOW = -1.0
|
||||
ACTION_HIGH = 1.0
|
||||
|
||||
# Default PandaOmron cameras. We surface these raw names directly as
|
||||
# `observation.images.<name>` so the LeRobot dataset/policy keys match
|
||||
# RoboCasa's native convention (no implicit renaming).
|
||||
DEFAULT_CAMERAS = [
|
||||
"robot0_agentview_left",
|
||||
"robot0_eye_in_hand",
|
||||
"robot0_agentview_right",
|
||||
]
|
||||
|
||||
# Object-mesh registries to sample from. RoboCasa's upstream default is
|
||||
# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
|
||||
# most users — including our CI image — only download the lightwheel pack
|
||||
# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object
|
||||
# category has zero candidates in every registry, robocasa crashes with
|
||||
# `ValueError: Probabilities contain NaN` (0/0 divide in the probability
|
||||
# normalization). Restricting to registries that are actually on disk
|
||||
# avoids the NaN and matches what the asset download provides.
|
||||
DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",)
|
||||
|
||||
# Task-group shortcuts accepted as `--env.task`. When the user passes one of
|
||||
# these names, we expand it to the upstream RoboCasa task list and auto-set
|
||||
# the dataset split. Individual task names (optionally comma-separated) still
|
||||
# take precedence; this only triggers on an exact group-name match.
|
||||
_TASK_GROUP_SPLITS = {
|
||||
"atomic_seen": "target",
|
||||
"composite_seen": "target",
|
||||
"composite_unseen": "target",
|
||||
"pretrain50": "pretrain",
|
||||
"pretrain100": "pretrain",
|
||||
"pretrain200": "pretrain",
|
||||
"pretrain300": "pretrain",
|
||||
}
|
||||
|
||||
|
||||
def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
|
||||
"""Resolve a `--env.task` value to (task_names, split_override).
|
||||
|
||||
If `task` is a known task-group name (e.g. `atomic_seen`, `pretrain100`),
|
||||
expand it via `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS`
|
||||
and return the matching split. Otherwise treat `task` as a single task or
|
||||
comma-separated list and leave the split untouched (None).
|
||||
"""
|
||||
key = task.strip()
|
||||
if key in _TASK_GROUP_SPLITS:
|
||||
from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS
|
||||
|
||||
combined = {**TARGET_TASKS, **PRETRAINING_TASKS}
|
||||
if key not in combined:
|
||||
raise ValueError(
|
||||
f"Task group '{key}' is not available in this version of robocasa. "
|
||||
f"Known groups: {sorted(combined.keys())}."
|
||||
)
|
||||
return list(combined[key]), _TASK_GROUP_SPLITS[key]
|
||||
|
||||
names = [t.strip() for t in task.split(",") if t.strip()]
|
||||
if not names:
|
||||
raise ValueError("`task` must contain at least one RoboCasa task name.")
|
||||
return names, None
|
||||
|
||||
|
||||
def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
|
||||
"""Split a flat (12,) action vector into a RoboCasa action dict.
|
||||
|
||||
Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
|
||||
"""
|
||||
return {
|
||||
"action.base_motion": flat_action[0:4],
|
||||
"action.control_mode": flat_action[4:5],
|
||||
"action.end_effector_position": flat_action[5:8],
|
||||
"action.end_effector_rotation": flat_action[8:11],
|
||||
"action.gripper_close": flat_action[11:12],
|
||||
}
|
||||
|
||||
|
||||
class RoboCasaEnv(gym.Env):
|
||||
"""LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
|
||||
|
||||
Wraps RoboCasaGymEnv from the robocasa package and converts its
|
||||
dict-based observations and actions into the flat arrays LeRobot expects.
|
||||
Raw RoboCasa camera names are preserved verbatim under `pixels/<cam>`.
|
||||
"""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 20}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task: str,
|
||||
camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
|
||||
obs_type: str = "pixels_agent_pos",
|
||||
render_mode: str = "rgb_array",
|
||||
observation_width: int = 256,
|
||||
observation_height: int = 256,
|
||||
visualization_width: int = 512,
|
||||
visualization_height: int = 512,
|
||||
split: str | None = None,
|
||||
episode_length: int | None = None,
|
||||
obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
|
||||
episode_index: int = 0,
|
||||
):
|
||||
super().__init__()
|
||||
self.task = task
|
||||
self.obs_type = obs_type
|
||||
self.render_mode = render_mode
|
||||
self.observation_width = observation_width
|
||||
self.observation_height = observation_height
|
||||
self.visualization_width = visualization_width
|
||||
self.visualization_height = visualization_height
|
||||
self.split = split
|
||||
self.obj_registries = tuple(obj_registries)
|
||||
# Per-worker index (0..n_envs-1) used to spread the user-provided
|
||||
# seed across factories so each sub-env explores a distinct layout
|
||||
# even when the same seed is passed to `reset()`.
|
||||
self.episode_index = int(episode_index)
|
||||
|
||||
self.camera_name = parse_camera_names(camera_name)
|
||||
|
||||
self._max_episode_steps = episode_length if episode_length is not None else 1000
|
||||
|
||||
# Deferred — created on first reset() inside the worker subprocess
|
||||
# to avoid inheriting stale GPU/EGL contexts across fork().
|
||||
self._env: Any = None
|
||||
self.task_description = ""
|
||||
|
||||
images = {
|
||||
cam: spaces.Box(
|
||||
low=0,
|
||||
high=255,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
dtype=np.uint8,
|
||||
)
|
||||
for cam in self.camera_name
|
||||
}
|
||||
|
||||
if self.obs_type == "pixels":
|
||||
self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
|
||||
elif self.obs_type == "pixels_agent_pos":
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(images),
|
||||
"agent_pos": spaces.Box(
|
||||
low=-np.inf,
|
||||
high=np.inf,
|
||||
shape=(OBS_STATE_DIM,),
|
||||
dtype=np.float32,
|
||||
),
|
||||
}
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
|
||||
|
||||
self.action_space = spaces.Box(
|
||||
low=ACTION_LOW,
|
||||
high=ACTION_HIGH,
|
||||
shape=(ACTION_DIM,),
|
||||
dtype=np.float32,
|
||||
)
|
||||
|
||||
def _ensure_env(self) -> None:
|
||||
"""Create the underlying RoboCasaGymEnv on first use.
|
||||
|
||||
Called inside the worker subprocess after fork(), so each worker gets
|
||||
its own clean rendering context rather than inheriting a stale one from
|
||||
the parent process (which causes crashes with AsyncVectorEnv).
|
||||
"""
|
||||
if self._env is not None:
|
||||
return
|
||||
from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
|
||||
|
||||
# RoboCasaGymEnv defaults split="test", which create_env rejects
|
||||
# (only None/"all"/"pretrain"/"target" are valid). Always pass a
|
||||
# valid value so we don't hit that default. Extra kwargs are
|
||||
# forwarded to the underlying kitchen env via create_env/robosuite.make.
|
||||
self._env = RoboCasaGymEnv(
|
||||
env_name=self.task,
|
||||
camera_widths=self.observation_width,
|
||||
camera_heights=self.observation_height,
|
||||
split=self.split if self.split is not None else "all",
|
||||
obj_registries=self.obj_registries,
|
||||
)
|
||||
|
||||
ep_meta = self._env.env.get_ep_meta()
|
||||
self.task_description = ep_meta.get("lang", self.task)
|
||||
|
||||
def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
|
||||
"""Convert RoboCasaGymEnv observation dict to LeRobot format."""
|
||||
# RoboCasaGymEnv emits camera frames under "video.<cam>".
|
||||
images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs}
|
||||
|
||||
if self.obs_type == "pixels":
|
||||
return {"pixels": images}
|
||||
|
||||
# `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
|
||||
agent_pos = np.concatenate(
|
||||
[
|
||||
raw_obs.get("state.base_position", np.zeros(3)),
|
||||
raw_obs.get("state.base_rotation", np.zeros(4)),
|
||||
raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
|
||||
raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
|
||||
raw_obs.get("state.gripper_qpos", np.zeros(2)),
|
||||
],
|
||||
axis=-1,
|
||||
).astype(np.float32)
|
||||
|
||||
return {"pixels": images, "agent_pos": agent_pos}
|
||||
|
||||
def render(self) -> np.ndarray:
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
return self._env.render()
|
||||
|
||||
def reset(self, seed=None, **kwargs):
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
super().reset(seed=seed)
|
||||
# Spread the seed across workers so n_envs factories don't all
|
||||
# roll the same scene. With an explicit user seed we shift it by
|
||||
# episode_index; with no seed we fall back to episode_index so
|
||||
# each worker is still distinct rather than inheriting the same
|
||||
# global RNG state.
|
||||
worker_seed = seed + self.episode_index if seed is not None else self.episode_index
|
||||
raw_obs, info = self._env.reset(seed=worker_seed)
|
||||
|
||||
ep_meta = self._env.env.get_ep_meta()
|
||||
self.task_description = ep_meta.get("lang", self.task)
|
||||
|
||||
observation = self._format_raw_obs(raw_obs)
|
||||
info = {"is_success": False}
|
||||
return observation, info
|
||||
|
||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
if action.ndim != 1:
|
||||
raise ValueError(
|
||||
f"Expected action to be 1-D (shape (action_dim,)), "
|
||||
f"but got shape {action.shape} with ndim={action.ndim}"
|
||||
)
|
||||
|
||||
action_dict = convert_action(action)
|
||||
raw_obs, reward, done, truncated, info = self._env.step(action_dict)
|
||||
|
||||
is_success = bool(info.get("success", False))
|
||||
terminated = done or is_success
|
||||
info.update({"task": self.task, "done": done, "is_success": is_success})
|
||||
|
||||
observation = self._format_raw_obs(raw_obs)
|
||||
if terminated:
|
||||
info["final_info"] = {
|
||||
"task": self.task,
|
||||
"done": bool(done),
|
||||
"is_success": bool(is_success),
|
||||
}
|
||||
self.reset()
|
||||
|
||||
return observation, reward, terminated, truncated, info
|
||||
|
||||
def close(self):
|
||||
if self._env is not None:
|
||||
self._env.close()
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task: str,
|
||||
n_envs: int,
|
||||
camera_names: list[str],
|
||||
obs_type: str,
|
||||
render_mode: str,
|
||||
observation_width: int,
|
||||
observation_height: int,
|
||||
visualization_width: int,
|
||||
visualization_height: int,
|
||||
split: str | None,
|
||||
episode_length: int | None,
|
||||
obj_registries: Sequence[str],
|
||||
) -> list[Callable[[], RoboCasaEnv]]:
|
||||
"""Build n_envs factory callables for a single task.
|
||||
|
||||
Each factory carries a distinct ``episode_index`` (``0..n_envs-1``) so
|
||||
``RoboCasaEnv.reset()`` can derive a per-worker seed series from the
|
||||
user-provided seed.
|
||||
"""
|
||||
|
||||
def _make_env(episode_index: int) -> RoboCasaEnv:
|
||||
return RoboCasaEnv(
|
||||
task=task,
|
||||
camera_name=camera_names,
|
||||
obs_type=obs_type,
|
||||
render_mode=render_mode,
|
||||
observation_width=observation_width,
|
||||
observation_height=observation_height,
|
||||
visualization_width=visualization_width,
|
||||
visualization_height=visualization_height,
|
||||
split=split,
|
||||
episode_length=episode_length,
|
||||
obj_registries=obj_registries,
|
||||
episode_index=episode_index,
|
||||
)
|
||||
|
||||
return [partial(_make_env, i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robocasa_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
gym_kwargs: dict[str, Any] | None = None,
|
||||
camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
episode_length: int | None = None,
|
||||
obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""Create vectorized RoboCasa365 environments with a consistent return shape.
|
||||
|
||||
Returns:
|
||||
dict[task_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
|
||||
|
||||
`task` can be:
|
||||
- a single task name (e.g. `CloseFridge`)
|
||||
- a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`)
|
||||
- a benchmark-group shortcut (`atomic_seen`, `composite_seen`,
|
||||
`composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`,
|
||||
`pretrain300`), which auto-expands to the upstream task list and
|
||||
auto-sets the dataset `split` ("target" or "pretrain").
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
gym_kwargs = dict(gym_kwargs or {})
|
||||
obs_type = gym_kwargs.pop("obs_type", "pixels_agent_pos")
|
||||
render_mode = gym_kwargs.pop("render_mode", "rgb_array")
|
||||
observation_width = gym_kwargs.pop("observation_width", 256)
|
||||
observation_height = gym_kwargs.pop("observation_height", 256)
|
||||
visualization_width = gym_kwargs.pop("visualization_width", 512)
|
||||
visualization_height = gym_kwargs.pop("visualization_height", 512)
|
||||
split = gym_kwargs.pop("split", None)
|
||||
|
||||
camera_names = parse_camera_names(camera_name)
|
||||
task_names, group_split = _resolve_tasks(str(task))
|
||||
if group_split is not None and split is None:
|
||||
split = group_split
|
||||
|
||||
logger.info(
|
||||
"Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d",
|
||||
task_names,
|
||||
split,
|
||||
n_envs,
|
||||
)
|
||||
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
|
||||
for task_name in task_names:
|
||||
fns = _make_env_fns(
|
||||
task=task_name,
|
||||
n_envs=n_envs,
|
||||
camera_names=camera_names,
|
||||
obs_type=obs_type,
|
||||
render_mode=render_mode,
|
||||
observation_width=observation_width,
|
||||
observation_height=observation_height,
|
||||
visualization_width=visualization_width,
|
||||
visualization_height=visualization_height,
|
||||
split=split,
|
||||
episode_length=episode_length,
|
||||
obj_registries=obj_registries,
|
||||
)
|
||||
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[task_name][0] = lazy
|
||||
else:
|
||||
out[task_name][0] = env_cls(fns)
|
||||
logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
|
||||
|
||||
return {name: dict(task_map) for name, task_map in out.items()}
|
||||
@@ -1,245 +0,0 @@
|
||||
"""RoboMME environment wrapper for LeRobot evaluation.
|
||||
|
||||
Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible
|
||||
``VectorEnv`` suitable for ``lerobot_eval``.
|
||||
|
||||
RoboMME tasks:
|
||||
Counting: BinFill, PickXtimes, SwingXtimes, StopCube
|
||||
Permanence: VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap
|
||||
Reference: PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder
|
||||
Imitation: MoveCube, InsertPeg, PatternLock, RouteStick
|
||||
|
||||
Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes)
|
||||
Install: see docker/Dockerfile.benchmark.robomme (Linux only — mani-skill vs numpy pin conflict)
|
||||
Benchmark: https://github.com/RoboMME/robomme_benchmark
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
ROBOMME_TASKS = [
|
||||
"BinFill",
|
||||
"PickXtimes",
|
||||
"SwingXtimes",
|
||||
"StopCube",
|
||||
"VideoUnmask",
|
||||
"VideoUnmaskSwap",
|
||||
"ButtonUnmask",
|
||||
"ButtonUnmaskSwap",
|
||||
"PickHighlight",
|
||||
"VideoRepick",
|
||||
"VideoPlaceButton",
|
||||
"VideoPlaceOrder",
|
||||
"MoveCube",
|
||||
"InsertPeg",
|
||||
"PatternLock",
|
||||
"RouteStick",
|
||||
]
|
||||
|
||||
|
||||
class RoboMMEGymEnv(gym.Env):
|
||||
"""Thin Gymnasium wrapper around a single RoboMME episode env."""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task: str = "PickXtimes",
|
||||
action_space_type: str = "joint_angle",
|
||||
dataset: str = "test",
|
||||
episode_idx: int = 0,
|
||||
max_steps: int = 300,
|
||||
):
|
||||
super().__init__()
|
||||
from robomme.env_record_wrapper import BenchmarkEnvBuilder
|
||||
|
||||
self._task = task
|
||||
self._action_space_type = action_space_type
|
||||
self._dataset = dataset
|
||||
self._episode_idx = episode_idx
|
||||
self._max_steps = max_steps
|
||||
self._max_episode_steps = max_steps
|
||||
|
||||
self._builder = BenchmarkEnvBuilder(
|
||||
env_id=task,
|
||||
dataset=dataset,
|
||||
action_space=action_space_type,
|
||||
gui_render=False,
|
||||
max_steps=max_steps,
|
||||
)
|
||||
self._env = None
|
||||
self._last_raw_obs: dict | None = None
|
||||
|
||||
action_dim = 8 if action_space_type == "joint_angle" else 7
|
||||
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
|
||||
# `pixels` must be a nested Dict so `preprocess_observation()` in
|
||||
# envs/utils.py picks it up and maps each camera to
|
||||
# `observation.images.<cam>`. A flat layout (`pixels/image`,
|
||||
# `pixels/wrist_image`) silently drops every image from the batch.
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(
|
||||
{
|
||||
"image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
|
||||
"wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
|
||||
}
|
||||
),
|
||||
"agent_pos": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
|
||||
}
|
||||
)
|
||||
|
||||
def reset(self, *, seed=None, options=None):
|
||||
super().reset(seed=seed)
|
||||
self._env = self._builder.make_env_for_episode(
|
||||
episode_idx=self._episode_idx,
|
||||
max_steps=self._max_steps,
|
||||
)
|
||||
obs, info = self._env.reset()
|
||||
self._last_raw_obs = obs
|
||||
return self._convert_obs(obs), self._convert_info(info)
|
||||
|
||||
def step(self, action):
|
||||
obs, reward, terminated, truncated, info = self._env.step(action)
|
||||
self._last_raw_obs = obs
|
||||
|
||||
terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
|
||||
truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)
|
||||
|
||||
status = info.get("status", "ongoing")
|
||||
is_success = status == "success"
|
||||
conv_info = self._convert_info(info)
|
||||
conv_info["is_success"] = is_success
|
||||
|
||||
return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info
|
||||
|
||||
def render(self) -> np.ndarray | None:
|
||||
"""Return the front camera image from the last observation for video recording."""
|
||||
if self._last_raw_obs is None:
|
||||
return np.zeros((256, 256, 3), dtype=np.uint8)
|
||||
front = self._last_raw_obs.get("front_rgb_list")
|
||||
if front is None:
|
||||
return np.zeros((256, 256, 3), dtype=np.uint8)
|
||||
frame = front[-1] if isinstance(front, list) else front
|
||||
return np.asarray(frame, dtype=np.uint8)
|
||||
|
||||
def _convert_obs(self, obs: dict) -> dict:
|
||||
front_rgb = (
|
||||
obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"]
|
||||
)
|
||||
wrist_rgb = (
|
||||
obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"]
|
||||
)
|
||||
joint_state = (
|
||||
obs["joint_state_list"][-1]
|
||||
if isinstance(obs["joint_state_list"], list)
|
||||
else obs["joint_state_list"]
|
||||
)
|
||||
gripper_state = (
|
||||
obs["gripper_state_list"][-1]
|
||||
if isinstance(obs["gripper_state_list"], list)
|
||||
else obs["gripper_state_list"]
|
||||
)
|
||||
|
||||
front_rgb = np.asarray(front_rgb, dtype=np.uint8)
|
||||
wrist_rgb = np.asarray(wrist_rgb, dtype=np.uint8)
|
||||
joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7]
|
||||
gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1]
|
||||
state = np.concatenate([joint, gripper])
|
||||
|
||||
return {
|
||||
"pixels": {"image": front_rgb, "wrist_image": wrist_rgb},
|
||||
"agent_pos": state,
|
||||
}
|
||||
|
||||
def _convert_info(self, info: dict) -> dict:
|
||||
return {
|
||||
"status": info.get("status", "ongoing"),
|
||||
"task_goal": info.get("task_goal", ""),
|
||||
}
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task: str,
|
||||
n_envs: int,
|
||||
action_space_type: str,
|
||||
dataset: str,
|
||||
episode_length: int,
|
||||
task_id: int,
|
||||
) -> list[Callable[[], RoboMMEGymEnv]]:
|
||||
"""Build n_envs factory callables for one RoboMME task id."""
|
||||
|
||||
def _make_one(episode_index: int) -> RoboMMEGymEnv:
|
||||
return RoboMMEGymEnv(
|
||||
task=task,
|
||||
action_space_type=action_space_type,
|
||||
dataset=dataset,
|
||||
episode_idx=episode_index,
|
||||
max_steps=episode_length,
|
||||
)
|
||||
|
||||
return [partial(_make_one, task_id + i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robomme_envs(
|
||||
task: str,
|
||||
n_envs: int = 1,
|
||||
action_space_type: str = "joint_angle",
|
||||
dataset: str = "test",
|
||||
episode_length: int = 300,
|
||||
task_ids: list[int] | None = None,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
|
||||
"""Create vectorized RoboMME environments for evaluation.
|
||||
|
||||
`task` may be a single RoboMME task name (e.g. "PickXtimes") or a
|
||||
comma-separated list (e.g. "PickXtimes,BinFill,StopCube"). Each task
|
||||
becomes its own suite in the returned mapping.
|
||||
|
||||
Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format.
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
if task_ids is None:
|
||||
task_ids = [0]
|
||||
|
||||
task_names = [t.strip() for t in task.split(",") if t.strip()]
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
|
||||
for task_name in task_names:
|
||||
envs_by_task: dict[int, gym.vector.VectorEnv] = {}
|
||||
for task_id in task_ids:
|
||||
fns = _make_env_fns(
|
||||
task=task_name,
|
||||
n_envs=n_envs,
|
||||
action_space_type=action_space_type,
|
||||
dataset=dataset,
|
||||
episode_length=episode_length,
|
||||
task_id=task_id,
|
||||
)
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
envs_by_task[task_id] = lazy
|
||||
else:
|
||||
envs_by_task[task_id] = env_cls(fns)
|
||||
out[task_name] = envs_by_task
|
||||
return out
|
||||
@@ -1,488 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
import torch
|
||||
from gymnasium import spaces
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking
|
||||
# up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb").
|
||||
ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
|
||||
"head_camera",
|
||||
"left_camera",
|
||||
"right_camera",
|
||||
)
|
||||
|
||||
ACTION_DIM = 14 # 7 DOF × 2 arms
|
||||
ACTION_LOW = -1.0
|
||||
ACTION_HIGH = 1.0
|
||||
DEFAULT_EPISODE_LENGTH = 300
|
||||
# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects).
|
||||
DEFAULT_CAMERA_H = 240
|
||||
DEFAULT_CAMERA_W = 320
|
||||
|
||||
# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly
|
||||
# (50 tasks as of main; earlier revisions had 60 with a different split).
|
||||
# Keep this in sync with:
|
||||
# gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \
|
||||
# | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//'
|
||||
ROBOTWIN_TASKS: tuple[str, ...] = (
|
||||
"adjust_bottle",
|
||||
"beat_block_hammer",
|
||||
"blocks_ranking_rgb",
|
||||
"blocks_ranking_size",
|
||||
"click_alarmclock",
|
||||
"click_bell",
|
||||
"dump_bin_bigbin",
|
||||
"grab_roller",
|
||||
"handover_block",
|
||||
"handover_mic",
|
||||
"hanging_mug",
|
||||
"lift_pot",
|
||||
"move_can_pot",
|
||||
"move_pillbottle_pad",
|
||||
"move_playingcard_away",
|
||||
"move_stapler_pad",
|
||||
"open_laptop",
|
||||
"open_microwave",
|
||||
"pick_diverse_bottles",
|
||||
"pick_dual_bottles",
|
||||
"place_a2b_left",
|
||||
"place_a2b_right",
|
||||
"place_bread_basket",
|
||||
"place_bread_skillet",
|
||||
"place_burger_fries",
|
||||
"place_can_basket",
|
||||
"place_cans_plasticbox",
|
||||
"place_container_plate",
|
||||
"place_dual_shoes",
|
||||
"place_empty_cup",
|
||||
"place_fan",
|
||||
"place_mouse_pad",
|
||||
"place_object_basket",
|
||||
"place_object_scale",
|
||||
"place_object_stand",
|
||||
"place_phone_stand",
|
||||
"place_shoe",
|
||||
"press_stapler",
|
||||
"put_bottles_dustbin",
|
||||
"put_object_cabinet",
|
||||
"rotate_qrcode",
|
||||
"scan_object",
|
||||
"shake_bottle",
|
||||
"shake_bottle_horizontally",
|
||||
"stack_blocks_three",
|
||||
"stack_blocks_two",
|
||||
"stack_bowls_three",
|
||||
"stack_bowls_two",
|
||||
"stamp_seal",
|
||||
"turn_switch",
|
||||
)
|
||||
|
||||
|
||||
_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {}
|
||||
|
||||
|
||||
def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
|
||||
"""Build the kwargs dict RoboTwin's setup_demo expects.
|
||||
|
||||
Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``:
|
||||
reads ``task_config/demo_clean.yml``, resolves the embodiment file from
|
||||
``_embodiment_config.yml``, loads the robot's own ``config.yml``, and
|
||||
reads camera dimensions from ``_camera_config.yml``.
|
||||
|
||||
Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment
|
||||
used by beat_block_hammer and most smoke-test tasks).
|
||||
"""
|
||||
if task_name in _ROBOTWIN_SETUP_CACHE:
|
||||
return dict(_ROBOTWIN_SETUP_CACHE[task_name])
|
||||
|
||||
import os
|
||||
|
||||
import yaml # type: ignore[import-untyped]
|
||||
from envs import CONFIGS_PATH # type: ignore[import-not-found]
|
||||
|
||||
task_config = "demo_clean"
|
||||
with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f:
|
||||
args = yaml.safe_load(f)
|
||||
|
||||
# Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
|
||||
with open(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"), encoding="utf-8") as f:
|
||||
embodiment_types = yaml.safe_load(f)
|
||||
embodiment = args.get("embodiment", ["aloha-agilex"])
|
||||
if len(embodiment) == 1:
|
||||
robot_file = embodiment_types[embodiment[0]]["file_path"]
|
||||
args["left_robot_file"] = robot_file
|
||||
args["right_robot_file"] = robot_file
|
||||
args["dual_arm_embodied"] = True
|
||||
elif len(embodiment) == 3:
|
||||
args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"]
|
||||
args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"]
|
||||
args["embodiment_dis"] = embodiment[2]
|
||||
args["dual_arm_embodied"] = False
|
||||
else:
|
||||
raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}")
|
||||
|
||||
with open(os.path.join(args["left_robot_file"], "config.yml"), encoding="utf-8") as f:
|
||||
args["left_embodiment_config"] = yaml.safe_load(f)
|
||||
with open(os.path.join(args["right_robot_file"], "config.yml"), encoding="utf-8") as f:
|
||||
args["right_embodiment_config"] = yaml.safe_load(f)
|
||||
|
||||
# Camera dimensions
|
||||
with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), encoding="utf-8") as f:
|
||||
camera_config = yaml.safe_load(f)
|
||||
head_cam = args["camera"]["head_camera_type"]
|
||||
args["head_camera_h"] = camera_config[head_cam]["h"]
|
||||
args["head_camera_w"] = camera_config[head_cam]["w"]
|
||||
|
||||
# Headless overrides
|
||||
args["render_freq"] = 0
|
||||
args["task_name"] = task_name
|
||||
args["task_config"] = task_config
|
||||
|
||||
_ROBOTWIN_SETUP_CACHE[task_name] = args
|
||||
return dict(args)
|
||||
|
||||
|
||||
def _load_robotwin_task(task_name: str) -> type:
|
||||
"""Dynamically import and return a RoboTwin 2.0 task class.
|
||||
|
||||
RoboTwin tasks live in ``envs/<task_name>.py`` relative to the repository
|
||||
root and are expected to be on ``sys.path`` after installation.
|
||||
"""
|
||||
try:
|
||||
module = importlib.import_module(f"envs.{task_name}")
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError(
|
||||
f"Could not import RoboTwin task '{task_name}'. "
|
||||
"Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. "
|
||||
"See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html"
|
||||
) from e
|
||||
task_cls = getattr(module, task_name, None)
|
||||
if task_cls is None:
|
||||
raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py")
|
||||
return task_cls
|
||||
|
||||
|
||||
class RoboTwinEnv(gym.Env):
|
||||
"""Gymnasium wrapper around a single RoboTwin 2.0 task.
|
||||
|
||||
RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` /
|
||||
``take_action`` / ``check_success``) rather than the standard gym interface.
|
||||
This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive
|
||||
RoboTwin exactly like LIBERO or Meta-World.
|
||||
|
||||
The underlying SAPIEN environment is created lazily on the first ``reset()``
|
||||
call *inside the worker process*. This is required for
|
||||
``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU
|
||||
contexts that must not be forked from the parent process.
|
||||
|
||||
Observations
|
||||
------------
|
||||
The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g.
|
||||
``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in
|
||||
``envs/utils.py`` then converts these to ``observation.images.<cam>``.
|
||||
|
||||
Actions
|
||||
-------
|
||||
14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm).
|
||||
|
||||
Autograd
|
||||
--------
|
||||
``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory
|
||||
optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps
|
||||
the rollout in ``torch.no_grad()``, so both call sites re-enable grad.
|
||||
"""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 25}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task_name: str,
|
||||
episode_index: int = 0,
|
||||
n_envs: int = 1,
|
||||
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
|
||||
observation_height: int | None = None,
|
||||
observation_width: int | None = None,
|
||||
episode_length: int = DEFAULT_EPISODE_LENGTH,
|
||||
render_mode: str = "rgb_array",
|
||||
):
|
||||
super().__init__()
|
||||
self.task_name = task_name
|
||||
self.task = task_name # used by add_envs_task() in utils.py
|
||||
self.task_description = task_name.replace("_", " ")
|
||||
self.episode_index = episode_index
|
||||
self._reset_stride = n_envs
|
||||
self.camera_names = list(camera_names)
|
||||
# Default to D435 dims (the camera type baked into task_config/demo_clean.yml).
|
||||
# The YAML-driven lookup is deferred to reset() so construction doesn't
|
||||
# import RoboTwin's `envs` module — fast-tests run without RoboTwin installed.
|
||||
self.observation_height = observation_height or DEFAULT_CAMERA_H
|
||||
self.observation_width = observation_width or DEFAULT_CAMERA_W
|
||||
self.episode_length = episode_length
|
||||
self._max_episode_steps = episode_length # lerobot_eval.rollout reads this
|
||||
self.render_mode = render_mode
|
||||
|
||||
self._env: Any | None = None # deferred — created on first reset() inside worker
|
||||
self._step_count: int = 0
|
||||
self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8)
|
||||
|
||||
image_spaces = {
|
||||
cam: spaces.Box(
|
||||
low=0,
|
||||
high=255,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
dtype=np.uint8,
|
||||
)
|
||||
for cam in self.camera_names
|
||||
}
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(image_spaces),
|
||||
"agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
|
||||
}
|
||||
)
|
||||
self.action_space = spaces.Box(
|
||||
low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
|
||||
)
|
||||
|
||||
def _ensure_env(self) -> None:
|
||||
"""Create the SAPIEN environment on first use.
|
||||
|
||||
Called inside the worker subprocess after fork(), so each worker gets
|
||||
its own EGL/GPU context rather than inheriting a stale one from the
|
||||
parent process (which causes crashes with AsyncVectorEnv).
|
||||
"""
|
||||
if self._env is not None:
|
||||
return
|
||||
task_cls = _load_robotwin_task(self.task_name)
|
||||
self._env = task_cls()
|
||||
|
||||
def _get_obs(self) -> RobotObservation:
|
||||
assert self._env is not None, "_get_obs called before _ensure_env()"
|
||||
raw = self._env.get_obs()
|
||||
cameras_raw = raw.get("observation", {})
|
||||
|
||||
images: dict[str, np.ndarray] = {}
|
||||
for cam in self.camera_names:
|
||||
cam_data = cameras_raw.get(cam)
|
||||
img = cam_data.get("rgb") if cam_data else None
|
||||
if img is None:
|
||||
images[cam] = self._black_frame
|
||||
continue
|
||||
img = np.asarray(img, dtype=np.uint8)
|
||||
if img.ndim == 2:
|
||||
img = np.stack([img, img, img], axis=-1)
|
||||
elif img.shape[-1] != 3:
|
||||
img = img[..., :3]
|
||||
images[cam] = img
|
||||
|
||||
ja = raw.get("joint_action") or {}
|
||||
vec = ja.get("vector")
|
||||
if vec is not None:
|
||||
arr = np.asarray(vec, dtype=np.float32).ravel()
|
||||
joint_state = (
|
||||
arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
)
|
||||
else:
|
||||
joint_state = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
|
||||
return {"pixels": images, "agent_pos": joint_state}
|
||||
|
||||
def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]:
|
||||
self._ensure_env()
|
||||
super().reset(seed=seed)
|
||||
assert self._env is not None # set by _ensure_env() above
|
||||
|
||||
actual_seed = self.episode_index if seed is None else seed
|
||||
setup_kwargs = _load_robotwin_setup_kwargs(self.task_name)
|
||||
setup_kwargs.update(seed=actual_seed, is_test=True)
|
||||
with torch.enable_grad():
|
||||
self._env.setup_demo(**setup_kwargs)
|
||||
self.episode_index += self._reset_stride
|
||||
self._step_count = 0
|
||||
|
||||
obs = self._get_obs()
|
||||
return obs, {"is_success": False, "task": self.task_name}
|
||||
|
||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||
assert self._env is not None, "step() called before reset()"
|
||||
if action.ndim != 1 or action.shape[0] != ACTION_DIM:
|
||||
raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")
|
||||
|
||||
with torch.enable_grad():
|
||||
if hasattr(self._env, "take_action"):
|
||||
self._env.take_action(action)
|
||||
else:
|
||||
self._env.step(action)
|
||||
|
||||
self._step_count += 1
|
||||
|
||||
is_success = bool(getattr(self._env, "eval_success", False))
|
||||
if not is_success and hasattr(self._env, "check_success"):
|
||||
is_success = bool(self._env.check_success())
|
||||
|
||||
obs = self._get_obs()
|
||||
reward = float(is_success)
|
||||
terminated = is_success
|
||||
truncated = self._step_count >= self.episode_length
|
||||
|
||||
info: dict[str, Any] = {
|
||||
"task": self.task_name,
|
||||
"is_success": is_success,
|
||||
"step": self._step_count,
|
||||
}
|
||||
if terminated or truncated:
|
||||
info["final_info"] = {
|
||||
"task": self.task_name,
|
||||
"is_success": is_success,
|
||||
}
|
||||
self.reset()
|
||||
|
||||
return obs, reward, terminated, truncated, info
|
||||
|
||||
def render(self) -> np.ndarray:
|
||||
self._ensure_env()
|
||||
obs = self._get_obs()
|
||||
# Prefer head camera for rendering; fall back to first available.
|
||||
if "head_camera" in obs["pixels"]:
|
||||
return obs["pixels"]["head_camera"]
|
||||
return next(iter(obs["pixels"].values()))
|
||||
|
||||
def close(self) -> None:
|
||||
if self._env is not None:
|
||||
if hasattr(self._env, "close_env"):
|
||||
import contextlib
|
||||
|
||||
with contextlib.suppress(TypeError):
|
||||
self._env.close_env()
|
||||
self._env = None
|
||||
|
||||
|
||||
# ---- Multi-task factory --------------------------------------------------------
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task_name: str,
|
||||
n_envs: int,
|
||||
camera_names: list[str],
|
||||
observation_height: int,
|
||||
observation_width: int,
|
||||
episode_length: int,
|
||||
) -> list[Callable[[], RoboTwinEnv]]:
|
||||
"""Return n_envs factory callables for a single task."""
|
||||
|
||||
def _make_one(episode_index: int) -> RoboTwinEnv:
|
||||
return RoboTwinEnv(
|
||||
task_name=task_name,
|
||||
episode_index=episode_index,
|
||||
n_envs=n_envs,
|
||||
camera_names=camera_names,
|
||||
observation_height=observation_height,
|
||||
observation_width=observation_width,
|
||||
episode_length=episode_length,
|
||||
)
|
||||
|
||||
return [partial(_make_one, i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robotwin_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
|
||||
observation_height: int = DEFAULT_CAMERA_H,
|
||||
observation_width: int = DEFAULT_CAMERA_W,
|
||||
episode_length: int = DEFAULT_EPISODE_LENGTH,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""Create vectorized RoboTwin 2.0 environments.
|
||||
|
||||
Returns:
|
||||
``dict[task_name][0] -> VectorEnv`` — one entry per task, each wrapping
|
||||
``n_envs`` parallel rollouts.
|
||||
|
||||
Args:
|
||||
task: Comma-separated list of task names (e.g. ``"beat_block_hammer"``
|
||||
or ``"beat_block_hammer,click_bell"``).
|
||||
n_envs: Number of parallel rollouts per task.
|
||||
env_cls: Vector env constructor (e.g. ``gym.vector.AsyncVectorEnv``).
|
||||
camera_names: Cameras to include in observations.
|
||||
observation_height: Pixel height for all cameras.
|
||||
observation_width: Pixel width for all cameras.
|
||||
episode_length: Max steps before truncation.
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
task_names = [t.strip() for t in str(task).split(",") if t.strip()]
|
||||
if not task_names:
|
||||
raise ValueError("`task` must contain at least one RoboTwin task name.")
|
||||
|
||||
unknown = [t for t in task_names if t not in ROBOTWIN_TASKS]
|
||||
if unknown:
|
||||
raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")
|
||||
|
||||
logger.info(
|
||||
"Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d",
|
||||
task_names,
|
||||
n_envs,
|
||||
)
|
||||
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
for task_name in task_names:
|
||||
fns = _make_env_fns(
|
||||
task_name=task_name,
|
||||
n_envs=n_envs,
|
||||
camera_names=list(camera_names),
|
||||
observation_height=observation_height,
|
||||
observation_width=observation_width,
|
||||
episode_length=episode_length,
|
||||
)
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[task_name][0] = lazy
|
||||
else:
|
||||
out[task_name][0] = env_cls(fns)
|
||||
logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
|
||||
|
||||
return {k: dict(v) for k, v in out.items()}
|
||||
@@ -34,25 +34,6 @@ from lerobot.utils.utils import get_channel_first_image_shape
|
||||
from .configs import EnvConfig
|
||||
|
||||
|
||||
def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
|
||||
"""Normalize ``camera_name`` into a non-empty list of strings.
|
||||
|
||||
Accepts a comma-separated string (``"cam_a,cam_b"``) or a sequence of
|
||||
strings (tuples/lists). Whitespace is stripped; empty entries are
|
||||
dropped. Raises ``TypeError`` for unsupported input types and
|
||||
``ValueError`` when the normalized list is empty.
|
||||
"""
|
||||
if isinstance(camera_name, str):
|
||||
cams = [c.strip() for c in camera_name.split(",") if c.strip()]
|
||||
elif isinstance(camera_name, (list | tuple)):
|
||||
cams = [str(c).strip() for c in camera_name if str(c).strip()]
|
||||
else:
|
||||
raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
|
||||
if not cams:
|
||||
raise ValueError("camera_name resolved to an empty list.")
|
||||
return cams
|
||||
|
||||
|
||||
def _convert_nested_dict(d):
|
||||
result = {}
|
||||
for k, v in d.items():
|
||||
@@ -172,20 +153,17 @@ class _LazyAsyncVectorEnv:
|
||||
env_fns: list[Callable],
|
||||
observation_space=None,
|
||||
action_space=None,
|
||||
metadata=None,
|
||||
):
|
||||
self._env_fns = env_fns
|
||||
self._env: gym.vector.AsyncVectorEnv | None = None
|
||||
self.num_envs = len(env_fns)
|
||||
if observation_space is not None and action_space is not None and metadata is not None:
|
||||
if observation_space is not None and action_space is not None:
|
||||
self.observation_space = observation_space
|
||||
self.action_space = action_space
|
||||
self.metadata = metadata
|
||||
else:
|
||||
tmp = env_fns[0]()
|
||||
self.observation_space = tmp.observation_space
|
||||
self.action_space = tmp.action_space
|
||||
self.metadata = tmp.metadata
|
||||
tmp.close()
|
||||
self.single_observation_space = self.observation_space
|
||||
self.single_action_space = self.action_space
|
||||
@@ -194,10 +172,6 @@ class _LazyAsyncVectorEnv:
|
||||
if self._env is None:
|
||||
self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True)
|
||||
|
||||
@property
|
||||
def unwrapped(self):
|
||||
return self
|
||||
|
||||
def reset(self, **kwargs):
|
||||
self._ensure()
|
||||
return self._env.reset(**kwargs)
|
||||
|
||||
@@ -1,589 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""VLABench environment wrapper for LeRobot.
|
||||
|
||||
VLABench is a large-scale benchmark for language-conditioned robotic manipulation
|
||||
with long-horizon reasoning, built on MuJoCo/dm_control.
|
||||
|
||||
- Paper: https://arxiv.org/abs/2412.18194
|
||||
- GitHub: https://github.com/OpenMOSS/VLABench
|
||||
- Website: https://vlabench.github.io
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Sequence
|
||||
from typing import Any
|
||||
|
||||
import cv2
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
from scipy.spatial.transform import Rotation
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ACTION_DIM = 7 # pos(3) + euler(3) + gripper(1)
|
||||
ACTION_LOW = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0], dtype=np.float32)
|
||||
ACTION_HIGH = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
|
||||
|
||||
# Default max episode steps per task type
|
||||
DEFAULT_MAX_EPISODE_STEPS = 500
|
||||
|
||||
# VLABench task suites
|
||||
PRIMITIVE_TASKS = [
|
||||
"select_fruit",
|
||||
"select_toy",
|
||||
"select_chemistry_tube",
|
||||
"add_condiment",
|
||||
"select_book",
|
||||
"select_painting",
|
||||
"select_drink",
|
||||
"insert_flower",
|
||||
"select_billiards",
|
||||
"select_ingredient",
|
||||
"select_mahjong",
|
||||
"select_poker",
|
||||
# Physical series
|
||||
"density_qa",
|
||||
"friction_qa",
|
||||
"magnetism_qa",
|
||||
"reflection_qa",
|
||||
"simple_cuestick_usage",
|
||||
"simple_seesaw_usage",
|
||||
"sound_speed_qa",
|
||||
"thermal_expansion_qa",
|
||||
"weight_qa",
|
||||
]
|
||||
|
||||
COMPOSITE_TASKS = [
|
||||
"cluster_billiards",
|
||||
"cluster_book",
|
||||
"cluster_drink",
|
||||
"cluster_toy",
|
||||
"cook_dishes",
|
||||
"cool_drink",
|
||||
"find_unseen_object",
|
||||
"get_coffee",
|
||||
"hammer_nail",
|
||||
"heat_food",
|
||||
"make_juice",
|
||||
"play_mahjong",
|
||||
"play_math_game",
|
||||
"play_poker",
|
||||
"play_snooker",
|
||||
"rearrange_book",
|
||||
"rearrange_chemistry_tube",
|
||||
"set_dining_table",
|
||||
"set_study_table",
|
||||
"store_food",
|
||||
"take_chemistry_experiment",
|
||||
"use_seesaw_complex",
|
||||
]
|
||||
|
||||
SUITE_TASKS: dict[str, list[str]] = {
|
||||
"primitive": PRIMITIVE_TASKS,
|
||||
"composite": COMPOSITE_TASKS,
|
||||
}
|
||||
|
||||
|
||||
class VLABenchEnv(gym.Env):
|
||||
"""Gymnasium wrapper for VLABench environments.
|
||||
|
||||
Wraps the dm_control-based VLABench simulator behind a standard gym.Env interface.
|
||||
Supports multiple cameras (front, second, wrist) and end-effector control.
|
||||
"""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task: str = "select_fruit",
|
||||
obs_type: str = "pixels_agent_pos",
|
||||
render_mode: str = "rgb_array",
|
||||
render_resolution: tuple[int, int] = (480, 480),
|
||||
robot: str = "franka",
|
||||
max_episode_steps: int = DEFAULT_MAX_EPISODE_STEPS,
|
||||
action_mode: str = "eef",
|
||||
):
|
||||
super().__init__()
|
||||
self.task = task
|
||||
self.obs_type = obs_type
|
||||
self.render_mode = render_mode
|
||||
self.render_resolution = render_resolution
|
||||
self.robot = robot
|
||||
self._max_episode_steps = max_episode_steps
|
||||
self.action_mode = action_mode
|
||||
|
||||
# Deferred — created on first reset() inside worker subprocess to avoid
|
||||
# inheriting stale GPU/EGL contexts when AsyncVectorEnv spawns workers.
|
||||
# We never cache `env.physics`: dm_control exposes it as a weakref
|
||||
# proxy that goes stale across resets (rebuilds the sim), so we always
|
||||
# refetch it via `self._env.physics` at the call site.
|
||||
self._env = None
|
||||
self.task_description = "" # populated on first reset
|
||||
# Cached world-frame XYZ of the robot base link. The VLABench datasets
|
||||
# log both `observation.state` positions and `actions` positions in
|
||||
# robot-base frame (see VLABench/scripts/convert_to_lerobot.py which
|
||||
# subtracts `robot_frame_pos` from ee_pos). The robot is attached at a
|
||||
# fixed offset per task so this is safe to cache once per env build.
|
||||
self._robot_base_xyz: np.ndarray | None = None
|
||||
|
||||
h, w = self.render_resolution
|
||||
|
||||
if self.obs_type == "state":
|
||||
raise NotImplementedError(
|
||||
"The 'state' observation type is not supported in VLABenchEnv. "
|
||||
"Please use 'pixels' or 'pixels_agent_pos'."
|
||||
)
|
||||
elif self.obs_type == "pixels":
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(
|
||||
{
|
||||
"image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
}
|
||||
),
|
||||
}
|
||||
)
|
||||
elif self.obs_type == "pixels_agent_pos":
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(
|
||||
{
|
||||
"image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"second_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
"wrist_image": spaces.Box(low=0, high=255, shape=(h, w, 3), dtype=np.uint8),
|
||||
}
|
||||
),
|
||||
"agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(7,), dtype=np.float64),
|
||||
}
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
|
||||
|
||||
self.action_space = spaces.Box(low=ACTION_LOW, high=ACTION_HIGH, dtype=np.float32)
|
||||
|
||||
# Max attempts to rebuild the underlying env when MuJoCo throws
|
||||
# `PhysicsError` (e.g. mjWARN_BADQACC) during VLABench's 20-step
|
||||
# reset warm-up. Some random task/layout samples land in unstable
|
||||
# initial configurations; re-sampling the layout almost always
|
||||
# gives a stable one. A handful of upstream tasks (notably
|
||||
# `select_mahjong`) have layout samplers that diverge often enough
|
||||
# to need >>5 retries, so we pick a generous ceiling.
|
||||
_ENSURE_ENV_MAX_ATTEMPTS = 20
|
||||
|
||||
def _ensure_env(self) -> None:
|
||||
"""Create the underlying VLABench env on first use.
|
||||
|
||||
Called inside the worker subprocess after fork(), so each worker gets
|
||||
its own clean rendering context rather than inheriting a stale one from
|
||||
the parent process (which causes crashes with AsyncVectorEnv).
|
||||
|
||||
Retries on `PhysicsError`: VLABench's `LM4ManipDMEnv.reset()` runs 20
|
||||
warm-up `step()` calls while toggling gravity/fluids to let the scene
|
||||
settle; for some random layouts MuJoCo's integrator diverges and
|
||||
raises `mjWARN_BADQACC`. Re-sampling the layout almost always yields
|
||||
a stable one, so we retry a number of times before giving up. Between
|
||||
attempts we reseed NumPy's global RNG from OS entropy so the upstream
|
||||
task sampler explores fresh initial states — without this, retries
|
||||
can replay the same diverging configuration when the sampler is
|
||||
deterministic given the current RNG state.
|
||||
"""
|
||||
if self._env is not None:
|
||||
return
|
||||
|
||||
import VLABench.robots # noqa: F401 # type: ignore[import-untyped]
|
||||
import VLABench.tasks # noqa: F401 # type: ignore[import-untyped]
|
||||
from dm_control.rl.control import PhysicsError # type: ignore[import-untyped]
|
||||
from VLABench.envs import load_env # type: ignore[import-untyped]
|
||||
|
||||
h, w = self.render_resolution
|
||||
last_exc: PhysicsError | None = None
|
||||
for attempt in range(1, self._ENSURE_ENV_MAX_ATTEMPTS + 1):
|
||||
try:
|
||||
env = load_env(task=self.task, robot=self.robot, render_resolution=(h, w))
|
||||
self._env = env
|
||||
break
|
||||
except PhysicsError as exc:
|
||||
last_exc = exc
|
||||
logger.warning(
|
||||
"PhysicsError on attempt %d/%d while building task '%s': %s. Retrying with fresh layout…",
|
||||
attempt,
|
||||
self._ENSURE_ENV_MAX_ATTEMPTS,
|
||||
self.task,
|
||||
exc,
|
||||
)
|
||||
np.random.seed(None)
|
||||
if self._env is None:
|
||||
assert last_exc is not None
|
||||
raise RuntimeError(
|
||||
f"VLABench task '{self.task}' failed to produce a stable "
|
||||
f"initial layout after {self._ENSURE_ENV_MAX_ATTEMPTS} "
|
||||
f"attempts. This task's upstream sampler diverges too "
|
||||
f"often for the configured robot; consider removing it "
|
||||
f"from the eval set. Last physics error: {last_exc}"
|
||||
) from last_exc
|
||||
|
||||
# Extract task description from the dm_control task
|
||||
task_obj = self._env.task
|
||||
if hasattr(task_obj, "task_description"):
|
||||
self.task_description = task_obj.task_description
|
||||
elif hasattr(task_obj, "language_instruction"):
|
||||
self.task_description = task_obj.language_instruction
|
||||
else:
|
||||
self.task_description = self.task
|
||||
|
||||
# Cache robot base world position so `_build_ctrl_from_action` and
|
||||
# `_get_obs` can translate between robot-frame (dataset) and
|
||||
# world-frame (dm_control) without hitting physics every call.
|
||||
try:
|
||||
self._robot_base_xyz = np.asarray(self._env.get_robot_frame_position(), dtype=np.float64).reshape(
|
||||
3
|
||||
)
|
||||
except Exception:
|
||||
# Fallback to VLABench's default Franka base position.
|
||||
self._robot_base_xyz = np.array([0.0, -0.4, 0.78], dtype=np.float64)
|
||||
|
||||
def _get_obs(self) -> dict:
|
||||
"""Get current observation from the environment."""
|
||||
assert self._env is not None
|
||||
|
||||
obs = self._env.get_observation()
|
||||
h, w = self.render_resolution
|
||||
|
||||
def _to_hwc3(arr: np.ndarray) -> np.ndarray:
|
||||
"""Coerce any camera array to the declared (h, w, 3) uint8 shape."""
|
||||
a = np.asarray(arr)
|
||||
# Drop a leading singleton batch dim if present.
|
||||
while a.ndim > 3 and a.shape[0] == 1:
|
||||
a = a[0]
|
||||
if a.ndim == 3 and a.shape[0] in (1, 3, 4) and a.shape[-1] not in (1, 3, 4):
|
||||
# CHW → HWC
|
||||
a = np.transpose(a, (1, 2, 0))
|
||||
if a.ndim == 2:
|
||||
a = np.stack([a] * 3, axis=-1)
|
||||
if a.ndim != 3:
|
||||
return np.zeros((h, w, 3), dtype=np.uint8)
|
||||
# Force 3 channels.
|
||||
if a.shape[-1] == 1:
|
||||
a = np.repeat(a, 3, axis=-1)
|
||||
elif a.shape[-1] == 4:
|
||||
a = a[..., :3]
|
||||
elif a.shape[-1] != 3:
|
||||
return np.zeros((h, w, 3), dtype=np.uint8)
|
||||
if a.shape[:2] != (h, w):
|
||||
a = cv2.resize(a, (w, h), interpolation=cv2.INTER_AREA)
|
||||
return a.astype(np.uint8)
|
||||
|
||||
# Extract camera images — VLABench returns (n_cameras, C, H, W) or individual arrays
|
||||
raw_frames: list[np.ndarray] = []
|
||||
if "rgb" in obs:
|
||||
rgb = obs["rgb"]
|
||||
if isinstance(rgb, np.ndarray):
|
||||
if rgb.ndim == 4:
|
||||
raw_frames = [rgb[i] for i in range(rgb.shape[0])]
|
||||
elif rgb.ndim == 3:
|
||||
raw_frames = [rgb]
|
||||
|
||||
image_keys = ["image", "second_image", "wrist_image"]
|
||||
images: dict[str, np.ndarray] = {}
|
||||
for i, key in enumerate(image_keys):
|
||||
if i < len(raw_frames):
|
||||
images[key] = _to_hwc3(raw_frames[i])
|
||||
else:
|
||||
images[key] = np.zeros((h, w, 3), dtype=np.uint8)
|
||||
|
||||
# Convert VLABench's raw ee_state `[pos_world(3), quat_wxyz(4), open(1)]`
|
||||
# to the dataset's observation.state layout `[pos_robot(3), euler_xyz(3),
|
||||
# gripper(1)]`. See VLABench/scripts/convert_to_lerobot.py — positions
|
||||
# are stored in robot-base frame and orientations as scipy extrinsic
|
||||
# 'xyz' euler angles.
|
||||
raw = np.asarray(obs.get("ee_state", np.zeros(8)), dtype=np.float64).ravel()
|
||||
pos_world = raw[:3] if raw.size >= 3 else np.zeros(3, dtype=np.float64)
|
||||
quat_wxyz = raw[3:7] if raw.size >= 7 else np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float64)
|
||||
gripper = float(raw[7]) if raw.size >= 8 else 0.0
|
||||
|
||||
base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
|
||||
pos_robot = pos_world - base
|
||||
euler_xyz = Rotation.from_quat([quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]).as_euler(
|
||||
"xyz", degrees=False
|
||||
)
|
||||
|
||||
ee_state = np.concatenate([pos_robot, euler_xyz, [gripper]]).astype(np.float64)
|
||||
|
||||
if self.obs_type == "pixels":
|
||||
return {"pixels": images}
|
||||
elif self.obs_type == "pixels_agent_pos":
|
||||
return {
|
||||
"pixels": images,
|
||||
"agent_pos": ee_state.astype(np.float64),
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"Unknown obs_type: {self.obs_type}")
|
||||
|
||||
# ---- Action adaptation (EEF → joint ctrl) --------------------------------
|
||||
#
|
||||
# The HF vlabench datasets log 7D actions
|
||||
# `[x, y, z (robot frame), rx, ry, rz (scipy extrinsic xyz), gripper]`,
|
||||
# exactly matching VLABench's own eval pipeline (evaluator.base):
|
||||
# pos, euler, g = policy(...)
|
||||
# quat = euler_to_quaternion(*euler) # extrinsic xyz -> wxyz
|
||||
# _, qpos = robot.get_qpos_from_ee_pos(physics, pos=pos + base, quat=quat)
|
||||
# env.step(np.concatenate([qpos, [g, g]]))
|
||||
#
|
||||
# VLABench's dm_control task writes `data.ctrl[:] = action` directly — for
|
||||
# Franka that's 9 entries (7 arm joints + 2 gripper fingers). We mirror the
|
||||
# above conversion so the policy's EEF commands actually drive the robot.
|
||||
|
||||
_FRANKA_FINGER_OPEN = 0.04 # qpos when gripper fully open
|
||||
|
||||
def _build_ctrl_from_action(self, action: np.ndarray, ctrl_dim: int) -> np.ndarray:
|
||||
"""Convert a 7D EEF action into the `ctrl_dim`-sized joint command vector.
|
||||
|
||||
For the Franka default (ctrl_dim=9): 7 arm joint qposes (via IK) +
|
||||
2 gripper finger qposes (open/closed based on the gripper scalar).
|
||||
If the action is already joint-space (shape matches ctrl_dim), pass
|
||||
through.
|
||||
"""
|
||||
if action.shape[0] == ctrl_dim:
|
||||
return action.astype(np.float64, copy=False)
|
||||
|
||||
if action.shape[0] != 7:
|
||||
# Unknown layout — fall back to zero-pad so the sim doesn't crash.
|
||||
padded = np.zeros(ctrl_dim, dtype=np.float64)
|
||||
padded[: min(action.shape[0], ctrl_dim)] = action[:ctrl_dim]
|
||||
return padded
|
||||
|
||||
from dm_control.utils.inverse_kinematics import qpos_from_site_pose
|
||||
|
||||
# Action position is in robot-base frame (see convert_to_lerobot.py);
|
||||
# dm_control's IK expects a world-frame target.
|
||||
base = self._robot_base_xyz if self._robot_base_xyz is not None else np.zeros(3, dtype=np.float64)
|
||||
pos_world = np.asarray(action[:3], dtype=np.float64) + base
|
||||
rx, ry, rz = float(action[3]), float(action[4]), float(action[5])
|
||||
gripper = float(np.clip(action[6], 0.0, 1.0))
|
||||
|
||||
# Dataset euler is scipy extrinsic 'xyz' (same as VLABench's
|
||||
# `euler_to_quaternion`). scipy emits `[x, y, z, w]`; dm_control's IK
|
||||
# and MuJoCo use `[w, x, y, z]`, so reorder.
|
||||
qxyzw = Rotation.from_euler("xyz", [rx, ry, rz], degrees=False).as_quat()
|
||||
quat = np.array([qxyzw[3], qxyzw[0], qxyzw[1], qxyzw[2]], dtype=np.float64)
|
||||
|
||||
assert self._env is not None
|
||||
robot = self._env.task.robot
|
||||
site_name = robot.end_effector_site.full_identifier
|
||||
|
||||
# inplace=False so IK doesn't mutate physics state mid-step — we only
|
||||
# want the solved qpos. Fetch a fresh physics handle — caching it can
|
||||
# yield a stale weakref after a reset.
|
||||
ik_result = qpos_from_site_pose(
|
||||
self._env.physics,
|
||||
site_name=site_name,
|
||||
target_pos=pos_world,
|
||||
target_quat=quat,
|
||||
inplace=False,
|
||||
max_steps=100,
|
||||
)
|
||||
n_dof = robot.n_dof # 7 for Franka
|
||||
arm_qpos = ik_result.qpos[:n_dof]
|
||||
|
||||
# Dataset gripper convention: 1 = open (finger qpos = 0.04),
|
||||
# 0 = closed (finger qpos = 0.0). See VLABench/scripts/convert_to_lerobot.py
|
||||
# where `trajectory[i][-1] > 0.03` is encoded as `1`.
|
||||
finger_qpos = gripper * self._FRANKA_FINGER_OPEN
|
||||
|
||||
ctrl = np.zeros(ctrl_dim, dtype=np.float64)
|
||||
ctrl[:n_dof] = arm_qpos
|
||||
# Remaining entries are gripper fingers (usually 2 for Franka).
|
||||
ctrl[n_dof:] = finger_qpos
|
||||
return ctrl
|
||||
|
||||
def reset(self, seed=None, **kwargs) -> tuple[RobotObservation, dict[str, Any]]:
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
super().reset(seed=seed)
|
||||
|
||||
if seed is not None:
|
||||
self._seed_inner_env(int(self.np_random.integers(0, 2**31 - 1)))
|
||||
|
||||
self._env.reset()
|
||||
|
||||
observation = self._get_obs()
|
||||
info = {"is_success": False}
|
||||
return observation, info
|
||||
|
||||
def _seed_inner_env(self, seed: int) -> None:
|
||||
"""Propagate `seed` to the inner dm_control env. `Environment.reset()`
|
||||
doesn't accept a seed, so we re-seed the task and environment
|
||||
`RandomState`s directly. Best-effort: silently skipped when the
|
||||
expected attributes are absent on a given VLABench version.
|
||||
"""
|
||||
for owner_attr, rng_attr in (("task", "random"), (None, "_random_state")):
|
||||
owner = getattr(self._env, owner_attr) if owner_attr else self._env
|
||||
rng = getattr(owner, rng_attr, None)
|
||||
rng_seed = getattr(rng, "seed", None)
|
||||
if callable(rng_seed):
|
||||
rng_seed(seed)
|
||||
|
||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||
from dm_control.rl.control import PhysicsError # type: ignore[import-untyped]
|
||||
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
|
||||
if action.ndim != 1:
|
||||
raise ValueError(
|
||||
f"Expected action to be 1-D (shape (action_dim,)), "
|
||||
f"but got shape {action.shape} with ndim={action.ndim}"
|
||||
)
|
||||
|
||||
if self.action_mode not in ("eef", "joint", "delta_eef"):
|
||||
raise ValueError(f"Unknown action_mode: {self.action_mode}")
|
||||
|
||||
# Always refetch physics — dm_control returns a weakref proxy that can
|
||||
# go stale across resets.
|
||||
physics = self._env.physics
|
||||
ctrl_dim = int(physics.data.ctrl.shape[0])
|
||||
ctrl = self._build_ctrl_from_action(action, ctrl_dim)
|
||||
try:
|
||||
timestep = self._env.step(ctrl)
|
||||
except PhysicsError as exc:
|
||||
# Physics integrator diverged (e.g. mjWARN_BADQACC). Treat it as
|
||||
# a graceful failed termination rather than a hard crash — the
|
||||
# rest of the multi-task eval should still run.
|
||||
logger.warning(
|
||||
"PhysicsError during step on task '%s': %s. Terminating episode.",
|
||||
self.task,
|
||||
exc,
|
||||
)
|
||||
observation = self._get_obs()
|
||||
info = {"task": self.task, "is_success": False, "physics_error": True}
|
||||
# Drop the stale env so the next reset() rebuilds it cleanly.
|
||||
with contextlib.suppress(Exception):
|
||||
self._env.close()
|
||||
self._env = None
|
||||
return observation, 0.0, True, False, info
|
||||
|
||||
# Extract reward from dm_control timestep
|
||||
reward = float(timestep.reward) if timestep.reward is not None else 0.0
|
||||
|
||||
# Check success via the task's termination condition
|
||||
is_success = False
|
||||
if hasattr(self._env, "task") and hasattr(self._env.task, "should_terminate_episode"):
|
||||
is_success = bool(self._env.task.should_terminate_episode(self._env.physics))
|
||||
|
||||
terminated = is_success
|
||||
truncated = False
|
||||
info = {
|
||||
"task": self.task,
|
||||
"is_success": is_success,
|
||||
}
|
||||
|
||||
observation = self._get_obs()
|
||||
|
||||
if terminated:
|
||||
self.reset()
|
||||
|
||||
return observation, reward, terminated, truncated, info
|
||||
|
||||
def render(self) -> np.ndarray:
|
||||
self._ensure_env()
|
||||
obs = self._get_obs()
|
||||
return obs["pixels"]["image"]
|
||||
|
||||
def close(self):
|
||||
if self._env is not None:
|
||||
self._env.close()
|
||||
self._env = None
|
||||
|
||||
|
||||
# ---- Main API ----------------------------------------------------------------
|
||||
|
||||
|
||||
def create_vlabench_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
gym_kwargs: dict[str, Any] | None = None,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""
|
||||
Create vectorized VLABench environments with a consistent return shape.
|
||||
|
||||
Returns:
|
||||
dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
|
||||
|
||||
Notes:
|
||||
- n_envs is the number of rollouts *per task*.
|
||||
- `task` can be a suite name ("primitive", "composite"), a comma-separated list of
|
||||
suite names, or individual task names (e.g. "select_fruit,heat_food").
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
gym_kwargs = dict(gym_kwargs or {})
|
||||
task_groups = [t.strip() for t in task.split(",") if t.strip()]
|
||||
if not task_groups:
|
||||
raise ValueError("`task` must contain at least one VLABench task or suite name.")
|
||||
|
||||
logger.info(
|
||||
"Creating VLABench envs | task_groups=%s | n_envs(per task)=%d",
|
||||
task_groups,
|
||||
n_envs,
|
||||
)
|
||||
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space = None
|
||||
cached_act_space = None
|
||||
cached_metadata = None
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
|
||||
for group in task_groups:
|
||||
# Check if it's a suite name, otherwise treat as individual task
|
||||
tasks = SUITE_TASKS.get(group, [group])
|
||||
|
||||
for tid, task_name in enumerate(tasks):
|
||||
logger.info(
|
||||
"Building vec env | group=%s | task_id=%d | task=%s",
|
||||
group,
|
||||
tid,
|
||||
task_name,
|
||||
)
|
||||
|
||||
fns = [(lambda tn=task_name: VLABenchEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
|
||||
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[group][tid] = lazy
|
||||
else:
|
||||
out[group][tid] = env_cls(fns)
|
||||
|
||||
return {group: dict(task_map) for group, task_map in out.items()}
|
||||
@@ -16,7 +16,6 @@ from lerobot.utils.action_interpolator import ActionInterpolator as ActionInterp
|
||||
|
||||
from .act.configuration_act import ACTConfig as ACTConfig
|
||||
from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig
|
||||
from .eo1.configuration_eo1 import EO1Config as EO1Config
|
||||
from .factory import get_policy_class, make_policy, make_policy_config, make_pre_post_processors
|
||||
from .groot.configuration_groot import GrootConfig as GrootConfig
|
||||
from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig as MultiTaskDiTConfig
|
||||
@@ -25,6 +24,8 @@ from .pi0_fast.configuration_pi0_fast import PI0FastConfig as PI0FastConfig
|
||||
from .pi05.configuration_pi05 import PI05Config as PI05Config
|
||||
from .pretrained import PreTrainedPolicy as PreTrainedPolicy
|
||||
from .sac.configuration_sac import SACConfig as SACConfig
|
||||
from .sac.reward_model.configuration_classifier import RewardClassifierConfig as RewardClassifierConfig
|
||||
from .sarm.configuration_sarm import SARMConfig as SARMConfig
|
||||
from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig
|
||||
from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig
|
||||
from .utils import make_robot_action, prepare_observation_for_inference
|
||||
@@ -42,11 +43,12 @@ __all__ = [
|
||||
"DiffusionConfig",
|
||||
"GrootConfig",
|
||||
"MultiTaskDiTConfig",
|
||||
"EO1Config",
|
||||
"PI0Config",
|
||||
"PI0FastConfig",
|
||||
"PI05Config",
|
||||
"RewardClassifierConfig",
|
||||
"SACConfig",
|
||||
"SARMConfig",
|
||||
"SmolVLAConfig",
|
||||
"TDMPCConfig",
|
||||
"VQBeTConfig",
|
||||
|
||||
@@ -142,10 +142,9 @@ class ACTPolicy(PreTrainedPolicy):
|
||||
|
||||
actions_hat, (mu_hat, log_sigma_x2_hat) = self.model(batch)
|
||||
|
||||
abs_err = F.l1_loss(batch[ACTION], actions_hat, reduction="none")
|
||||
valid_mask = ~batch["action_is_pad"].unsqueeze(-1)
|
||||
num_valid = valid_mask.sum() * abs_err.shape[-1]
|
||||
l1_loss = (abs_err * valid_mask).sum() / num_valid.clamp_min(1)
|
||||
l1_loss = (
|
||||
F.l1_loss(batch[ACTION], actions_hat, reduction="none") * ~batch["action_is_pad"].unsqueeze(-1)
|
||||
).mean()
|
||||
|
||||
loss_dict = {"l1_loss": l1_loss.item()}
|
||||
if self.config.use_vae:
|
||||
|
||||
@@ -100,8 +100,8 @@ class DiffusionConfig(PreTrainedConfig):
|
||||
|
||||
# Inputs / output structure.
|
||||
n_obs_steps: int = 2
|
||||
horizon: int = 64
|
||||
n_action_steps: int = 32
|
||||
horizon: int = 16
|
||||
n_action_steps: int = 8
|
||||
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
@@ -122,10 +122,10 @@ class DiffusionConfig(PreTrainedConfig):
|
||||
crop_ratio: float = 1.0
|
||||
crop_shape: tuple[int, int] | None = None
|
||||
crop_is_random: bool = True
|
||||
pretrained_backbone_weights: str | None = "ResNet18_Weights.IMAGENET1K_V1"
|
||||
use_group_norm: bool = False
|
||||
pretrained_backbone_weights: str | None = None
|
||||
use_group_norm: bool = True
|
||||
spatial_softmax_num_keypoints: int = 32
|
||||
use_separate_rgb_encoder_per_camera: bool = True
|
||||
use_separate_rgb_encoder_per_camera: bool = False
|
||||
# Unet.
|
||||
down_dims: tuple[int, ...] = (512, 1024, 2048)
|
||||
kernel_size: int = 5
|
||||
|
||||
@@ -380,9 +380,7 @@ class DiffusionModel(nn.Module):
|
||||
f"{self.config.do_mask_loss_for_padding=}."
|
||||
)
|
||||
in_episode_bound = ~batch["action_is_pad"]
|
||||
mask = in_episode_bound.unsqueeze(-1)
|
||||
num_valid = mask.sum() * loss.shape[-1]
|
||||
return (loss * mask).sum() / num_valid.clamp_min(1)
|
||||
loss = loss * in_episode_bound.unsqueeze(-1)
|
||||
|
||||
return loss.mean()
|
||||
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../../../docs/source/eo1.mdx
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from .configuration_eo1 import EO1Config
|
||||
from .modeling_eo1 import EO1Policy
|
||||
from .processor_eo1 import make_eo1_pre_post_processors
|
||||
|
||||
__all__ = ["EO1Config", "EO1Policy", "make_eo1_pre_post_processors"]
|
||||
@@ -1,193 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
|
||||
from lerobot.optim.optimizers import AdamWConfig
|
||||
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
||||
from lerobot.utils.constants import ACTION, OBS_STATE
|
||||
from lerobot.utils.import_utils import _transformers_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
|
||||
Qwen2_5_VLConfig,
|
||||
Qwen2_5_VLTextConfig,
|
||||
Qwen2_5_VLVisionConfig,
|
||||
)
|
||||
else:
|
||||
Qwen2_5_VLConfig = None
|
||||
Qwen2_5_VLTextConfig = None
|
||||
Qwen2_5_VLVisionConfig = None
|
||||
|
||||
|
||||
@PreTrainedConfig.register_subclass("eo1")
|
||||
@dataclass
|
||||
class EO1Config(PreTrainedConfig):
|
||||
"""Configuration for native EO1 policy integration in LeRobot."""
|
||||
|
||||
vlm_base: str = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
vlm_config: dict | None = None
|
||||
|
||||
# Vision processor settings.
|
||||
image_min_pixels: int | None = 64 * 28 * 28
|
||||
image_max_pixels: int | None = 128 * 28 * 28
|
||||
use_fast_processor: bool = False
|
||||
|
||||
# Execution and action horizon.
|
||||
n_obs_steps: int = 1
|
||||
chunk_size: int = 8
|
||||
n_action_steps: int = 8
|
||||
|
||||
# State/action padding to match EO1 flow head dimensionality.
|
||||
max_state_dim: int = 32
|
||||
max_action_dim: int = 32
|
||||
|
||||
# Flow matching sampling.
|
||||
num_denoise_steps: int = 10
|
||||
num_action_layers: int = 2
|
||||
action_act: str = "linear"
|
||||
time_sampling_beta_alpha: float = 1.5
|
||||
time_sampling_beta_beta: float = 1.0
|
||||
time_sampling_scale: float = 0.999
|
||||
time_sampling_offset: float = 0.001
|
||||
min_period: float = 4e-3
|
||||
max_period: float = 4.0
|
||||
supervise_padding_action_dims: bool = True
|
||||
supervise_padding_actions: bool = True
|
||||
|
||||
# Policy-level dtype request for the Qwen backbone.
|
||||
# - "auto": follow the backbone config/checkpoint default dtype. For Qwen2.5-VL this resolves to bf16.
|
||||
# The EO1 flow-matching head still keeps its own parameters in fp32.
|
||||
# - "bfloat16": force the backbone to initialize/load in bf16 regardless of the saved config default.
|
||||
# - "float32": force the backbone to initialize/load in fp32 for maximum numerical conservatism.
|
||||
dtype: str = "auto" # Options: "auto", "bfloat16", "float32"
|
||||
force_fp32_autocast: bool = True
|
||||
|
||||
# Optional attention backend request passed through to the Qwen backbone.
|
||||
# Common values: None, "eager", "sdpa", "flash_attention_2".
|
||||
attn_implementation: str | None = None
|
||||
|
||||
# Training settings.
|
||||
gradient_checkpointing: bool = False # Enable gradient checkpointing for memory optimization
|
||||
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.MEAN_STD,
|
||||
"ACTION": NormalizationMode.MEAN_STD,
|
||||
}
|
||||
)
|
||||
|
||||
# Optimizer settings aligned with EO1/experiments/2_libero/train.sh and EO1 TrainPipelineConfig defaults.
|
||||
optimizer_lr: float = 1e-4
|
||||
optimizer_betas: tuple[float, float] = (0.9, 0.999)
|
||||
optimizer_eps: float = 1e-8
|
||||
optimizer_weight_decay: float = 0.1
|
||||
optimizer_grad_clip_norm: float = 1.0
|
||||
|
||||
# Scheduler settings aligned with EO1 train.sh: cosine schedule with warmup_ratio=0.03.
|
||||
# Note: These will auto-scale if --steps < scheduler_decay_steps
|
||||
# For example, --steps=3000 will scale warmup to 100 and decay to 3000
|
||||
scheduler_warmup_steps: int = 900 # 0.03 * 30_000 long-run steps
|
||||
scheduler_decay_steps: int = 30_000
|
||||
scheduler_decay_lr: float = 0.0
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
|
||||
if self.n_action_steps > self.chunk_size:
|
||||
raise ValueError(
|
||||
f"n_action_steps ({self.n_action_steps}) cannot be greater than chunk_size ({self.chunk_size})"
|
||||
)
|
||||
|
||||
# Populate the serialized backbone config only when the caller did not provide one.
|
||||
if self.vlm_config is None:
|
||||
require_package("transformers", extra="eo1")
|
||||
self.vlm_config = Qwen2_5_VLConfig.from_pretrained(self.vlm_base).to_dict()
|
||||
|
||||
@property
|
||||
def vlm_backbone_config(self) -> Qwen2_5_VLConfig:
|
||||
require_package("transformers", extra="eo1")
|
||||
config_dict = deepcopy(self.vlm_config)
|
||||
if self.attn_implementation is not None:
|
||||
config_dict["attn_implementation"] = self.attn_implementation
|
||||
return Qwen2_5_VLConfig(**config_dict)
|
||||
|
||||
@property
|
||||
def text_config(self) -> Qwen2_5_VLTextConfig:
|
||||
return self.vlm_backbone_config.text_config
|
||||
|
||||
@property
|
||||
def vision_config(self) -> Qwen2_5_VLVisionConfig:
|
||||
return self.vlm_backbone_config.vision_config
|
||||
|
||||
def validate_features(self) -> None:
|
||||
"""Validate and set up EO1 input and output features."""
|
||||
image_features = [key for key, feat in self.input_features.items() if feat.type == FeatureType.VISUAL]
|
||||
if not image_features:
|
||||
raise ValueError(
|
||||
"EO1 policy requires at least one visual input feature. "
|
||||
"No features of type FeatureType.VISUAL found in input_features."
|
||||
)
|
||||
|
||||
if OBS_STATE not in self.input_features:
|
||||
state_feature = PolicyFeature(
|
||||
type=FeatureType.STATE,
|
||||
shape=(self.max_state_dim,),
|
||||
)
|
||||
self.input_features[OBS_STATE] = state_feature
|
||||
|
||||
if ACTION not in self.output_features:
|
||||
action_feature = PolicyFeature(
|
||||
type=FeatureType.ACTION,
|
||||
shape=(self.max_action_dim,),
|
||||
)
|
||||
self.output_features[ACTION] = action_feature
|
||||
|
||||
def get_optimizer_preset(self) -> AdamWConfig:
|
||||
return AdamWConfig(
|
||||
lr=self.optimizer_lr,
|
||||
betas=self.optimizer_betas,
|
||||
eps=self.optimizer_eps,
|
||||
weight_decay=self.optimizer_weight_decay,
|
||||
grad_clip_norm=self.optimizer_grad_clip_norm,
|
||||
)
|
||||
|
||||
def get_scheduler_preset(self):
|
||||
return CosineDecayWithWarmupSchedulerConfig(
|
||||
peak_lr=self.optimizer_lr,
|
||||
decay_lr=self.scheduler_decay_lr,
|
||||
num_warmup_steps=self.scheduler_warmup_steps,
|
||||
num_decay_steps=self.scheduler_decay_steps,
|
||||
)
|
||||
|
||||
@property
|
||||
def observation_delta_indices(self) -> None:
|
||||
return None
|
||||
|
||||
@property
|
||||
def action_delta_indices(self) -> list[int]:
|
||||
return list(range(self.chunk_size))
|
||||
|
||||
@property
|
||||
def reward_delta_indices(self) -> None:
|
||||
return None
|
||||
@@ -1,620 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import logging
|
||||
import math
|
||||
from collections import deque
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
import torch.utils.checkpoint
|
||||
from torch import Tensor
|
||||
|
||||
from lerobot.policies.eo1.configuration_eo1 import EO1Config
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.utils.constants import ACTION, OBS_STATE
|
||||
from lerobot.utils.import_utils import _transformers_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
|
||||
from transformers.utils import torch_compilable_check
|
||||
else:
|
||||
ACT2FN = None
|
||||
Qwen2_5_VLForConditionalGeneration = None
|
||||
torch_compilable_check = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def pad_vector(vector, new_dim):
|
||||
"""Pad the last dimension of a vector to new_dim with zeros.
|
||||
|
||||
Can be (batch_size x sequence_length x features_dimension)
|
||||
or (batch_size x features_dimension)
|
||||
"""
|
||||
if vector.shape[-1] >= new_dim:
|
||||
return vector
|
||||
return F.pad(vector, (0, new_dim - vector.shape[-1]))
|
||||
|
||||
|
||||
class EO1Policy(PreTrainedPolicy):
|
||||
"""EO1 policy wrapper for LeRobot robot-only training/evaluation."""
|
||||
|
||||
config_class = EO1Config
|
||||
name = "eo1"
|
||||
|
||||
def __init__(self, config: EO1Config, **kwargs):
|
||||
require_package("transformers", extra="eo1")
|
||||
super().__init__(config)
|
||||
config.validate_features()
|
||||
self.config = config
|
||||
|
||||
if config.pretrained_path is None:
|
||||
# Initialize from pretrained VLM
|
||||
vlm_backbone = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||
config.vlm_base,
|
||||
dtype=config.dtype,
|
||||
attn_implementation=config.attn_implementation,
|
||||
)
|
||||
else:
|
||||
vlm_backbone = Qwen2_5_VLForConditionalGeneration._from_config(
|
||||
config.vlm_backbone_config,
|
||||
dtype=config.vlm_backbone_config.dtype if config.dtype == "auto" else config.dtype,
|
||||
)
|
||||
|
||||
self.model = EO1VisionFlowMatchingModel(config, vlm_backbone)
|
||||
if config.gradient_checkpointing:
|
||||
self.model.gradient_checkpointing_enable()
|
||||
|
||||
self.model.to(config.device)
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self._action_queue = deque(maxlen=self.config.n_action_steps)
|
||||
|
||||
@staticmethod
|
||||
def _get_model_inputs(batch: dict[str, Tensor], excluded_keys: set[str]) -> dict[str, Tensor]:
|
||||
return {key: value for key, value in batch.items() if key not in excluded_keys}
|
||||
|
||||
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
|
||||
state = self.prepare_state(batch[OBS_STATE])
|
||||
actions = self.prepare_action(batch[ACTION])
|
||||
model_inputs = self._get_model_inputs(batch, {OBS_STATE, ACTION})
|
||||
loss = self.model(states=state, action=actions, **model_inputs)
|
||||
|
||||
loss_dict = {"loss": loss.item()}
|
||||
return loss, loss_dict
|
||||
|
||||
@torch.no_grad()
|
||||
def predict_action_chunk(self, batch: dict[str, Tensor], **kwargs) -> Tensor:
|
||||
self.eval()
|
||||
|
||||
states = self.prepare_state(batch[OBS_STATE])
|
||||
model_inputs = self._get_model_inputs(batch, {OBS_STATE})
|
||||
actions = self.model.sample_actions(states=states, **model_inputs).to(torch.float32)
|
||||
|
||||
original_action_dim = self.config.output_features[ACTION].shape[0]
|
||||
return actions[:, :, :original_action_dim]
|
||||
|
||||
def prepare_state(self, state: Tensor) -> Tensor:
|
||||
return pad_vector(state, self.config.max_state_dim)
|
||||
|
||||
def prepare_action(self, action: Tensor) -> Tensor:
|
||||
return pad_vector(action, self.config.max_action_dim)
|
||||
|
||||
@torch.no_grad()
|
||||
def select_action(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
self.eval()
|
||||
|
||||
if len(self._action_queue) == 0:
|
||||
actions = self.predict_action_chunk(batch)[:, : self.config.n_action_steps]
|
||||
self._action_queue.extend(actions.transpose(0, 1))
|
||||
|
||||
return self._action_queue.popleft()
|
||||
|
||||
def get_optim_params(self) -> dict:
|
||||
return self.parameters()
|
||||
|
||||
|
||||
def get_safe_dtype(target_dtype, device_type):
|
||||
"""Get a safe dtype for the given device type."""
|
||||
if device_type == "mps" and target_dtype == torch.float64:
|
||||
return torch.float32
|
||||
if device_type == "cpu":
|
||||
# CPU doesn't support bfloat16, use float32 instead
|
||||
if target_dtype == torch.bfloat16:
|
||||
return torch.float32
|
||||
if target_dtype == torch.float64:
|
||||
return torch.float64
|
||||
return target_dtype
|
||||
|
||||
|
||||
def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedding` (exact copy)
|
||||
time: torch.Tensor, dimension: int, min_period: float, max_period: float, device="cpu"
|
||||
) -> Tensor:
|
||||
"""Computes sine-cosine positional embedding vectors for scalar positions."""
|
||||
if dimension % 2 != 0:
|
||||
raise ValueError(f"dimension ({dimension}) must be divisible by 2")
|
||||
|
||||
if time.ndim != 1:
|
||||
raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
|
||||
|
||||
dtype = get_safe_dtype(torch.float64, device.type)
|
||||
fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
|
||||
period = min_period * (max_period / min_period) ** fraction
|
||||
|
||||
# Compute the outer product
|
||||
scaling_factor = 1.0 / period * 2 * math.pi
|
||||
sin_input = scaling_factor[None, :] * time[:, None]
|
||||
return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
|
||||
|
||||
|
||||
def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta` (exact copy)
|
||||
# Beta sampling uses _sample_dirichlet which isn't implemented for MPS, so sample on CPU
|
||||
alpha_t = torch.tensor(alpha, dtype=torch.float32)
|
||||
beta_t = torch.tensor(beta, dtype=torch.float32)
|
||||
dist = torch.distributions.Beta(alpha_t, beta_t)
|
||||
return dist.sample((bsize,)).to(device)
|
||||
|
||||
|
||||
class EO1VisionActionProjector(torch.nn.Sequential):
|
||||
"""This block implements the multi-layer perceptron (MLP) module."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
num_layers: int = 2,
|
||||
activation_layer: str = "linear",
|
||||
bias: bool = True,
|
||||
device: Any = None,
|
||||
dtype: torch.dtype = torch.float32,
|
||||
):
|
||||
layers = []
|
||||
in_dim = in_channels
|
||||
hidden_channels = [in_dim] * (num_layers - 1) + [out_channels]
|
||||
for hidden_dim in hidden_channels[:-1]:
|
||||
layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device))
|
||||
layers.append(ACT2FN[activation_layer])
|
||||
in_dim = hidden_dim
|
||||
layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias, dtype=dtype, device=device))
|
||||
super().__init__(*layers)
|
||||
|
||||
@property
|
||||
def dtype(self):
|
||||
return self[0].weight.dtype
|
||||
|
||||
|
||||
class EO1VisionFlowMatchingModel(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: EO1Config,
|
||||
vlm_backbone: Qwen2_5_VLForConditionalGeneration | None = None,
|
||||
):
|
||||
require_package("transformers", extra="eo1")
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
# Preserve the backbone dtype selected at construction time so Qwen's fp32 rotary buffers stay intact.
|
||||
self.vlm_backbone = vlm_backbone
|
||||
self.hidden_size = self.vlm_backbone.config.text_config.hidden_size
|
||||
max_state_dim = config.max_state_dim
|
||||
max_action_dim = config.max_action_dim
|
||||
self.state_proj = nn.Linear(max_state_dim, self.hidden_size, dtype=torch.float32)
|
||||
self.action_in_proj = nn.Linear(max_action_dim, self.hidden_size, dtype=torch.float32)
|
||||
self.action_out_proj = EO1VisionActionProjector(
|
||||
self.hidden_size,
|
||||
max_action_dim,
|
||||
config.num_action_layers,
|
||||
config.action_act,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
self.action_time_mlp_in = nn.Linear(self.hidden_size * 2, self.hidden_size, dtype=torch.float32)
|
||||
self.action_time_mlp_out = nn.Linear(self.hidden_size, self.hidden_size, dtype=torch.float32)
|
||||
self.gradient_checkpointing_enabled = False
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.vlm_backbone.get_input_embeddings()
|
||||
|
||||
def flow_head_autocast_context(self):
|
||||
if self.config.force_fp32_autocast:
|
||||
return torch.autocast(
|
||||
device_type=self.state_proj.weight.device.type,
|
||||
enabled=False,
|
||||
)
|
||||
return contextlib.nullcontext()
|
||||
|
||||
def gradient_checkpointing_enable(self):
|
||||
"""Enable gradient checkpointing for the Qwen2.5-VL backbone."""
|
||||
self.gradient_checkpointing_enabled = True
|
||||
self.vlm_backbone.gradient_checkpointing_enable(
|
||||
gradient_checkpointing_kwargs={"use_reentrant": False}
|
||||
)
|
||||
logger.info("Enabled gradient checkpointing for EO1VisionFlowMatchingModel")
|
||||
|
||||
def gradient_checkpointing_disable(self):
|
||||
"""Disable gradient checkpointing for the Qwen2.5-VL backbone."""
|
||||
self.gradient_checkpointing_enabled = False
|
||||
self.vlm_backbone.gradient_checkpointing_disable()
|
||||
logger.info("Disabled gradient checkpointing for EO1VisionFlowMatchingModel")
|
||||
|
||||
def _apply_checkpoint(self, func, *args, **kwargs):
|
||||
"""Apply manual gradient checkpointing to EO1 flow-head computations when training."""
|
||||
if self.gradient_checkpointing_enabled and self.training and torch.is_grad_enabled():
|
||||
return torch.utils.checkpoint.checkpoint(
|
||||
func, *args, use_reentrant=False, preserve_rng_state=False, **kwargs
|
||||
)
|
||||
return func(*args, **kwargs)
|
||||
|
||||
def sample_noise(self, shape, device):
|
||||
noise = torch.normal(
|
||||
mean=0.0,
|
||||
std=1.0,
|
||||
size=shape,
|
||||
dtype=torch.float32,
|
||||
device=device,
|
||||
)
|
||||
return noise
|
||||
|
||||
def sample_time(self, bsize, device):
|
||||
time_beta = sample_beta(
|
||||
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
|
||||
)
|
||||
time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
|
||||
return time.to(dtype=torch.float32, device=device)
|
||||
|
||||
def get_placeholder_mask(
|
||||
self,
|
||||
input_ids: torch.LongTensor | None,
|
||||
inputs_embeds: torch.FloatTensor | None,
|
||||
state_features: torch.FloatTensor | None = None,
|
||||
action_features: torch.FloatTensor | None = None,
|
||||
*,
|
||||
state_token_id: int,
|
||||
action_token_id: int,
|
||||
) -> tuple[torch.BoolTensor, torch.BoolTensor]:
|
||||
"""Return EO1 state/action placeholder masks, following Qwen's multimodal mask style."""
|
||||
if input_ids is None:
|
||||
special_state_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(state_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_state_mask = special_state_mask.all(-1)
|
||||
special_action_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(action_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_action_mask = special_action_mask.all(-1)
|
||||
else:
|
||||
special_state_mask = input_ids == state_token_id
|
||||
special_action_mask = input_ids == action_token_id
|
||||
|
||||
n_state_tokens = special_state_mask.sum()
|
||||
special_state_mask = (
|
||||
special_state_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
)
|
||||
if state_features is not None:
|
||||
torch_compilable_check(
|
||||
inputs_embeds[special_state_mask].numel() == state_features.numel(),
|
||||
f"State features and state tokens do not match, tokens: {n_state_tokens}, features: {state_features.shape[0]}",
|
||||
)
|
||||
|
||||
n_action_tokens = special_action_mask.sum()
|
||||
special_action_mask = (
|
||||
special_action_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
)
|
||||
if action_features is not None:
|
||||
torch_compilable_check(
|
||||
inputs_embeds[special_action_mask].numel() == action_features.numel(),
|
||||
f"Action features and action tokens do not match, tokens: {n_action_tokens}, features: {action_features.shape[0]}",
|
||||
)
|
||||
|
||||
return special_state_mask, special_action_mask
|
||||
|
||||
def embed_prefix(
|
||||
self,
|
||||
input_ids: torch.LongTensor,
|
||||
states: torch.Tensor,
|
||||
*,
|
||||
state_token_id: int,
|
||||
action_token_id: int,
|
||||
) -> torch.FloatTensor:
|
||||
"""Embed the EO1 prefix tokens before native Qwen injects multimodal features."""
|
||||
|
||||
# Get the input embeddings for the input IDs
|
||||
def input_embed_func(input_ids: torch.LongTensor) -> torch.FloatTensor:
|
||||
return self.get_input_embeddings()(input_ids)
|
||||
|
||||
inputs_embeds = self._apply_checkpoint(input_embed_func, input_ids)
|
||||
|
||||
# Project the states to the hidden size
|
||||
def state_proj_func(states: torch.Tensor) -> torch.FloatTensor:
|
||||
with self.flow_head_autocast_context():
|
||||
states = states.to(dtype=self.state_proj.weight.dtype)
|
||||
return self.state_proj(states)
|
||||
|
||||
state_embs = self._apply_checkpoint(state_proj_func, states)
|
||||
state_mask, _ = self.get_placeholder_mask(
|
||||
input_ids,
|
||||
inputs_embeds,
|
||||
state_features=state_embs,
|
||||
state_token_id=state_token_id,
|
||||
action_token_id=action_token_id,
|
||||
)
|
||||
state_embs = state_embs.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(state_mask, state_embs)
|
||||
return inputs_embeds
|
||||
|
||||
def embed_suffix(
|
||||
self,
|
||||
timestep: torch.Tensor,
|
||||
noisy_actions: torch.Tensor,
|
||||
) -> torch.FloatTensor:
|
||||
"""Embed the suffix"""
|
||||
|
||||
def action_proj_func(noisy_actions: torch.Tensor) -> torch.FloatTensor:
|
||||
with self.flow_head_autocast_context():
|
||||
noisy_actions = noisy_actions.to(dtype=self.action_in_proj.weight.dtype)
|
||||
return self.action_in_proj(noisy_actions)
|
||||
|
||||
action_embs = self._apply_checkpoint(action_proj_func, noisy_actions)
|
||||
time_embs = create_sinusoidal_pos_embedding(
|
||||
timestep,
|
||||
self.hidden_size,
|
||||
min_period=self.config.min_period,
|
||||
max_period=self.config.max_period,
|
||||
device=action_embs.device,
|
||||
)
|
||||
time_embs = time_embs.to(dtype=action_embs.dtype)
|
||||
time_embs = time_embs[:, None, :].expand_as(action_embs)
|
||||
action_time_embs = torch.cat([action_embs, time_embs], dim=2)
|
||||
|
||||
def mlp_func(action_time_embs: torch.Tensor) -> torch.FloatTensor:
|
||||
with self.flow_head_autocast_context():
|
||||
action_time_embs = action_time_embs.to(dtype=self.action_time_mlp_in.weight.dtype)
|
||||
action_time_embs = self.action_time_mlp_in(action_time_embs)
|
||||
action_time_embs = F.silu(action_time_embs)
|
||||
return self.action_time_mlp_out(action_time_embs)
|
||||
|
||||
action_time_embs = self._apply_checkpoint(mlp_func, action_time_embs)
|
||||
return action_time_embs
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor | None = None,
|
||||
attention_mask: torch.LongTensor | None = None,
|
||||
pixel_values: torch.FloatTensor | None = None,
|
||||
image_grid_thw: torch.LongTensor | None = None,
|
||||
mm_token_type_ids: torch.IntTensor | None = None,
|
||||
states: torch.FloatTensor | None = None,
|
||||
action: torch.FloatTensor | None = None,
|
||||
action_is_pad: torch.BoolTensor | None = None,
|
||||
*,
|
||||
state_token_id: int,
|
||||
action_token_id: int,
|
||||
**kwargs,
|
||||
) -> Tensor:
|
||||
"""Run the EO1 training forward pass and compute the flow-matching loss."""
|
||||
|
||||
# 1. Build the EO1 prefix with state placeholders resolved.
|
||||
inputs_embeds = self.embed_prefix(
|
||||
input_ids,
|
||||
states=states,
|
||||
state_token_id=state_token_id,
|
||||
action_token_id=action_token_id,
|
||||
)
|
||||
|
||||
# 2. Sample the diffusion target and replace the action placeholders.
|
||||
time = self.sample_time(action.shape[0], inputs_embeds.device)
|
||||
noise = self.sample_noise(action.shape, inputs_embeds.device)
|
||||
|
||||
time_expanded = time[:, None, None]
|
||||
x_t = time_expanded * noise + (1 - time_expanded) * action
|
||||
u_t = noise - action
|
||||
action_time_embs = self.embed_suffix(time, x_t)
|
||||
_, action_mask = self.get_placeholder_mask(
|
||||
input_ids,
|
||||
inputs_embeds,
|
||||
action_features=action_time_embs,
|
||||
state_token_id=state_token_id,
|
||||
action_token_id=action_token_id,
|
||||
)
|
||||
action_time_embs = action_time_embs.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(action_mask, action_time_embs)
|
||||
|
||||
# 3. Optionally drop padded action tokens from backbone attention.
|
||||
if attention_mask is not None:
|
||||
attention_mask = attention_mask.to(inputs_embeds.device)
|
||||
|
||||
if not self.config.supervise_padding_actions:
|
||||
action_is_pad = action_is_pad.to(device=inputs_embeds.device, dtype=torch.bool)
|
||||
action_token_mask = action_mask[..., 0]
|
||||
action_padding_mask = torch.zeros_like(action_token_mask)
|
||||
action_padding_mask = action_padding_mask.masked_scatter(
|
||||
action_token_mask,
|
||||
action_is_pad.reshape(-1),
|
||||
)
|
||||
attention_mask = attention_mask.masked_fill(action_padding_mask, 0)
|
||||
|
||||
# 4. Run the Qwen backbone on the fused EO1 sequence.
|
||||
def vlm_forward_func(
|
||||
input_ids: torch.LongTensor,
|
||||
attention_mask: torch.Tensor | None,
|
||||
inputs_embeds: torch.FloatTensor,
|
||||
pixel_values: torch.Tensor | None,
|
||||
image_grid_thw: torch.LongTensor | None,
|
||||
mm_token_type_ids: torch.IntTensor | None,
|
||||
) -> torch.FloatTensor:
|
||||
outputs = self.vlm_backbone.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
pixel_values=pixel_values,
|
||||
image_grid_thw=image_grid_thw,
|
||||
mm_token_type_ids=mm_token_type_ids,
|
||||
use_cache=False,
|
||||
output_hidden_states=False,
|
||||
return_dict=True,
|
||||
)
|
||||
return outputs.last_hidden_state
|
||||
|
||||
hidden_states = self._apply_checkpoint(
|
||||
vlm_forward_func,
|
||||
input_ids,
|
||||
attention_mask,
|
||||
inputs_embeds,
|
||||
pixel_values,
|
||||
image_grid_thw,
|
||||
mm_token_type_ids,
|
||||
)
|
||||
action_hidden_states = hidden_states[action_mask[..., 0]]
|
||||
|
||||
# 5. Project the action-token hidden states back to the flow target space.
|
||||
def action_out_proj_func(action_hidden_states: torch.FloatTensor) -> torch.FloatTensor:
|
||||
with self.flow_head_autocast_context():
|
||||
action_hidden_states = action_hidden_states.to(dtype=self.action_out_proj.dtype)
|
||||
return self.action_out_proj(action_hidden_states)
|
||||
|
||||
v_t = self._apply_checkpoint(action_out_proj_func, action_hidden_states)
|
||||
v_t = v_t.reshape(u_t.shape).to(dtype=u_t.dtype)
|
||||
losses = F.mse_loss(u_t, v_t, reduction="none")
|
||||
|
||||
# 6. Apply the configured supervision mask and reduce the loss.
|
||||
if not self.config.supervise_padding_action_dims:
|
||||
original_action_dim = self.config.output_features[ACTION].shape[0]
|
||||
losses = losses[..., :original_action_dim]
|
||||
|
||||
if not self.config.supervise_padding_actions:
|
||||
losses = losses[~action_is_pad]
|
||||
|
||||
return losses.mean()
|
||||
|
||||
@torch.no_grad()
|
||||
def sample_actions(
|
||||
self,
|
||||
input_ids: torch.LongTensor | None = None,
|
||||
attention_mask: torch.Tensor | None = None,
|
||||
pixel_values: torch.Tensor | None = None,
|
||||
image_grid_thw: torch.LongTensor | None = None,
|
||||
mm_token_type_ids: torch.IntTensor | None = None,
|
||||
states: torch.Tensor | None = None,
|
||||
*,
|
||||
state_token_id: int,
|
||||
action_token_id: int,
|
||||
**kwargs,
|
||||
) -> Tensor:
|
||||
"""Sample actions from the model."""
|
||||
if states is None:
|
||||
raise ValueError("states are required for EO1 action sampling.")
|
||||
if mm_token_type_ids is None:
|
||||
raise ValueError("mm_token_type_ids are required for EO1 action sampling.")
|
||||
|
||||
# 1. Resolve the left-padded rollout prompt and locate the action span.
|
||||
chunk_size = self.config.chunk_size
|
||||
|
||||
inputs_embeds = self.embed_prefix(
|
||||
input_ids,
|
||||
states=states,
|
||||
state_token_id=state_token_id,
|
||||
action_token_id=action_token_id,
|
||||
).clone()
|
||||
_, action_placeholder_mask = self.get_placeholder_mask(
|
||||
input_ids,
|
||||
inputs_embeds,
|
||||
state_token_id=state_token_id,
|
||||
action_token_id=action_token_id,
|
||||
)
|
||||
action_mask = action_placeholder_mask[..., 0]
|
||||
token_counts = action_mask.sum(dim=1)
|
||||
if not torch.all(token_counts == chunk_size):
|
||||
raise ValueError(
|
||||
f"Each sample must contain exactly {chunk_size} action tokens, got {token_counts.tolist()}."
|
||||
)
|
||||
if action_mask.ne(action_mask[:1]).any():
|
||||
raise ValueError(
|
||||
"Batch inference expects all samples to share the same action token mask after left padding."
|
||||
)
|
||||
act_start = int(action_mask[0].to(torch.int64).argmax().item())
|
||||
act_end = act_start + self.config.chunk_size
|
||||
if not torch.all(action_mask[:, act_start:act_end]):
|
||||
raise ValueError("Action tokens must form a contiguous chunk of length chunk_size.")
|
||||
act_slice = slice(act_start, act_end)
|
||||
|
||||
# 2. Encode the fixed prefix once and cache its KV state.
|
||||
batch_size = input_ids.shape[0]
|
||||
device = inputs_embeds.device
|
||||
attention_mask = attention_mask.to(device)
|
||||
mm_token_type_ids = mm_token_type_ids.to(device)
|
||||
position_ids, _ = self.vlm_backbone.model.get_rope_index(
|
||||
input_ids,
|
||||
image_grid_thw=image_grid_thw,
|
||||
attention_mask=attention_mask,
|
||||
mm_token_type_ids=mm_token_type_ids,
|
||||
)
|
||||
position_ids = position_ids.to(device)
|
||||
|
||||
outputs = self.vlm_backbone.model(
|
||||
input_ids=input_ids[:, :act_start],
|
||||
attention_mask=attention_mask[:, :act_start],
|
||||
position_ids=position_ids[..., :act_start],
|
||||
inputs_embeds=inputs_embeds[:, :act_start],
|
||||
pixel_values=pixel_values,
|
||||
image_grid_thw=image_grid_thw,
|
||||
mm_token_type_ids=mm_token_type_ids[:, :act_start],
|
||||
use_cache=True,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
x_t = self.sample_noise(
|
||||
(batch_size, chunk_size, self.config.max_action_dim),
|
||||
device,
|
||||
).to(dtype=self.action_in_proj.weight.dtype)
|
||||
dt = -1.0 / self.config.num_denoise_steps
|
||||
past_key_values = outputs.past_key_values
|
||||
|
||||
# 3. Denoise only the action chunk while keeping the prefix cache invariant.
|
||||
for step in range(self.config.num_denoise_steps):
|
||||
time = torch.full(
|
||||
(batch_size,),
|
||||
1.0 + step * dt,
|
||||
device=device,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
action_time_embs = self.embed_suffix(time, x_t)
|
||||
inputs_embeds[:, act_slice] = action_time_embs.to(inputs_embeds.dtype)
|
||||
|
||||
# Keep the prefix KV cache invariant across denoising steps.
|
||||
past_key_values.crop(act_start)
|
||||
outputs = self.vlm_backbone.model(
|
||||
attention_mask=attention_mask[:, :act_end],
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds[:, act_slice],
|
||||
position_ids=position_ids[..., act_slice],
|
||||
use_cache=True,
|
||||
return_dict=True,
|
||||
)
|
||||
with self.flow_head_autocast_context():
|
||||
hidden_states = outputs.last_hidden_state[:, :chunk_size]
|
||||
hidden_states = hidden_states.to(dtype=self.action_out_proj.dtype)
|
||||
v_t = self.action_out_proj(hidden_states)
|
||||
|
||||
x_t += dt * v_t.reshape(x_t.shape)
|
||||
|
||||
return x_t
|
||||
@@ -1,282 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
|
||||
from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature
|
||||
from lerobot.policies.eo1.configuration_eo1 import EO1Config
|
||||
from lerobot.processor import (
|
||||
AddBatchDimensionProcessorStep,
|
||||
ComplementaryDataProcessorStep,
|
||||
DeviceProcessorStep,
|
||||
NormalizerProcessorStep,
|
||||
PolicyAction,
|
||||
PolicyProcessorPipeline,
|
||||
ProcessorStep,
|
||||
ProcessorStepRegistry,
|
||||
RenameObservationsProcessorStep,
|
||||
UnnormalizerProcessorStep,
|
||||
)
|
||||
from lerobot.processor.converters import policy_action_to_transition, transition_to_policy_action
|
||||
from lerobot.types import TransitionKey
|
||||
from lerobot.utils.constants import (
|
||||
OBS_STATE,
|
||||
POLICY_POSTPROCESSOR_DEFAULT_NAME,
|
||||
POLICY_PREPROCESSOR_DEFAULT_NAME,
|
||||
)
|
||||
from lerobot.utils.import_utils import _transformers_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
|
||||
else:
|
||||
Qwen2_5_VLProcessor = None
|
||||
|
||||
SYSTEM_MESSAGE = "You are a helpful physical assistant."
|
||||
|
||||
# EO-1 special tokens
|
||||
ACTION_START_TOKEN = "<|action_start|>" # nosec B105
|
||||
DEFAULT_ACTION_TOKEN = "<|action_pad|>" # nosec B105
|
||||
ACTION_END_TOKEN = "<|action_end|>" # nosec B105
|
||||
STATE_START_TOKEN = "<|state_start|>" # nosec B105
|
||||
DEFAULT_STATE_TOKEN = "<|state_pad|>" # nosec B105
|
||||
STATE_END_TOKEN = "<|state_end|>" # nosec B105
|
||||
TASK_VLA_TOKEN = "<|vla|>" # nosec B105
|
||||
|
||||
EO1_SPECIAL_TOKENS = [
|
||||
ACTION_START_TOKEN,
|
||||
DEFAULT_ACTION_TOKEN,
|
||||
ACTION_END_TOKEN,
|
||||
STATE_START_TOKEN,
|
||||
DEFAULT_STATE_TOKEN,
|
||||
STATE_END_TOKEN,
|
||||
TASK_VLA_TOKEN,
|
||||
]
|
||||
|
||||
|
||||
@dataclass
|
||||
@ProcessorStepRegistry.register(name="eo1_conversation_template_processor")
|
||||
class EO1ConversationTemplateStep(ComplementaryDataProcessorStep):
|
||||
input_features: dict[str, PolicyFeature] | dict[str, dict[str, Any]]
|
||||
chunk_size: int
|
||||
|
||||
_image_keys: list[str] = field(default_factory=list, init=False, repr=False)
|
||||
|
||||
def __post_init__(self):
|
||||
# Robust JSON deserialization handling (guard empty maps).
|
||||
if self.input_features:
|
||||
first_val = next(iter(self.input_features.values()))
|
||||
if isinstance(first_val, dict):
|
||||
reconstructed = {}
|
||||
for key, ft_dict in self.input_features.items():
|
||||
reconstructed[key] = PolicyFeature(
|
||||
type=FeatureType(ft_dict["type"]), shape=tuple(ft_dict["shape"])
|
||||
)
|
||||
self.input_features = reconstructed
|
||||
|
||||
self._image_keys = [
|
||||
key for key, value in self.input_features.items() if value.type == FeatureType.VISUAL
|
||||
]
|
||||
|
||||
def complementary_data(self, complementary_data):
|
||||
tasks = complementary_data.get("task")
|
||||
if tasks is None:
|
||||
raise ValueError("Task is required for EO1ConversationTemplateStep.")
|
||||
|
||||
observation = self.transition.get(TransitionKey.OBSERVATION)
|
||||
if observation is None:
|
||||
raise ValueError("Observation is required for EO1ConversationTemplateStep.")
|
||||
|
||||
if OBS_STATE in observation and observation[OBS_STATE].shape[0] != len(tasks):
|
||||
raise ValueError("Batch size mismatch between observation.state and task list.")
|
||||
|
||||
# LeRobot visual observations reach in processor as float32 tensors in [0, 1].
|
||||
# Convert to uint8 in [0, 255] to meet the input requirement of Qwen2.5-VL-3B-Instruct.
|
||||
images = {
|
||||
key: observation[key].clamp(0, 1).mul(255.0).round().to(torch.uint8) for key in self._image_keys
|
||||
}
|
||||
messages = []
|
||||
for i in range(len(tasks)):
|
||||
content = [
|
||||
*[{"type": "image", "image": images[key][i]} for key in self._image_keys],
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
f"{STATE_START_TOKEN}{DEFAULT_STATE_TOKEN}{STATE_END_TOKEN}{tasks[i]}{TASK_VLA_TOKEN}"
|
||||
),
|
||||
},
|
||||
]
|
||||
messages.append(
|
||||
[
|
||||
{"role": "system", "content": [{"type": "text", "text": SYSTEM_MESSAGE}]},
|
||||
{"role": "user", "content": content},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"{ACTION_START_TOKEN}{DEFAULT_ACTION_TOKEN * self.chunk_size}{ACTION_END_TOKEN}",
|
||||
}
|
||||
],
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
complementary_data["messages"] = messages
|
||||
|
||||
return complementary_data
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
"""
|
||||
This step only materializes EO1-specific message objects in complementary_data.
|
||||
PipelineFeatureType tracks only ACTION and OBSERVATION, so there is no static
|
||||
feature contract change to record here.
|
||||
"""
|
||||
return features
|
||||
|
||||
def get_config(self) -> dict[str, Any]:
|
||||
return {
|
||||
"input_features": {
|
||||
key: {"type": ft.type.value, "shape": ft.shape} for key, ft in self.input_features.items()
|
||||
},
|
||||
"chunk_size": self.chunk_size,
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
@ProcessorStepRegistry.register(name="eo1_qwen_processor")
|
||||
class EO1QwenProcessorStep(ComplementaryDataProcessorStep):
|
||||
processor_name: str = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
image_min_pixels: int | None = 64 * 28 * 28
|
||||
image_max_pixels: int | None = 128 * 28 * 28
|
||||
use_fast_processor: bool = False
|
||||
|
||||
_processor: Qwen2_5_VLProcessor | None = field(default=None, init=False, repr=False)
|
||||
_state_token_id: int | None = field(default=None, init=False, repr=False)
|
||||
_action_token_id: int | None = field(default=None, init=False, repr=False)
|
||||
|
||||
def __post_init__(self):
|
||||
require_package("transformers", extra="eo1")
|
||||
self._processor = Qwen2_5_VLProcessor.from_pretrained(
|
||||
self.processor_name,
|
||||
use_fast=self.use_fast_processor,
|
||||
)
|
||||
self._processor.tokenizer.add_tokens(EO1_SPECIAL_TOKENS, special_tokens=True)
|
||||
self._state_token_id = self._processor.tokenizer.convert_tokens_to_ids(DEFAULT_STATE_TOKEN)
|
||||
self._action_token_id = self._processor.tokenizer.convert_tokens_to_ids(DEFAULT_ACTION_TOKEN)
|
||||
|
||||
def complementary_data(self, complementary_data):
|
||||
messages = complementary_data.pop("messages", None)
|
||||
if messages is None:
|
||||
raise ValueError("Messages are required for EO1QwenProcessorStep.")
|
||||
|
||||
# Rollout batches use left padding so action spans stay aligned across samples.
|
||||
# Supervised batches use right padding to match standard training collation.
|
||||
padding_side = "right" if self.transition.get(TransitionKey.ACTION) is not None else "left"
|
||||
|
||||
inputs = self._processor.apply_chat_template(
|
||||
messages,
|
||||
tokenize=True,
|
||||
padding=True,
|
||||
padding_side=padding_side,
|
||||
min_pixels=self.image_min_pixels,
|
||||
max_pixels=self.image_max_pixels,
|
||||
add_generation_prompt=False,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
complementary_data["input_ids"] = inputs["input_ids"]
|
||||
complementary_data["pixel_values"] = inputs["pixel_values"]
|
||||
complementary_data["image_grid_thw"] = inputs["image_grid_thw"]
|
||||
complementary_data["attention_mask"] = inputs["attention_mask"]
|
||||
complementary_data["mm_token_type_ids"] = inputs["mm_token_type_ids"]
|
||||
complementary_data["state_token_id"] = self._state_token_id
|
||||
complementary_data["action_token_id"] = self._action_token_id
|
||||
|
||||
return complementary_data
|
||||
|
||||
def get_config(self) -> dict[str, Any]:
|
||||
return {
|
||||
"processor_name": self.processor_name,
|
||||
"image_min_pixels": self.image_min_pixels,
|
||||
"image_max_pixels": self.image_max_pixels,
|
||||
"use_fast_processor": self.use_fast_processor,
|
||||
}
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
"""
|
||||
This step only converts the messages to the model input format.
|
||||
"""
|
||||
return features
|
||||
|
||||
|
||||
def make_eo1_pre_post_processors(
|
||||
config: EO1Config,
|
||||
dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
|
||||
) -> tuple[
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction],
|
||||
]:
|
||||
"""Build pre/post processor pipelines for EO1."""
|
||||
|
||||
input_steps: list[ProcessorStep] = [
|
||||
RenameObservationsProcessorStep(rename_map={}),
|
||||
AddBatchDimensionProcessorStep(),
|
||||
NormalizerProcessorStep(
|
||||
features={**config.input_features, **config.output_features},
|
||||
norm_map=config.normalization_mapping,
|
||||
stats=dataset_stats,
|
||||
),
|
||||
EO1ConversationTemplateStep(input_features=config.input_features, chunk_size=config.chunk_size),
|
||||
EO1QwenProcessorStep(
|
||||
processor_name=config.vlm_base,
|
||||
image_min_pixels=config.image_min_pixels,
|
||||
image_max_pixels=config.image_max_pixels,
|
||||
use_fast_processor=config.use_fast_processor,
|
||||
),
|
||||
DeviceProcessorStep(device=config.device),
|
||||
]
|
||||
|
||||
output_steps: list[ProcessorStep] = [
|
||||
UnnormalizerProcessorStep(
|
||||
features=config.output_features,
|
||||
norm_map=config.normalization_mapping,
|
||||
stats=dataset_stats,
|
||||
),
|
||||
DeviceProcessorStep(device="cpu"),
|
||||
]
|
||||
|
||||
return (
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
|
||||
steps=input_steps,
|
||||
name=POLICY_PREPROCESSOR_DEFAULT_NAME,
|
||||
),
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction](
|
||||
steps=output_steps,
|
||||
name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
|
||||
to_transition=policy_action_to_transition,
|
||||
to_output=transition_to_policy_action,
|
||||
),
|
||||
)
|
||||
@@ -46,13 +46,14 @@ from lerobot.utils.feature_utils import dataset_to_policy_features
|
||||
|
||||
from .act.configuration_act import ACTConfig
|
||||
from .diffusion.configuration_diffusion import DiffusionConfig
|
||||
from .eo1.configuration_eo1 import EO1Config
|
||||
from .groot.configuration_groot import GrootConfig
|
||||
from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
|
||||
from .pi0.configuration_pi0 import PI0Config
|
||||
from .pi05.configuration_pi05 import PI05Config
|
||||
from .pretrained import PreTrainedPolicy
|
||||
from .sac.configuration_sac import SACConfig
|
||||
from .sac.reward_model.configuration_classifier import RewardClassifierConfig
|
||||
from .sarm.configuration_sarm import SARMConfig
|
||||
from .smolvla.configuration_smolvla import SmolVLAConfig
|
||||
from .tdmpc.configuration_tdmpc import TDMPCConfig
|
||||
from .utils import validate_visual_features_consistency
|
||||
@@ -88,7 +89,7 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
|
||||
Args:
|
||||
name: The name of the policy. Supported names are "tdmpc", "diffusion", "act",
|
||||
"multi_task_dit", "vqbet", "pi0", "pi05", "sac", "smolvla", "wall_x".
|
||||
"multi_task_dit", "vqbet", "pi0", "pi05", "sac", "reward_classifier", "smolvla", "wall_x".
|
||||
Returns:
|
||||
The policy class corresponding to the given name.
|
||||
|
||||
@@ -131,10 +132,18 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
from .sac.modeling_sac import SACPolicy
|
||||
|
||||
return SACPolicy
|
||||
elif name == "reward_classifier":
|
||||
from .sac.reward_model.modeling_classifier import Classifier
|
||||
|
||||
return Classifier
|
||||
elif name == "smolvla":
|
||||
from .smolvla.modeling_smolvla import SmolVLAPolicy
|
||||
|
||||
return SmolVLAPolicy
|
||||
elif name == "sarm":
|
||||
from .sarm.modeling_sarm import SARMRewardModel
|
||||
|
||||
return SARMRewardModel
|
||||
elif name == "groot":
|
||||
from .groot.modeling_groot import GrootPolicy
|
||||
|
||||
@@ -147,10 +156,6 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
|
||||
from .wall_x.modeling_wall_x import WallXPolicy
|
||||
|
||||
return WallXPolicy
|
||||
elif name == "eo1":
|
||||
from .eo1.modeling_eo1 import EO1Policy
|
||||
|
||||
return EO1Policy
|
||||
else:
|
||||
try:
|
||||
return _get_policy_cls_from_policy_name(name=name)
|
||||
@@ -168,7 +173,7 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
|
||||
Args:
|
||||
policy_type: The type of the policy. Supported types include "tdmpc",
|
||||
"multi_task_dit", "diffusion", "act", "vqbet", "pi0", "pi05", "sac",
|
||||
"smolvla", "wall_x".
|
||||
"smolvla", "reward_classifier", "wall_x".
|
||||
**kwargs: Keyword arguments to be passed to the configuration class constructor.
|
||||
|
||||
Returns:
|
||||
@@ -195,14 +200,14 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
|
||||
return SACConfig(**kwargs)
|
||||
elif policy_type == "smolvla":
|
||||
return SmolVLAConfig(**kwargs)
|
||||
elif policy_type == "reward_classifier":
|
||||
return RewardClassifierConfig(**kwargs)
|
||||
elif policy_type == "groot":
|
||||
return GrootConfig(**kwargs)
|
||||
elif policy_type == "xvla":
|
||||
return XVLAConfig(**kwargs)
|
||||
elif policy_type == "wall_x":
|
||||
return WallXConfig(**kwargs)
|
||||
elif policy_type == "eo1":
|
||||
return EO1Config(**kwargs)
|
||||
else:
|
||||
try:
|
||||
config_cls = PreTrainedConfig.get_choice_class(policy_type)
|
||||
@@ -373,6 +378,14 @@ def make_pre_post_processors(
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
elif isinstance(policy_cfg, RewardClassifierConfig):
|
||||
from .sac.reward_model.processor_classifier import make_classifier_processor
|
||||
|
||||
processors = make_classifier_processor(
|
||||
config=policy_cfg,
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
elif isinstance(policy_cfg, SmolVLAConfig):
|
||||
from .smolvla.processor_smolvla import make_smolvla_pre_post_processors
|
||||
|
||||
@@ -381,6 +394,14 @@ def make_pre_post_processors(
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
elif isinstance(policy_cfg, SARMConfig):
|
||||
from .sarm.processor_sarm import make_sarm_pre_post_processors
|
||||
|
||||
processors = make_sarm_pre_post_processors(
|
||||
config=policy_cfg,
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
dataset_meta=kwargs.get("dataset_meta"),
|
||||
)
|
||||
elif isinstance(policy_cfg, GrootConfig):
|
||||
from .groot.processor_groot import make_groot_pre_post_processors
|
||||
|
||||
@@ -406,13 +427,6 @@ def make_pre_post_processors(
|
||||
config=policy_cfg,
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
elif isinstance(policy_cfg, EO1Config):
|
||||
from .eo1.processor_eo1 import make_eo1_pre_post_processors
|
||||
|
||||
processors = make_eo1_pre_post_processors(
|
||||
config=policy_cfg,
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
else:
|
||||
try:
|
||||
@@ -528,7 +542,7 @@ def make_policy(
|
||||
|
||||
logging.info("Loading policy's PEFT adapter.")
|
||||
|
||||
peft_pretrained_path = str(cfg.pretrained_path)
|
||||
peft_pretrained_path = cfg.pretrained_path
|
||||
peft_config = PeftConfig.from_pretrained(peft_pretrained_path)
|
||||
|
||||
kwargs["pretrained_name_or_path"] = peft_config.base_model_name_or_path
|
||||
@@ -541,9 +555,7 @@ def make_policy(
|
||||
)
|
||||
|
||||
policy = policy_cls.from_pretrained(**kwargs)
|
||||
policy = PeftModel.from_pretrained(
|
||||
policy, peft_pretrained_path, config=peft_config, is_trainable=True
|
||||
)
|
||||
policy = PeftModel.from_pretrained(policy, peft_pretrained_path, config=peft_config)
|
||||
|
||||
else:
|
||||
# Make a fresh policy.
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import field
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
@@ -109,6 +109,7 @@ class MultiEmbodimentActionEncoder(nn.Module):
|
||||
return x
|
||||
|
||||
|
||||
@dataclass
|
||||
class FlowmatchingActionHeadConfig(PretrainedConfig):
|
||||
"""NOTE: N1.5 uses XEmbFlowmatchingPolicyHeadConfig as action head"""
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -173,14 +174,17 @@ N_COLOR_CHANNELS = 3
|
||||
|
||||
|
||||
# config
|
||||
@dataclass
|
||||
class GR00TN15Config(PretrainedConfig):
|
||||
model_type = "gr00t_n1_5"
|
||||
backbone_cfg: dict = field(init=False, metadata={"help": "Backbone configuration."})
|
||||
|
||||
backbone_cfg: dict
|
||||
action_head_cfg: dict
|
||||
action_horizon: int
|
||||
action_dim: int
|
||||
compute_dtype: str = "float32"
|
||||
action_head_cfg: dict = field(init=False, metadata={"help": "Action head configuration."})
|
||||
|
||||
action_horizon: int = field(init=False, metadata={"help": "Action horizon."})
|
||||
|
||||
action_dim: int = field(init=False, metadata={"help": "Action dimension."})
|
||||
compute_dtype: str = field(default="float32", metadata={"help": "Compute dtype."})
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -688,9 +688,8 @@ class DiffusionObjective(nn.Module):
|
||||
loss = F.mse_loss(predicted, target, reduction="none")
|
||||
|
||||
if self.do_mask_loss_for_padding and "action_is_pad" in batch:
|
||||
mask = ~batch["action_is_pad"].unsqueeze(-1)
|
||||
num_valid = mask.sum() * loss.shape[-1]
|
||||
return (loss * mask).sum() / num_valid.clamp_min(1)
|
||||
valid_actions = ~batch["action_is_pad"]
|
||||
loss = loss * valid_actions.unsqueeze(-1)
|
||||
|
||||
return loss.mean()
|
||||
|
||||
@@ -753,9 +752,8 @@ class FlowMatchingObjective(nn.Module):
|
||||
loss = F.mse_loss(predicted_velocity, target_velocity, reduction="none")
|
||||
|
||||
if self.do_mask_loss_for_padding and "action_is_pad" in batch:
|
||||
mask = ~batch["action_is_pad"].unsqueeze(-1)
|
||||
num_valid = mask.sum() * loss.shape[-1]
|
||||
return (loss * mask).sum() / num_valid.clamp_min(1)
|
||||
valid_mask = ~batch["action_is_pad"]
|
||||
loss = loss * valid_mask.unsqueeze(-1)
|
||||
|
||||
return loss.mean()
|
||||
|
||||
|
||||
@@ -444,13 +444,13 @@ class PaliGemmaWithExpertModel(
|
||||
if image.dtype != torch.float32:
|
||||
image = image.to(torch.float32)
|
||||
image_outputs = self.paligemma.model.get_image_features(image)
|
||||
features = image_outputs.pooler_output
|
||||
features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
|
||||
if features.dtype != out_dtype:
|
||||
features = features.to(out_dtype)
|
||||
return features
|
||||
|
||||
def embed_language_tokens(self, tokens: torch.Tensor):
|
||||
return self.paligemma.model.language_model.get_input_embeddings()(tokens)
|
||||
return self.paligemma.model.language_model.embed_tokens(tokens)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -666,7 +666,8 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch`
|
||||
# Process language tokens
|
||||
def lang_embed_func(lang_tokens):
|
||||
lang_emb = self.paligemma_with_expert.embed_language_tokens(lang_tokens)
|
||||
return lang_emb
|
||||
lang_emb_dim = lang_emb.shape[-1]
|
||||
return lang_emb * math.sqrt(lang_emb_dim)
|
||||
|
||||
lang_emb = self._apply_checkpoint(lang_embed_func, lang_tokens)
|
||||
embs.append(lang_emb)
|
||||
@@ -747,8 +748,16 @@ class PI0Pytorch(nn.Module): # see openpi `PI0Pytorch`
|
||||
|
||||
return embs, pad_masks, att_masks, adarms_cond
|
||||
|
||||
def forward(self, images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) -> Tensor:
|
||||
def forward(
|
||||
self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
|
||||
) -> Tensor:
|
||||
"""Do a full training forward pass and compute the loss."""
|
||||
if noise is None:
|
||||
noise = self.sample_noise(actions.shape, actions.device)
|
||||
|
||||
if time is None:
|
||||
time = self.sample_time(actions.shape[0], actions.device)
|
||||
|
||||
time_expanded = time[:, None, None]
|
||||
x_t = time_expanded * noise + (1 - time_expanded) * actions
|
||||
u_t = noise - actions
|
||||
@@ -1283,11 +1292,8 @@ class PI0Policy(PreTrainedPolicy):
|
||||
state = self.prepare_state(batch)
|
||||
actions = self.prepare_action(batch)
|
||||
|
||||
noise = self.model.sample_noise(actions.shape, actions.device)
|
||||
time = self.model.sample_time(actions.shape[0], actions.device)
|
||||
|
||||
# Compute loss
|
||||
losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
|
||||
losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions)
|
||||
|
||||
# Truncate losses to actual action dimensions
|
||||
original_action_dim = self.config.output_features[ACTION].shape[0]
|
||||
|
||||
@@ -728,8 +728,14 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
|
||||
|
||||
return embs, pad_masks, att_masks, adarms_cond
|
||||
|
||||
def forward(self, images, img_masks, tokens, masks, actions, noise, time) -> Tensor:
|
||||
def forward(self, images, img_masks, tokens, masks, actions, noise=None, time=None) -> Tensor:
|
||||
"""Do a full training forward pass and compute the loss."""
|
||||
if noise is None:
|
||||
noise = self.sample_noise(actions.shape, actions.device)
|
||||
|
||||
if time is None:
|
||||
time = self.sample_time(actions.shape[0], actions.device)
|
||||
|
||||
time_expanded = time[:, None, None]
|
||||
x_t = time_expanded * noise + (1 - time_expanded) * actions
|
||||
u_t = noise - actions
|
||||
@@ -1256,11 +1262,8 @@ class PI05Policy(PreTrainedPolicy):
|
||||
|
||||
actions = self.prepare_action(batch)
|
||||
|
||||
noise = self.model.sample_noise(actions.shape, actions.device)
|
||||
time = self.model.sample_time(actions.shape[0], actions.device)
|
||||
|
||||
# Compute loss (no separate state needed for PI05)
|
||||
losses = self.model.forward(images, img_masks, tokens, masks, actions, noise, time)
|
||||
losses = self.model.forward(images, img_masks, tokens, masks, actions)
|
||||
|
||||
# Truncate losses to actual action dimensions
|
||||
original_action_dim = self.config.output_features[ACTION].shape[0]
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
import builtins
|
||||
import logging
|
||||
import math
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
|
||||
@@ -226,7 +227,6 @@ class PI0FastPaliGemma(nn.Module):
|
||||
# forward(..., adarms_cond=...) is supported (same as pi0/pi05).
|
||||
if use_adarms[0]:
|
||||
text_config = self.paligemma.config.text_config
|
||||
del self.paligemma.model.language_model
|
||||
self.paligemma.model.language_model = PiGemmaModel(text_config)
|
||||
|
||||
self.to_bfloat16_for_selected_params(precision)
|
||||
@@ -260,15 +260,13 @@ class PI0FastPaliGemma(nn.Module):
|
||||
if image.dtype != torch.float32:
|
||||
image = image.to(torch.float32)
|
||||
image_outputs = self.paligemma.model.get_image_features(image)
|
||||
features = image_outputs.pooler_output
|
||||
norm = 2048**0.5
|
||||
features = features / norm * norm
|
||||
features = image_outputs.pooler_output * self.paligemma.config.text_config.hidden_size**0.5
|
||||
if features.dtype != out_dtype:
|
||||
features = features.to(out_dtype)
|
||||
return features
|
||||
|
||||
def embed_language_tokens(self, tokens: torch.Tensor):
|
||||
return self.paligemma.model.language_model.get_input_embeddings()(tokens)
|
||||
return self.paligemma.model.language_model.embed_tokens(tokens)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -418,7 +416,8 @@ class PI0FastPytorch(nn.Module): # see openpi `PI0Pytorch`
|
||||
# Process language instruction tokens
|
||||
def lang_embed_func(tokens):
|
||||
lang_emb = self.paligemma_with_expert.embed_language_tokens(tokens)
|
||||
return lang_emb
|
||||
lang_emb_dim = lang_emb.shape[-1]
|
||||
return lang_emb * math.sqrt(lang_emb_dim)
|
||||
|
||||
lang_emb = self._apply_checkpoint(lang_embed_func, tokens)
|
||||
embs.append(lang_emb)
|
||||
@@ -432,7 +431,8 @@ class PI0FastPytorch(nn.Module): # see openpi `PI0Pytorch`
|
||||
|
||||
def fast_action_embed_func(fast_action_tokens):
|
||||
fast_emb = self.paligemma_with_expert.embed_language_tokens(fast_action_tokens)
|
||||
return fast_emb
|
||||
fast_emb_dim = fast_emb.shape[-1]
|
||||
return fast_emb * math.sqrt(fast_emb_dim)
|
||||
|
||||
fast_action_emb = self._apply_checkpoint(fast_action_embed_func, fast_action_tokens)
|
||||
embs.append(fast_action_emb)
|
||||
@@ -665,6 +665,7 @@ class PI0FastPytorch(nn.Module): # see openpi `PI0Pytorch`
|
||||
if t < max_decoding_steps - 1:
|
||||
# embed the newly generated token
|
||||
next_token_emb = self.paligemma_with_expert.embed_language_tokens(next_token)
|
||||
next_token_emb = next_token_emb * math.sqrt(next_token_emb.shape[-1])
|
||||
if prefix_embs.dtype == torch.bfloat16:
|
||||
next_token_emb = next_token_emb.to(dtype=torch.bfloat16)
|
||||
|
||||
@@ -769,6 +770,7 @@ class PI0FastPytorch(nn.Module): # see openpi `PI0Pytorch`
|
||||
# Embed the single previous token
|
||||
# We use embed_language_tokens directly to avoid overhead of full prefix embedding
|
||||
next_token_emb = self.paligemma_with_expert.embed_language_tokens(next_token)
|
||||
next_token_emb = next_token_emb * math.sqrt(next_token_emb.shape[-1])
|
||||
if prefix_embs.dtype == torch.bfloat16:
|
||||
next_token_emb = next_token_emb.to(dtype=torch.bfloat16)
|
||||
|
||||
|
||||
@@ -197,9 +197,6 @@ class PiGemmaModel(GemmaModel): # type: ignore[misc]
|
||||
|
||||
def __init__(self, config: GemmaConfig, **kwargs):
|
||||
super().__init__(config, **kwargs)
|
||||
# Free parent-allocated layers/norm before replacing to avoid ~2x peak memory.
|
||||
del self.layers
|
||||
del self.norm
|
||||
# if not getattr(config, "use_adarms", False):
|
||||
# return
|
||||
cond_dim = getattr(config, "adarms_cond_dim", None)
|
||||
@@ -331,7 +328,6 @@ class PiGemmaForCausalLM(GemmaForCausalLM): # type: ignore[misc]
|
||||
|
||||
def __init__(self, config: GemmaConfig, **kwargs):
|
||||
super().__init__(config, **kwargs)
|
||||
del self.model
|
||||
self.model = PiGemmaModel(config)
|
||||
|
||||
|
||||
@@ -340,7 +336,6 @@ class PaliGemmaModelWithPiGemma(PaliGemmaModel):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
del self.language_model
|
||||
self.language_model = PiGemmaModel(config.text_config)
|
||||
|
||||
|
||||
@@ -349,7 +344,6 @@ class PaliGemmaForConditionalGenerationWithPiGemma(PaliGemmaForConditionalGenera
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
del self.model
|
||||
self.model = PaliGemmaModelWithPiGemma(config)
|
||||
|
||||
# Make modules available through conditional class for BC
|
||||
|
||||
@@ -19,7 +19,6 @@ from .action_queue import ActionQueue
|
||||
from .configuration_rtc import RTCConfig
|
||||
from .latency_tracker import LatencyTracker
|
||||
from .modeling_rtc import RTCProcessor
|
||||
from .relative import reanchor_relative_rtc_prefix
|
||||
|
||||
__all__ = [
|
||||
"ActionInterpolator",
|
||||
@@ -27,5 +26,4 @@ __all__ = [
|
||||
"LatencyTracker",
|
||||
"RTCConfig",
|
||||
"RTCProcessor",
|
||||
"reanchor_relative_rtc_prefix",
|
||||
]
|
||||
|
||||
@@ -35,7 +35,7 @@ class RTCConfig:
|
||||
"""
|
||||
|
||||
# Infrastructure
|
||||
enabled: bool = True
|
||||
enabled: bool = False
|
||||
|
||||
# Core RTC settings
|
||||
# Todo change to exp
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Relative-action helpers for Real-Time Chunking (RTC)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
|
||||
from lerobot.processor import (
|
||||
NormalizerProcessorStep,
|
||||
RelativeActionsProcessorStep,
|
||||
TransitionKey,
|
||||
create_transition,
|
||||
to_relative_actions,
|
||||
)
|
||||
|
||||
|
||||
def reanchor_relative_rtc_prefix(
|
||||
prev_actions_absolute: torch.Tensor,
|
||||
current_state: torch.Tensor,
|
||||
relative_step: RelativeActionsProcessorStep,
|
||||
normalizer_step: NormalizerProcessorStep | None,
|
||||
policy_device: torch.device | str,
|
||||
) -> torch.Tensor:
|
||||
"""Convert absolute leftover actions into model-space for relative-action RTC policies.
|
||||
|
||||
When using relative actions, the RTC prefix (previous chunk's unexecuted tail)
|
||||
is stored in absolute coordinates. Before feeding it back to the policy, this
|
||||
helper re-expresses those actions relative to the robot's current joint state
|
||||
and optionally normalizes them so the policy receives correctly scaled inputs.
|
||||
"""
|
||||
state = current_state.detach().cpu()
|
||||
if state.dim() == 1:
|
||||
state = state.unsqueeze(0)
|
||||
|
||||
action_cpu = prev_actions_absolute.detach().cpu()
|
||||
mask = relative_step._build_mask(action_cpu.shape[-1])
|
||||
relative_actions = to_relative_actions(action_cpu, state, mask)
|
||||
|
||||
transition = create_transition(action=relative_actions)
|
||||
if normalizer_step is not None:
|
||||
transition = normalizer_step(transition)
|
||||
|
||||
return transition[TransitionKey.ACTION].to(policy_device)
|
||||
+5
-4
@@ -1,3 +1,5 @@
|
||||
# !/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -13,15 +15,14 @@
|
||||
# limitations under the License.
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from lerobot.configs import NormalizationMode
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.configs import NormalizationMode, PreTrainedConfig
|
||||
from lerobot.optim import AdamWConfig, LRSchedulerConfig, OptimizerConfig
|
||||
from lerobot.utils.constants import OBS_IMAGE
|
||||
|
||||
|
||||
@RewardModelConfig.register_subclass(name="reward_classifier")
|
||||
@PreTrainedConfig.register_subclass(name="reward_classifier")
|
||||
@dataclass
|
||||
class RewardClassifierConfig(RewardModelConfig):
|
||||
class RewardClassifierConfig(PreTrainedConfig):
|
||||
"""Configuration for the Reward Classifier model."""
|
||||
|
||||
name: str = "reward_classifier"
|
||||
+35
-13
@@ -1,3 +1,5 @@
|
||||
# !/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -17,10 +19,11 @@ import logging
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
|
||||
from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig
|
||||
from lerobot.rewards.pretrained import PreTrainedRewardModel
|
||||
from lerobot.utils.constants import OBS_IMAGE, REWARD
|
||||
|
||||
from ...pretrained import PreTrainedPolicy
|
||||
from .configuration_classifier import RewardClassifierConfig
|
||||
|
||||
|
||||
class ClassifierOutput:
|
||||
"""Wrapper for classifier outputs with additional metadata."""
|
||||
@@ -96,7 +99,7 @@ class SpatialLearnedEmbeddings(nn.Module):
|
||||
return output
|
||||
|
||||
|
||||
class Classifier(PreTrainedRewardModel):
|
||||
class Classifier(PreTrainedPolicy):
|
||||
"""Image classifier built on top of a pre-trained encoder."""
|
||||
|
||||
name = "reward_classifier"
|
||||
@@ -232,16 +235,6 @@ class Classifier(PreTrainedRewardModel):
|
||||
|
||||
return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_outputs)
|
||||
|
||||
def compute_reward(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Returns 1.0 for success, 0.0 for failure based on image observations."""
|
||||
images = [batch[key] for key in self.config.input_features if key.startswith(OBS_IMAGE)]
|
||||
output = self.predict(images)
|
||||
|
||||
if self.config.num_classes == 2:
|
||||
return (output.probabilities > 0.5).float()
|
||||
else:
|
||||
return torch.argmax(output.probabilities, dim=1).float()
|
||||
|
||||
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict[str, Tensor]]:
|
||||
"""Standard forward pass for training compatible with train.py."""
|
||||
# Extract images and labels
|
||||
@@ -276,6 +269,10 @@ class Classifier(PreTrainedRewardModel):
|
||||
|
||||
def predict_reward(self, batch, threshold=0.5):
|
||||
"""Eval method. Returns predicted reward with the decision threshold as argument."""
|
||||
# Check for both OBS_IMAGE and OBS_IMAGES prefixes
|
||||
batch = self.normalize_inputs(batch)
|
||||
batch = self.normalize_targets(batch)
|
||||
|
||||
# Extract images from batch dict
|
||||
images = [batch[key] for key in self.config.input_features if key.startswith(OBS_IMAGE)]
|
||||
|
||||
@@ -285,3 +282,28 @@ class Classifier(PreTrainedRewardModel):
|
||||
return (probs > threshold).float()
|
||||
else:
|
||||
return torch.argmax(self.predict(images).probabilities, dim=1)
|
||||
|
||||
def get_optim_params(self):
|
||||
"""Return optimizer parameters for the policy."""
|
||||
return self.parameters()
|
||||
|
||||
def select_action(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""
|
||||
This method is required by PreTrainedPolicy but not used for reward classifiers.
|
||||
The reward classifier is not an actor and does not select actions.
|
||||
"""
|
||||
raise NotImplementedError("Reward classifiers do not select actions")
|
||||
|
||||
def predict_action_chunk(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""
|
||||
This method is required by PreTrainedPolicy but not used for reward classifiers.
|
||||
The reward classifier is not an actor and does not produce action chunks.
|
||||
"""
|
||||
raise NotImplementedError("Reward classifiers do not predict action chunks")
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
This method is required by PreTrainedPolicy but not used for reward classifiers.
|
||||
The reward classifier is not an actor and does not select actions.
|
||||
"""
|
||||
pass
|
||||
+6
-1
@@ -1,3 +1,5 @@
|
||||
# !/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -25,7 +27,8 @@ from lerobot.processor import (
|
||||
policy_action_to_transition,
|
||||
transition_to_policy_action,
|
||||
)
|
||||
from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig
|
||||
|
||||
from .configuration_classifier import RewardClassifierConfig
|
||||
|
||||
|
||||
def make_classifier_processor(
|
||||
@@ -49,6 +52,8 @@ def make_classifier_processor(
|
||||
Args:
|
||||
config: The configuration object for the RewardClassifier.
|
||||
dataset_stats: A dictionary of statistics for normalization.
|
||||
preprocessor_kwargs: Additional arguments for the pre-processor pipeline.
|
||||
postprocessor_kwargs: Additional arguments for the post-processor pipeline.
|
||||
|
||||
Returns:
|
||||
A tuple containing the configured pre-processor and post-processor pipelines.
|
||||
+1
@@ -0,0 +1 @@
|
||||
../../../../docs/source/policy_sarm_README.md
|
||||
@@ -1,4 +1,4 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,6 +14,5 @@
|
||||
|
||||
from .configuration_sarm import SARMConfig
|
||||
from .modeling_sarm import SARMRewardModel
|
||||
from .processor_sarm import make_sarm_pre_post_processors
|
||||
|
||||
__all__ = ["SARMConfig", "SARMRewardModel", "make_sarm_pre_post_processors"]
|
||||
__all__ = ["SARMConfig", "SARMRewardModel"]
|
||||
+9
-8
@@ -25,18 +25,18 @@ need ~num_frames/30 queries instead of one per frame (~30x speedup).
|
||||
|
||||
Usage:
|
||||
# Full RA-BC computation with visualizations
|
||||
python src/lerobot/rewards/sarm/compute_rabc_weights.py \\
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \\
|
||||
--dataset-repo-id lerobot/aloha_sim_insertion_human \\
|
||||
--reward-model-path <USER>/sarm_single_uni4
|
||||
|
||||
# Faster computation with stride (compute every 5 frames, interpolate the rest)
|
||||
python src/lerobot/rewards/sarm/compute_rabc_weights.py \\
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \\
|
||||
--dataset-repo-id lerobot/aloha_sim_insertion_human \\
|
||||
--reward-model-path <USER>/sarm_single_uni4 \\
|
||||
--stride 5
|
||||
|
||||
# Visualize predictions only (no RA-BC computation)
|
||||
python src/lerobot/rewards/sarm/compute_rabc_weights.py \\
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \\
|
||||
--dataset-repo-id lerobot/aloha_sim_insertion_human \\
|
||||
--reward-model-path <USER>/sarm_single_uni4 \\
|
||||
--visualize-only \\
|
||||
@@ -58,9 +58,10 @@ import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from lerobot.datasets import LeRobotDataset
|
||||
from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel
|
||||
from lerobot.rewards.sarm.processor_sarm import make_sarm_pre_post_processors
|
||||
from lerobot.rewards.sarm.sarm_utils import normalize_stage_tau
|
||||
|
||||
from .modeling_sarm import SARMRewardModel
|
||||
from .processor_sarm import make_sarm_pre_post_processors
|
||||
from .sarm_utils import normalize_stage_tau
|
||||
|
||||
|
||||
def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None:
|
||||
@@ -712,12 +713,12 @@ def main():
|
||||
epilog="""
|
||||
Examples:
|
||||
# Full RA-BC computation with visualizations
|
||||
python src/lerobot/rewards/sarm/compute_rabc_weights.py \\
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \\
|
||||
--dataset-repo-id lerobot/aloha_sim_insertion_human \\
|
||||
--reward-model-path <USER>/sarm_single_uni4
|
||||
|
||||
# Visualize predictions only (no RA-BC computation)
|
||||
python src/lerobot/rewards/sarm/compute_rabc_weights.py \\
|
||||
python src/lerobot/policies/sarm/compute_rabc_weights.py \\
|
||||
--dataset-repo-id lerobot/aloha_sim_insertion_human \\
|
||||
--reward-model-path <USER>/sarm_single_uni4 \\
|
||||
--visualize-only \\
|
||||
+6
-4
@@ -1,3 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Qianzhong Chen, Justin Yu, Mac Schwager, Pieter Abbeel, Yide Shentu, Philipp Wu
|
||||
# and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
@@ -20,15 +22,14 @@ Paper: https://arxiv.org/abs/2509.25358
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature, PreTrainedConfig
|
||||
from lerobot.optim import AdamWConfig, CosineDecayWithWarmupSchedulerConfig
|
||||
from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
|
||||
|
||||
|
||||
@RewardModelConfig.register_subclass("sarm")
|
||||
@PreTrainedConfig.register_subclass("sarm")
|
||||
@dataclass
|
||||
class SARMConfig(RewardModelConfig):
|
||||
class SARMConfig(PreTrainedConfig):
|
||||
"""Configuration class for SARM (Stage-Aware Reward Modeling).
|
||||
|
||||
Supports three annotation modes:
|
||||
@@ -109,6 +110,7 @@ class SARMConfig(RewardModelConfig):
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
|
||||
if self.annotation_mode not in ["single_stage", "dense_only", "dual"]:
|
||||
raise ValueError(
|
||||
f"annotation_mode must be 'single_stage', 'dense_only', or 'dual', got {self.annotation_mode}"
|
||||
+17
-23
@@ -1,3 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 Qianzhong Chen, Justin Yu, Mac Schwager, Pieter Abbeel, Yide Shentu, Philipp Wu
|
||||
# and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
@@ -32,13 +34,14 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
from torch import Tensor
|
||||
|
||||
from lerobot.rewards.pretrained import PreTrainedRewardModel
|
||||
from lerobot.rewards.sarm.configuration_sarm import SARMConfig
|
||||
from lerobot.rewards.sarm.sarm_utils import (
|
||||
from lerobot.utils.constants import OBS_STR
|
||||
|
||||
from ..pretrained import PreTrainedPolicy
|
||||
from .configuration_sarm import SARMConfig
|
||||
from .sarm_utils import (
|
||||
normalize_stage_tau,
|
||||
pad_state_to_max_dim,
|
||||
)
|
||||
from lerobot.utils.constants import OBS_STR
|
||||
|
||||
|
||||
class StageTransformer(nn.Module):
|
||||
@@ -350,7 +353,7 @@ def gen_stage_emb(num_classes: int, targets: torch.Tensor) -> torch.Tensor:
|
||||
return stage_onehot
|
||||
|
||||
|
||||
class SARMRewardModel(PreTrainedRewardModel):
|
||||
class SARMRewardModel(PreTrainedPolicy):
|
||||
"""
|
||||
SARM Reward Model for stage-aware task completion rewards.
|
||||
|
||||
@@ -468,23 +471,6 @@ class SARMRewardModel(PreTrainedRewardModel):
|
||||
self.subtask_model.to(device)
|
||||
return self
|
||||
|
||||
def compute_reward(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Compute dense progress reward in [0, 1] from batch.
|
||||
|
||||
Expects batch to contain:
|
||||
- "observation_features" or video embeddings: (B, T, 512)
|
||||
- "language_embedding" or text embeddings: (B, 512)
|
||||
- optionally "observation.state": (B, T, state_dim)
|
||||
"""
|
||||
text_emb = batch.get("language_embedding", batch.get("text_features"))
|
||||
video_emb = batch.get("observation_features", batch.get("video_features"))
|
||||
state = batch.get("observation.state", batch.get("state_features"))
|
||||
|
||||
rewards = self.calculate_rewards(text_emb, video_emb, state)
|
||||
if isinstance(rewards, np.ndarray):
|
||||
rewards = torch.from_numpy(rewards).float()
|
||||
return rewards
|
||||
|
||||
@torch.no_grad()
|
||||
def calculate_rewards(
|
||||
self,
|
||||
@@ -645,9 +631,17 @@ class SARMRewardModel(PreTrainedRewardModel):
|
||||
return self.parameters()
|
||||
|
||||
def reset(self):
|
||||
"""SARM has no episode-level state to reset."""
|
||||
"""Required by PreTrainedPolicy but not used for reward models."""
|
||||
pass
|
||||
|
||||
def predict_action_chunk(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Required by PreTrainedPolicy but not used for reward models."""
|
||||
raise NotImplementedError("SARM model does not predict action chunks")
|
||||
|
||||
def select_action(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
"""Required by PreTrainedPolicy but not used for SARM."""
|
||||
raise NotImplementedError("SARM model does not select actions")
|
||||
|
||||
def _train_step(
|
||||
self,
|
||||
img_emb: torch.Tensor, # (B, N, T, D)
|
||||
+9
-18
@@ -1,3 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -58,15 +60,16 @@ from lerobot.processor import (
|
||||
policy_action_to_transition,
|
||||
transition_to_policy_action,
|
||||
)
|
||||
from lerobot.rewards.sarm.configuration_sarm import SARMConfig
|
||||
from lerobot.rewards.sarm.sarm_utils import (
|
||||
from lerobot.types import EnvTransition, PolicyAction, TransitionKey
|
||||
from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
|
||||
|
||||
from .configuration_sarm import SARMConfig
|
||||
from .sarm_utils import (
|
||||
apply_rewind_augmentation,
|
||||
compute_absolute_indices,
|
||||
find_stage_and_tau,
|
||||
pad_state_to_max_dim,
|
||||
)
|
||||
from lerobot.types import EnvTransition, PolicyAction, TransitionKey
|
||||
from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME
|
||||
|
||||
|
||||
class SARMEncodingProcessorStep(ProcessorStep):
|
||||
@@ -452,13 +455,7 @@ class SARMEncodingProcessorStep(ProcessorStep):
|
||||
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
||||
|
||||
# Get image embeddings
|
||||
# transformers 5.x returns BaseModelOutputWithPooling instead of a plain tensor
|
||||
output = self.clip_model.get_image_features(**inputs)
|
||||
if not isinstance(output, torch.Tensor):
|
||||
output = output.pooler_output
|
||||
if output is None:
|
||||
raise ValueError("pooler_output should not be None for CLIP models.")
|
||||
embeddings = output.detach().cpu()
|
||||
embeddings = self.clip_model.get_image_features(**inputs).detach().cpu()
|
||||
|
||||
# Handle single frame case
|
||||
if embeddings.dim() == 1:
|
||||
@@ -485,13 +482,7 @@ class SARMEncodingProcessorStep(ProcessorStep):
|
||||
inputs = self.clip_processor.tokenizer([text], return_tensors="pt", padding=True, truncation=True)
|
||||
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
||||
|
||||
# transformers 5.x returns BaseModelOutputWithPooling instead of a plain tensor
|
||||
output = self.clip_model.get_text_features(**inputs)
|
||||
if not isinstance(output, torch.Tensor):
|
||||
output = output.pooler_output
|
||||
if output is None:
|
||||
raise ValueError("pooler_output should not be None for CLIP models.")
|
||||
text_embedding = output.detach().cpu()
|
||||
text_embedding = self.clip_model.get_text_features(**inputs).detach().cpu()
|
||||
text_embedding = text_embedding.expand(batch_size, -1)
|
||||
|
||||
return text_embedding
|
||||
@@ -1,3 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -394,21 +394,13 @@ class SmolVLAPolicy(PreTrainedPolicy):
|
||||
loss_dict["losses_after_rm_padding"] = losses.clone().mean().item()
|
||||
|
||||
if reduction == "none":
|
||||
# Return per-sample losses (B,) by averaging over valid (time, action) entries
|
||||
if actions_is_pad is None:
|
||||
per_sample_loss = losses.mean(dim=(1, 2))
|
||||
else:
|
||||
num_valid = ((~actions_is_pad).sum(dim=1) * losses.shape[-1]).clamp_min(1)
|
||||
per_sample_loss = losses.sum(dim=(1, 2)) / num_valid
|
||||
# Return per-sample losses (B,) by averaging over time and action dims
|
||||
per_sample_loss = losses.mean(dim=(1, 2))
|
||||
loss_dict["loss"] = per_sample_loss.mean().item()
|
||||
return per_sample_loss, loss_dict
|
||||
else:
|
||||
# Default: return scalar mean loss over valid (time, action) entries
|
||||
if actions_is_pad is None:
|
||||
loss = losses.mean()
|
||||
else:
|
||||
num_valid = ((~actions_is_pad).sum() * losses.shape[-1]).clamp_min(1)
|
||||
loss = losses.sum() / num_valid
|
||||
# Default: return scalar mean loss
|
||||
loss = losses.mean()
|
||||
loss_dict["loss"] = loss.item()
|
||||
return loss, loss_dict
|
||||
|
||||
|
||||
@@ -97,8 +97,8 @@ class VQBeTConfig(PreTrainedConfig):
|
||||
vision_backbone: str = "resnet18"
|
||||
crop_shape: tuple[int, int] | None = (84, 84)
|
||||
crop_is_random: bool = True
|
||||
pretrained_backbone_weights: str | None = "ResNet18_Weights.IMAGENET1K_V1"
|
||||
use_group_norm: bool = False
|
||||
pretrained_backbone_weights: str | None = None
|
||||
use_group_norm: bool = True
|
||||
spatial_softmax_num_keypoints: int = 32
|
||||
# VQ-VAE
|
||||
n_vqvae_training_steps: int = 20000
|
||||
|
||||
@@ -22,7 +22,7 @@ from transformers.utils import (
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
is_flash_attn_2_available,
|
||||
is_flash_attn_greater_or_equal,
|
||||
is_flash_attn_greater_or_equal_2_10,
|
||||
is_torchdynamo_compiling,
|
||||
logging,
|
||||
replace_return_docstrings,
|
||||
@@ -890,7 +890,7 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
|
||||
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
|
||||
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
|
||||
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
|
||||
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal("2.1.0")
|
||||
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user