diff --git a/benchmarks/video/benchmark.py b/benchmarks/video/benchmark.py deleted file mode 100644 index d9e5e62bb..000000000 --- a/benchmarks/video/benchmark.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import threading -import time -from contextlib import ContextDecorator - - -class TimeBenchmark(ContextDecorator): - """ - Measures execution time using a context manager or decorator. - - This class supports both context manager and decorator usage, and is thread-safe for multithreaded - environments. - - Args: - print: If True, prints the elapsed time upon exiting the context or completing the function. Defaults - to False. - - Examples: - - Using as a context manager: - - >>> benchmark = TimeBenchmark() - >>> with benchmark: - ... 
time.sleep(1) - >>> print(f"Block took {benchmark.result:.4f} seconds") - Block took approximately 1.0000 seconds - - Using with multithreading: - - ```python - import threading - - benchmark = TimeBenchmark() - - - def context_manager_example(): - with benchmark: - time.sleep(0.01) - print(f"Block took {benchmark.result_ms:.2f} milliseconds") - - - threads = [] - for _ in range(3): - t1 = threading.Thread(target=context_manager_example) - threads.append(t1) - - for t in threads: - t.start() - - for t in threads: - t.join() - ``` - Expected output: - Block took approximately 10.00 milliseconds - Block took approximately 10.00 milliseconds - Block took approximately 10.00 milliseconds - """ - - def __init__(self, print=False): - self.local = threading.local() - self.print_time = print - - def __enter__(self): - self.local.start_time = time.perf_counter() - return self - - def __exit__(self, *exc): - self.local.end_time = time.perf_counter() - self.local.elapsed_time = self.local.end_time - self.local.start_time - if self.print_time: - print(f"Elapsed time: {self.local.elapsed_time:.4f} seconds") - return False - - @property - def result(self): - return getattr(self.local, "elapsed_time", None) - - @property - def result_ms(self): - return self.result * 1e3 diff --git a/benchmarks/video/capture_camera_feed.py b/benchmarks/video/capture_camera_feed.py deleted file mode 100755 index 8f8530532..000000000 --- a/benchmarks/video/capture_camera_feed.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Capture video feed from a camera as raw images.""" - -import argparse -import datetime as dt -import os -import time -from pathlib import Path - -import cv2 -import rerun as rr - -# see https://rerun.io/docs/howto/visualization/limit-ram -RERUN_MEMORY_LIMIT = os.getenv("LEROBOT_RERUN_MEMORY_LIMIT", "5%") - - -def display_and_save_video_stream(output_dir: Path, fps: int, width: int, height: int, duration: int): - rr.init("lerobot_capture_camera_feed") - rr.spawn(memory_limit=RERUN_MEMORY_LIMIT) - - now = dt.datetime.now() - capture_dir = output_dir / f"{now:%Y-%m-%d}" / f"{now:%H-%M-%S}" - if not capture_dir.exists(): - capture_dir.mkdir(parents=True, exist_ok=True) - - # Opens the default webcam - cap = cv2.VideoCapture(0) - if not cap.isOpened(): - print("Error: Could not open video stream.") - return - - cap.set(cv2.CAP_PROP_FPS, fps) - cap.set(cv2.CAP_PROP_FRAME_WIDTH, width) - cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height) - - frame_index = 0 - start_time = time.time() - while time.time() - start_time < duration: - ret, frame = cap.read() - - if not ret: - print("Error: Could not read frame.") - break - rr.log("video/stream", rr.Image(frame), static=True) - cv2.imwrite(str(capture_dir / f"frame_{frame_index:06d}.png"), frame) - frame_index += 1 - - # Release the capture - cap.release() - - # TODO(Steven): Add a graceful shutdown via a close() method for the Viewer context, though not currently supported in the Rerun API. 
- - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--output-dir", - type=Path, - default=Path("outputs/cam_capture/"), - help="Directory where the capture images are written. A subfolder named with the current date & time will be created inside it for each capture.", - ) - parser.add_argument( - "--fps", - type=int, - default=30, - help="Frames Per Second of the capture.", - ) - parser.add_argument( - "--width", - type=int, - default=1280, - help="Width of the captured images.", - ) - parser.add_argument( - "--height", - type=int, - default=720, - help="Height of the captured images.", - ) - parser.add_argument( - "--duration", - type=int, - default=20, - help="Duration in seconds for which the video stream should be captured.", - ) - args = parser.parse_args() - display_and_save_video_stream(**vars(args)) diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py index 9f34b2273..064a84b48 100644 --- a/benchmarks/video/run_video_benchmark.py +++ b/benchmarks/video/run_video_benchmark.py @@ -21,11 +21,13 @@ See the provided README.md or run `python benchmark/video/run_video_benchmark.py import argparse import datetime as dt +import itertools import random import shutil from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path +from threading import Lock import einops import numpy as np @@ -35,13 +37,13 @@ import torch from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity from tqdm import tqdm -from benchmarks.video.benchmark import TimeBenchmark from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.video_utils import ( - decode_video_frames_torchvision, + decode_video_frames, encode_video_frames, ) from lerobot.utils.constants import OBS_IMAGE +from lerobot.utils.utils import TimerManager BASE_ENCODING = OrderedDict( [ @@ -86,7 +88,7 @@ def 
load_original_frames(imgs_dir: Path, timestamps: list[float], fps: int) -> t frames = [] for ts in timestamps: idx = int(ts * fps) - frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png") + frame = PIL.Image.open(imgs_dir / f"frame-{idx:06d}.png") frame = torch.from_numpy(np.array(frame)) frame = frame.type(torch.float32) / 255 frame = einops.rearrange(frame, "h w c -> c h w") @@ -97,21 +99,21 @@ def load_original_frames(imgs_dir: Path, timestamps: list[float], fps: int) -> t def save_decoded_frames( imgs_dir: Path, save_dir: Path, frames: torch.Tensor, timestamps: list[float], fps: int ) -> None: - if save_dir.exists() and len(list(save_dir.glob("frame_*.png"))) == len(timestamps): + if save_dir.exists() and len(list(save_dir.glob("frame-*.png"))) == len(timestamps): return save_dir.mkdir(parents=True, exist_ok=True) for i, ts in enumerate(timestamps): idx = int(ts * fps) frame_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy() - PIL.Image.fromarray(frame_hwc).save(save_dir / f"frame_{idx:06d}_decoded.png") - shutil.copyfile(imgs_dir / f"frame_{idx:06d}.png", save_dir / f"frame_{idx:06d}_original.png") + PIL.Image.fromarray(frame_hwc).save(save_dir / f"frame-{idx:06d}_decoded.png") + shutil.copyfile(imgs_dir / f"frame-{idx:06d}.png", save_dir / f"frame-{idx:06d}_original.png") def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None: episode_index = 0 ep_num_images = dataset.meta.episodes["length"][episode_index] - if imgs_dir.exists() and len(list(imgs_dir.glob("frame_*.png"))) == ep_num_images: + if imgs_dir.exists() and len(list(imgs_dir.glob("frame-*.png"))) == ep_num_images: return imgs_dir.mkdir(parents=True, exist_ok=True) @@ -125,7 +127,7 @@ def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None: tqdm(imgs_dataset, desc=f"saving {dataset.repo_id} first episode images", leave=False) ): img = item[img_keys[0]] - img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100) + img.save(str(imgs_dir / 
f"frame-{i:06d}.png"), quality=100) if i >= ep_num_images - 1: break @@ -149,18 +151,6 @@ def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> lis return [idx / fps for idx in frame_indexes] -def decode_video_frames( - video_path: str, - timestamps: list[float], - tolerance_s: float, - backend: str, -) -> torch.Tensor: - if backend in ["pyav", "video_reader"]: - return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend) - else: - raise NotImplementedError(backend) - - def benchmark_decoding( imgs_dir: Path, video_path: Path, @@ -172,8 +162,8 @@ def benchmark_decoding( num_workers: int = 4, save_frames: bool = False, ) -> dict: - def process_sample(sample: int): - time_benchmark = TimeBenchmark() + def process_sample(sample: int, lock: Lock): + time_benchmark = TimerManager(log=False) timestamps = sample_timestamps(timestamps_mode, ep_num_images, fps) num_frames = len(timestamps) result = { @@ -182,13 +172,13 @@ def benchmark_decoding( "mse_values": [], } - with time_benchmark: + with time_benchmark, lock: frames = decode_video_frames(video_path, timestamps=timestamps, tolerance_s=5e-1, backend=backend) - result["load_time_video_ms"] = time_benchmark.result_ms / num_frames + result["load_time_video_ms"] = (time_benchmark.last * 1000) / num_frames with time_benchmark: original_frames = load_original_frames(imgs_dir, timestamps, fps) - result["load_time_images_ms"] = time_benchmark.result_ms / num_frames + result["load_time_images_ms"] = (time_benchmark.last * 1000) / num_frames frames_np, original_frames_np = frames.numpy(), original_frames.numpy() for i in range(num_frames): @@ -215,8 +205,10 @@ def benchmark_decoding( # A sample is a single set of decoded frames specified by timestamps_mode (e.g. a single frame, 2 frames, etc.). # For each sample, we record metrics (loading time and quality metrics) which are then averaged over all samples. 
# As these samples are independent, we run them in parallel threads to speed up the benchmark. + # Use a single shared lock for all worker threads + shared_lock = Lock() with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [executor.submit(process_sample, i) for i in range(num_samples)] + futures = [executor.submit(process_sample, i, shared_lock) for i in range(num_samples)] for future in tqdm(as_completed(futures), total=num_samples, desc="samples", leave=False): result = future.result() load_times_video_ms.append(result["load_time_video_ms"]) @@ -358,24 +350,27 @@ def main( imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_") # We only use the first episode save_first_episode(imgs_dir, dataset) - for key, values in tqdm(encoding_benchmarks.items(), desc="encodings (g, crf)", leave=False): - for value in tqdm(values, desc=f"encodings ({key})", leave=False): - encoding_cfg = BASE_ENCODING.copy() - encoding_cfg["vcodec"] = video_codec - encoding_cfg["pix_fmt"] = pixel_format + for duet in [ + dict(zip(encoding_benchmarks.keys(), unique_combination, strict=False)) + for unique_combination in itertools.product(*encoding_benchmarks.values()) + ]: + encoding_cfg = BASE_ENCODING.copy() + encoding_cfg["vcodec"] = video_codec + encoding_cfg["pix_fmt"] = pixel_format + for key, value in duet.items(): encoding_cfg[key] = value - args_path = Path("_".join(str(value) for value in encoding_cfg.values())) - video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4" - benchmark_table += benchmark_encoding_decoding( - dataset, - video_path, - imgs_dir, - encoding_cfg, - decoding_benchmarks, - num_samples, - num_workers, - save_frames, - ) + args_path = Path("_".join(str(value) for value in encoding_cfg.values())) + video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4" + benchmark_table += benchmark_encoding_decoding( + dataset, + video_path, + imgs_dir, + encoding_cfg, + 
decoding_benchmarks, + num_samples, + num_workers, + save_frames, + ) # Save intermediate results benchmark_df = pd.DataFrame(benchmark_table, columns=headers) @@ -409,9 +404,9 @@ if __name__ == "__main__": nargs="*", default=[ "lerobot/pusht_image", - "aliberts/aloha_mobile_shrimp_image", - "aliberts/paris_street", - "aliberts/kitchen", + "lerobot/aloha_mobile_shrimp_image", + "lerobot/paris_street", + "lerobot/kitchen", ], help="Datasets repo-ids to test against. First episodes only are used. Must be images.", ) @@ -419,7 +414,7 @@ if __name__ == "__main__": "--vcodec", type=str, nargs="*", - default=["libx264", "hevc", "libsvtav1"], + default=["h264", "hevc", "libsvtav1"], help="Video codecs to be tested", ) parser.add_argument( @@ -468,7 +463,7 @@ if __name__ == "__main__": "--backends", type=str, nargs="*", - default=["pyav", "video_reader"], + default=["torchcodec", "pyav"], help="Torchvision decoding backend to be tested.", ) parser.add_argument( diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 0cf8aa9a6..2f9715ce1 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -15,8 +15,6 @@ title: Train a Robot with RL - local: hilserl_sim title: Train RL in Simulation - - local: async - title: Use Async Inference - local: multi_gpu_training title: Multi GPU training title: "Tutorials" @@ -40,11 +38,17 @@ - local: groot title: NVIDIA GR00T N1.5 title: "Policies" +- sections: + - local: async + title: Use Async Inference + - local: rtc + title: Real-Time Chunking (RTC) + title: "Inference" - sections: - local: envhub title: Environments from the Hub - - local: il_sim - title: Imitation Learning in Sim + - local: envhub_leisaac + title: Control & Train Robots in Sim (LeIsaac) - local: libero title: Using Libero - local: metaworld @@ -59,6 +63,8 @@ title: Implement your own processor - local: processors_robots_teleop title: Processors for Robots and Teleoperators + - local: env_processor + title: Environment Processors title: 
"Robot Processors" - sections: - local: so101 diff --git a/docs/source/async.mdx b/docs/source/async.mdx index be10f8baf..e3a11609c 100644 --- a/docs/source/async.mdx +++ b/docs/source/async.mdx @@ -196,7 +196,7 @@ client_cfg = RobotClientConfig( server_address="localhost:8080", policy_device="mps", policy_type="smolvla", - pretrained_name_or_path="fracapuano/smolvla_async", + pretrained_name_or_path="/smolvla_async", chunk_size_threshold=0.5, actions_per_chunk=50, # make sure this is less than the max actions of the policy ) diff --git a/docs/source/env_processor.mdx b/docs/source/env_processor.mdx new file mode 100644 index 000000000..8dbf315c7 --- /dev/null +++ b/docs/source/env_processor.mdx @@ -0,0 +1,418 @@ +# Environment Processors + +Environment processors are a critical layer in LeRobot's data processing architecture that handle **environment-specific** transformations, separate from policy-specific processing. This separation of concerns enables cleaner code, better modularity, and easier experimentation with different environments and policies. + +## Why Environment Processors? + +When working with different robot environments (LIBERO, MetaWorld, Aloha, etc.), each environment often has unique data formats, coordinate systems, and conventions that need standardization **before** policy processing. Without environment processors, these transformations would be: + +1. **Hardcoded in environment code** - Making it difficult to experiment with different state representations +2. **Duplicated across policies** - Each policy would need to handle environment-specific quirks +3. **Mixed with policy logic** - Violating separation of concerns and making debugging harder + +Environment processors solve this by providing a **dedicated processing layer** between raw environment observations and policy inputs. 
+ +## The Processing Pipeline + +Here's how data flows through the complete processing pipeline during evaluation: + +```python +# In lerobot_eval.py rollout() function: + +# 1. Raw environment observation (numpy arrays, various formats) +raw_observation = env.step(action) + +# 2. Convert numpy to torch, normalize images [0,1] +observation = preprocess_observation(raw_observation) + +# 3. Add task metadata (for multi-task environments) +observation = add_envs_task(env, observation) + +# 4. ENVIRONMENT-SPECIFIC preprocessing (NEW!) +# - Flatten robot states +# - Rotate images to match dataset conventions +# - Handle environment-specific coordinate systems +observation = env_preprocessor(observation) + +# 5. POLICY-SPECIFIC preprocessing +# - Normalize with dataset statistics +# - Add batch dimensions +# - Move to GPU +# - Tokenize language instructions +observation = preprocessor(observation) + +# 6. Policy inference +action = policy.select_action(observation) + +# 7. POLICY-SPECIFIC postprocessing +# - Unnormalize actions +# - Remove batch dimensions +action = postprocessor(action) + +# 8. ENVIRONMENT-SPECIFIC postprocessing (NEW!) +# - Convert action formats if needed +# - Apply environment-specific constraints +action_transition = {"action": action} +action_transition = env_postprocessor(action_transition) +action = action_transition["action"] + +# 9. Execute in environment +env.step(action) +``` + +## The Benefits + +### 1. **Separation of Concerns** + +Environment processors handle transformations specific to the **environment's data format**, while policy processors handle transformations specific to the **model's requirements**. + +```python +# ❌ Before: Mixed concerns +class LiberoVLAPolicy: + def preprocess(self, obs): + # Environment-specific: Flatten robot state (shouldn't be in policy!) 
+ state = self._flatten_robot_state(obs["robot_state"]) + # Policy-specific: Normalize with dataset stats + state = self.normalizer(state) + return state + +# ✅ After: Clear separation +# Environment processor: Handles LIBERO's nested robot state +env_preprocessor = LiberoProcessorStep() # Flattens robot_state + +# Policy processor: Handles model requirements +policy_preprocessor = NormalizerProcessorStep(stats=dataset_stats) +``` + +### 2. **Flexibility and Reusability** + +The same policy can work with different environment processors, and the same environment processor can work with different policies: + +```python +# Use SmolVLA policy with LIBERO environment +libero_preprocessor, libero_postprocessor = make_env_pre_post_processors(libero_cfg) +smolvla_preprocessor, smolvla_postprocessor = make_pre_post_processors(smolvla_cfg) + +# Or use ACT policy with the same LIBERO environment +libero_preprocessor, libero_postprocessor = make_env_pre_post_processors(libero_cfg) +act_preprocessor, act_postprocessor = make_pre_post_processors(act_cfg) +``` + +### 3. **Easier Experimentation** + +Want to try different state representations for LIBERO? 
Just create a new processor: + +```python +# Original: 8D state (pos + quat→axisangle + gripper) +@ProcessorStepRegistry.register("libero_processor") +class LiberoProcessorStep(ObservationProcessorStep): + def _process_observation(self, obs): + eef_pos = robot_state["eef"]["pos"] # 3D + eef_axisangle = quat2axisangle(quat) # 3D + gripper = robot_state["gripper"]["qpos"] # 2D + state = torch.cat([eef_pos, eef_axisangle, gripper], dim=-1) # 8D + return state + +# Experiment: Add velocity for better control +@ProcessorStepRegistry.register("libero_velocity_processor") +class LiberoVelocityProcessorStep(ObservationProcessorStep): + def _process_observation(self, obs): + # Include velocities for 14D state + eef_pos = robot_state["eef"]["pos"] # 3D + eef_axisangle = quat2axisangle(quat) # 3D + eef_vel = robot_state["eef"]["vel"] # 3D (NEW) + gripper_pos = robot_state["gripper"]["qpos"] # 2D + gripper_vel = robot_state["gripper"]["qvel"] # 3D (NEW) + state = torch.cat([eef_pos, eef_axisangle, eef_vel, + gripper_pos, gripper_vel], dim=-1) # 14D + return state +``` + +### 4. 
**Cleaner Environment Code** + +Environments expose **all available data** without needing to know what downstream models will use: + +```python +# LIBERO environment exposes full robot state +observation = { + "pixels": {"image": img, "image2": img2}, + "robot_state": { + "eef": {"pos": ..., "quat": ..., "vel": ..., "mat": ..., "axisangle": ...}, + "gripper": {"qpos": ..., "qvel": ...}, + "joints": {"pos": ..., "vel": ...} + } +} + +# Environment processor decides what to use +# Policy processor handles model-specific transformations +``` + +## Using Environment Processors + +### Factory Function + +The `make_env_pre_post_processors` function follows the same pattern as `make_pre_post_processors` for policies: + +```python +from lerobot.envs.factory import make_env_pre_post_processors +from lerobot.envs.configs import LiberoEnv, PushtEnv + +# For LIBERO: Returns LiberoProcessorStep in preprocessor +libero_cfg = LiberoEnv(task="libero_spatial", camera_name=["agentview"]) +env_preprocessor, env_postprocessor = make_env_pre_post_processors(libero_cfg) + +# For other environments: Returns identity processors (no-op) +pusht_cfg = PushtEnv() +env_preprocessor, env_postprocessor = make_env_pre_post_processors(pusht_cfg) +``` + +### Implementation in `envs/factory.py` + +```python +def make_env_pre_post_processors( + env_cfg: EnvConfig, +) -> tuple[ + PolicyProcessorPipeline[dict[str, Any], dict[str, Any]], + PolicyProcessorPipeline[dict[str, Any], dict[str, Any]], +]: + """ + Create preprocessor and postprocessor pipelines for environment observations. + + Args: + env_cfg: The configuration of the environment. 
+ + Returns: + A tuple containing: + - preprocessor: Pipeline that processes environment observations + - postprocessor: Pipeline that processes environment outputs + """ + # For LIBERO environments, add the LiberoProcessorStep to preprocessor + if isinstance(env_cfg, LiberoEnv) or "libero" in env_cfg.type: + preprocessor = PolicyProcessorPipeline(steps=[LiberoProcessorStep()]) + else: + # For all other environments, return an identity preprocessor + preprocessor = PolicyProcessorPipeline(steps=[]) + + # Postprocessor is currently identity for all environments + # Future: Could add environment-specific action transformations + postprocessor = PolicyProcessorPipeline(steps=[]) + + return preprocessor, postprocessor +``` + +### Integration in Evaluation + +In `lerobot_eval.py`, the environment processors are created once and used throughout: + +```python +def eval_main(cfg: EvalPipelineConfig): + # Create environment + envs = make_env(cfg.env, n_envs=cfg.eval.batch_size) + + # Create policy + policy = make_policy(cfg=cfg.policy, env_cfg=cfg.env) + + # Create policy processors + preprocessor, postprocessor = make_pre_post_processors( + policy_cfg=cfg.policy, + pretrained_path=cfg.policy.pretrained_path, + ) + + # Create environment processors (NEW!) 
+ env_preprocessor, env_postprocessor = make_env_pre_post_processors(env_cfg=cfg.env) + + # Run evaluation with both processor types + eval_policy_all( + envs=envs, + policy=policy, + env_preprocessor=env_preprocessor, # Environment-specific + env_postprocessor=env_postprocessor, # Environment-specific + preprocessor=preprocessor, # Policy-specific + postprocessor=postprocessor, # Policy-specific + n_episodes=cfg.eval.n_episodes, + ) +``` + +## Example: LIBERO Environment Processor + +The `LiberoProcessorStep` demonstrates a real-world environment processor: + +```python +from lerobot.processor.pipeline import ObservationProcessorStep + +@dataclass +@ProcessorStepRegistry.register(name="libero_processor") +class LiberoProcessorStep(ObservationProcessorStep): + """ + Processes LIBERO observations into the LeRobot format. + + **State Processing:** + - Extracts end-effector position (3D) + - Converts quaternion to axis-angle representation (3D) + - Extracts gripper joint positions (2D) + - Concatenates into 8D state vector + + **Image Processing:** + - Rotates images 180° to match HuggingFaceVLA/libero convention + """ + + def _process_observation(self, observation): + processed_obs = observation.copy() + + # Process images: Flip 180° for camera convention + for key in list(processed_obs.keys()): + if key.startswith("observation.images."): + img = processed_obs[key] + img = torch.flip(img, dims=[2, 3]) # Flip H and W + processed_obs[key] = img + + # Process robot_state: Flatten to 8D vector + if "observation.robot_state" in processed_obs: + robot_state = processed_obs.pop("observation.robot_state") + + eef_pos = robot_state["eef"]["pos"] # (B, 3) + eef_quat = robot_state["eef"]["quat"] # (B, 4) + gripper_qpos = robot_state["gripper"]["qpos"] # (B, 2) + + # Convert quaternion to axis-angle + eef_axisangle = self._quat2axisangle(eef_quat) # (B, 3) + + # Concatenate into single state vector + state = torch.cat((eef_pos, eef_axisangle, gripper_qpos), dim=-1) + state = 
state.float() + + processed_obs["observation.state"] = state + + return processed_obs +``` + +### Why These Transformations? + +1. **Image Rotation**: The HuggingFaceVLA/libero dataset has images rotated 180° from the raw LIBERO simulator. The processor handles this convention mismatch so policies trained on the dataset work seamlessly. + +2. **State Flattening**: The raw LIBERO environment exposes nested dictionaries with all available state information (position, quaternion, velocity, matrix representation, etc.). The processor: + - Selects the relevant components (pos, quat, gripper) + - Converts quaternion to axis-angle (more suitable for learning) + - Flattens to a single 8D vector that policies expect + +3. **Flexibility**: The environment still exposes **all** raw data. If you want to try different state representations (e.g., including velocities, using matrix representation instead of axis-angle), you can create a new processor without modifying the environment code. + +## Adding Environment Processors for New Environments + +To add environment processors for a new environment: + +### 1. Create the Processor Step + +```python +# In src/lerobot/processor/env_processor.py + +@dataclass +@ProcessorStepRegistry.register(name="myenv_processor") +class MyEnvProcessorStep(ObservationProcessorStep): + """Process observations from MyEnv.""" + + def _process_observation(self, observation): + processed = observation.copy() + + # Your environment-specific transformations + if "myenv.specific.state" in processed: + state = processed.pop("myenv.specific.state") + # Transform to standard format + processed["observation.state"] = self._transform_state(state) + + return processed +``` + +### 2. 
Update the Factory + +```python +# In src/lerobot/envs/factory.py + +def make_env_pre_post_processors(env_cfg: EnvConfig): + if isinstance(env_cfg, LiberoEnv) or "libero" in env_cfg.type: + preprocessor = PolicyProcessorPipeline(steps=[LiberoProcessorStep()]) + elif isinstance(env_cfg, MyEnvConfig) or "myenv" in env_cfg.type: + preprocessor = PolicyProcessorPipeline(steps=[MyEnvProcessorStep()]) + else: + preprocessor = PolicyProcessorPipeline(steps=[]) + + postprocessor = PolicyProcessorPipeline(steps=[]) + return preprocessor, postprocessor +``` + +### 3. Use in Evaluation + +No changes needed! The evaluation script automatically uses the appropriate processor: + +```bash +lerobot-eval \ + --policy.path=lerobot/my_policy \ + --env.type=myenv \ # Automatically uses MyEnvProcessorStep + --eval.n_episodes=10 +``` + +## Future: Environment Postprocessors + +Currently, postprocessors are identity (no-op) for all environments. Future use cases include: + +### Action Space Transformations + +```python +@dataclass +class MyEnvActionPostprocessor(ProcessorStep): + """Convert policy actions to environment-specific format.""" + + def __call__(self, transition: EnvTransition) -> EnvTransition: + action = transition["action"] + + # Example: Convert from Cartesian to joint space + if self.action_space == "joint": + action = self.ik_solver(action) + + # Example: Apply environment-specific safety limits + action = torch.clamp(action, self.min_action, self.max_action) + + transition["action"] = action + return transition +``` + +### Coordinate System Conversions + +```python +@dataclass +class CoordinateTransformPostprocessor(ProcessorStep): + """Transform actions between coordinate systems.""" + + def __call__(self, transition: EnvTransition) -> EnvTransition: + action = transition["action"] + + # Example: Policy outputs in world frame, env expects base frame + action = self.world_to_base_transform(action) + + transition["action"] = action + return transition +``` + +## Best 
Practices + +1. **Keep environment processors simple**: They should only handle environment-specific data format issues, not complex learning-related transformations. + +2. **Use policy processors for model requirements**: Normalization, batching, device placement, and tokenization belong in policy processors. + +3. **Expose all data from environments**: Let processors decide what to use rather than hardcoding choices in the environment. + +4. **Document conventions**: Clearly document any coordinate system conventions, camera orientations, or data formats that your processor handles. + +5. **Test independently**: Environment processors should be testable without loading full policies or environments. + +## Summary + +Environment processors provide a **clean separation** between environment-specific data transformations and policy-specific model requirements. This architecture: + +- ✅ Enables easy experimentation with different state representations +- ✅ Allows policies to work seamlessly across different environments +- ✅ Keeps environment code focused on simulation/hardware interface +- ✅ Makes processor pipelines more maintainable and debuggable +- ✅ Follows the single responsibility principle + +The key insight: **Environments define data formats, processors standardize them, policies consume standardized data.** Each layer has a clear, focused responsibility. diff --git a/docs/source/envhub_leisaac.mdx b/docs/source/envhub_leisaac.mdx new file mode 100644 index 000000000..ff848d415 --- /dev/null +++ b/docs/source/envhub_leisaac.mdx @@ -0,0 +1,301 @@ +# LeIsaac × LeRobot EnvHub + +LeRobot EnvHub now supports **imitation learning in simulation** with LeIsaac. +Spin up everyday manipulation tasks, teleoperate the robot, collect demos, push them to the Hub, and train policies in LeRobot — all in one loop. 
+ +[LeIsaac](https://github.com/LightwheelAI/leisaac) integrates with IsaacLab and the SO101 Leader/Follower setup to provide: + +- 🕹️ **Teleoperation-first workflows** for data collection +- 📦 **Built-in data conversion** ready for LeRobot training +- 🤖 **Everyday skills** like picking oranges, lifting cubes, cleaning tables, and folding cloth +- ☁️ **Ongoing upgrades** from [LightWheel](https://lightwheel.ai/): cloud simulation, EnvHub support, Sim2Real tooling, and more + +Below you’ll find the currently supported LeIsaac tasks exposed through LeRobot EnvHub. + +# Available Environments + +The following table lists all available tasks and environments in LeIsaac x LeRobot Envhub. You can also get the latest list of environments by running the following command: + +```bash +python scripts/environments/list_envs.py +``` + +| Task | Environment ID | Task Description | Related Robot | +| :-------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------------------- | +| | [LeIsaac-SO101-PickOrange-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/pick_orange/pick_orange_env_cfg.py)

[LeIsaac-SO101-PickOrange-Direct-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/pick_orange/direct/pick_orange_env.py) | Pick three oranges and put them into the plate, then reset the arm to rest state. | Single-Arm SO101 Follower | +| | [LeIsaac-SO101-LiftCube-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/lift_cube/lift_cube_env_cfg.py)

[LeIsaac-SO101-LiftCube-Direct-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/lift_cube/direct/lift_cube_env.py) | Lift the red cube up. | Single-Arm SO101 Follower | +| | [LeIsaac-SO101-CleanToyTable-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/clean_toy_table/clean_toy_table_env_cfg.py)

[LeIsaac-SO101-CleanToyTable-BiArm-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/clean_toy_table/clean_toy_table_bi_arm_env_cfg.py)

[LeIsaac-SO101-CleanToyTable-BiArm-Direct-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/clean_toy_table/direct/clean_toy_table_bi_arm_env.py) | Pick two letter e objects into the box, and reset the arm to rest state. | Single-Arm SO101 Follower

Bi-Arm SO101 Follower | +| | [LeIsaac-SO101-FoldCloth-BiArm-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/fold_cloth/fold_cloth_bi_arm_env_cfg.py)

[LeIsaac-SO101-FoldCloth-BiArm-Direct-v0](https://github.com/LightwheelAI/leisaac/blob/main/source/leisaac/leisaac/tasks/fold_cloth/direct/fold_cloth_bi_arm_env.py) | Fold the cloth, and reset the arm to rest state.

 _Note: Only the DirectEnv supports check_success in this task._ | Bi-Arm SO101 Follower | + +# Load LeIsaac directly in LeRobot with one line of code + +> EnvHub: Share LeIsaac environments through HuggingFace + +[EnvHub](https://huggingface.co/docs/lerobot/envhub) is our reproducible environment hub: spin up a packaged simulation with one line, experiment immediately, and publish your own tasks for the community. + +LeIsaac offers EnvHub support so you can consume or share tasks with only a few commands. + +