diff --git a/docs/source/hilserl.mdx b/docs/source/hilserl.mdx
index 2f73d0964..b90e613db 100644
--- a/docs/source/hilserl.mdx
+++ b/docs/source/hilserl.mdx
@@ -56,27 +56,38 @@ pip install -e ".[hilserl]"

 ### Understanding Configuration

-The training process begins with proper configuration for the HILSerl environment. The configuration class of interest is `HILSerlRobotEnvConfig` in `lerobot/envs/configs.py`. Which is defined as:
+The training process begins with proper configuration of the HILSerl environment. The configuration class of interest is `HILSerlRobotEnvConfig` in `lerobot/envs/configs.py`. The configuration is now organized into focused, nested sub-configs:

 ```python
 class HILSerlRobotEnvConfig(EnvConfig):
     robot: RobotConfig | None = None  # Main robot agent (defined in `lerobot/robots`)
-    teleop: TeleoperatorConfig | None = None  # Teleoperator agent, e.g., gamepad or leader arm, (defined in `lerobot/teleoperators`)
-    wrapper: EnvTransformConfig | None = None  # Environment wrapper settings; check `lerobot/scripts/server/gym_manipulator.py`
-    fps: int = 10  # Control frequency
+    teleop: TeleoperatorConfig | None = None  # Teleoperator agent, e.g., gamepad or leader arm
+    processor: HILSerlProcessorConfig  # Processing pipeline configuration (nested)
+    dataset: DatasetConfig  # Dataset recording/replay configuration (nested)
     name: str = "real_robot"  # Environment name
-    mode: str = None  # "record", "replay", or None (for training)
+    mode: str | None = None  # "record", "replay", or None (for training)
+    device: str = "cuda"  # Compute device
+
+# Nested processor configuration
+class HILSerlProcessorConfig:
+    control_mode: str = "gamepad"  # Teleoperation control mode, e.g., "gamepad" or "leader"
+    observation: ObservationConfig  # Observation processing settings
+    image_preprocessing: ImagePreprocessingConfig  # Image crop/resize settings
+    gripper: GripperConfig  # Gripper control and penalty settings
+    reset: ResetConfig  # Environment reset and timing settings
+    inverse_kinematics: InverseKinematicsConfig  # IK processing settings
+    reward_classifier: RewardClassifierConfig  # Reward classifier settings
+
+# Dataset configuration
+class DatasetConfig:
     repo_id: str | None = None  # LeRobot dataset repository ID
     dataset_root: str | None = None  # Local dataset root (optional)
     task: str = ""  # Task identifier
     num_episodes: int = 10  # Number of episodes for recording
-    episode: int = 0  # episode index for replay
-    device: str = "cuda"  # Compute device
-    push_to_hub: bool = True  # Whether to push the recorded datasets to Hub
-    pretrained_policy_name_or_path: str | None = None  # For policy loading
-    reward_classifier_pretrained_path: str | None = None  # For reward model
-    number_of_steps_after_success: int = 0  # For reward classifier, collect more positive examples after a success to train a classifier
+    episode: int = 0  # Episode index for replay
+    push_to_hub: bool = True  # Whether to push datasets to Hub
+    fps: int = 10  # Control frequency
 ```
+
+(The listing above is abridged: in the actual code each of these is a `@dataclass`, and nested sub-configs are instantiated via `field(default_factory=...)`; see `lerobot/envs/configs.py` for the exact definitions.)
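+
+For reference, the same nested structure can be built programmatically. A minimal sketch using the dataclasses this PR adds to `lerobot/envs/configs.py` (the `repo_id` value is a placeholder):
+
+```python
+from lerobot.envs.configs import (
+    DatasetConfig,
+    HILSerlProcessorConfig,
+    HILSerlRobotEnvConfig,
+)
+
+env_cfg = HILSerlRobotEnvConfig(
+    mode="record",
+    dataset=DatasetConfig(repo_id="username/task_name", num_episodes=10, fps=10),
+    processor=HILSerlProcessorConfig(control_mode="gamepad"),
+)
+```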
@@ -133,19 +144,22 @@ Create a configuration file for recording demonstrations (or edit an existing on

 1. Set `mode` to `"record"`
-2. Specify a unique `repo_id` for your dataset (e.g., "username/task_name")
-3. Set `num_episodes` to the number of demonstrations you want to collect
-4. Set `crop_params_dict` to `null` initially (we'll determine crops later)
+2. Specify a unique `dataset.repo_id` for your dataset (e.g., "username/task_name")
+3. Set `dataset.num_episodes` to the number of demonstrations you want to collect
+4. Set `processor.image_preprocessing.crop_params_dict` to `null` initially (we'll determine crops later)
 5. Configure `robot`, `cameras`, and other hardware settings

 Example configuration section:

 ```json
 "mode": "record",
-"repo_id": "username/pick_lift_cube",
-"dataset_root": null,
-"task": "pick_and_lift",
-"num_episodes": 15,
-"episode": 0,
-"push_to_hub": true
+"dataset": {
+    "repo_id": "username/pick_lift_cube",
+    "dataset_root": null,
+    "task": "pick_and_lift",
+    "num_episodes": 15,
+    "episode": 0,
+    "push_to_hub": true,
+    "fps": 10
+}
 ```

 ### Using a Teleoperation Device

@@ -191,10 +205,13 @@ The gamepad provides a very convenient way to control the robot and the episode

 To set up the gamepad, you need to set the `control_mode` to `"gamepad"` and define the `teleop` section in the configuration file.

 ```json
-    "teleop": {
-        "type": "gamepad",
-        "use_gripper": true
-    },
+"teleop": {
+    "type": "gamepad",
+    "use_gripper": true
+},
+"processor": {
+    "control_mode": "gamepad"
+}
 ```
@@ -216,11 +233,14 @@ The SO101 leader arm has reduced gears that allows it to move and track the foll

 To set up the SO101 leader, you need to set the `control_mode` to `"leader"` and define the `teleop` section in the configuration file.

 ```json
-    "teleop": {
-        "type": "so101_leader",
-        "port": "/dev/tty.usbmodem585A0077921", # check your port number
-        "use_degrees": true
-    },
+"teleop": {
+    "type": "so101_leader",
+    "port": "/dev/tty.usbmodem585A0077921",
+    "use_degrees": true
+},
+"processor": {
+    "control_mode": "leader"
+}
 ```
+
+Replace the `port` value with the serial port of your own leader arm.

 In order to annotate the success/failure of the episode, **you will need** to use a keyboard to press `s` for success, `esc` for failure.

@@ -251,7 +271,7 @@ python -m lerobot.scripts.rl.gym_manipulator --config_path src/lerobot/configs/e

 During recording:

-1. The robot will reset to the initial position defined in the configuration file `fixed_reset_joint_positions`
+1. The robot will reset to the initial position defined by `processor.reset.fixed_reset_joint_positions` in the configuration file
 2. Complete the task successfully
 3. The episode ends with a reward of 1 when you press the "success" button
 4. If the time limit is reached, or the fail button is pressed, the episode ends with a reward of 0

@@ -310,11 +330,15 @@ observation.images.front: [180, 250, 120, 150]

 Add these crop parameters to your training configuration:

 ```json
-"crop_params_dict": {
-    "observation.images.side": [180, 207, 180, 200],
-    "observation.images.front": [180, 250, 120, 150]
-},
-"resize_size": [128, 128]
+"processor": {
+    "image_preprocessing": {
+        "crop_params_dict": {
+            "observation.images.side": [180, 207, 180, 200],
+            "observation.images.front": [180, 250, 120, 150]
+        },
+        "resize_size": [128, 128]
+    }
+}
 ```
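+
+To sanity-check crop values before training, you can apply them to a captured frame offline. A minimal sketch, assuming the four values follow torchvision's `(top, left, height, width)` convention used by `torchvision.transforms.functional.crop` (the random tensor stands in for a real camera frame):
+
+```python
+import torch
+import torchvision.transforms.functional as F
+
+crop_params_dict = {
+    "observation.images.side": [180, 207, 180, 200],
+    "observation.images.front": [180, 250, 120, 150],
+}
+resize_size = [128, 128]
+
+frame = torch.rand(3, 480, 640)  # dummy (C, H, W) image
+top, left, height, width = crop_params_dict["observation.images.side"]
+cropped = F.crop(frame, top, left, height, width)
+resized = F.resize(cropped, resize_size)
+print(resized.shape)  # torch.Size([3, 128, 128])
+```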

 **Recommended image resolution**

@@ -346,23 +370,29 @@ python -m lerobot.scripts.rl.gym_manipulator --config_path src/lerobot/configs/r

 - **mode**: set it to `"record"` to collect a dataset
-- **repo_id**: `"hf_username/dataset_name"`, name of the dataset and repo on the hub
-- **num_episodes**: Number of episodes to record
-- **number_of_steps_after_success**: Number of additional frames to record after a success (reward=1) is detected
-- **fps**: Number of frames per second to record
-- **push_to_hub**: Whether to push the dataset to the hub
+- **dataset.repo_id**: `"hf_username/dataset_name"`, name of the dataset and repo on the hub
+- **dataset.num_episodes**: Number of episodes to record
+- **processor.reset.number_of_steps_after_success**: Number of additional frames to record after a success (reward=1) is detected
+- **dataset.fps**: Number of frames per second to record
+- **dataset.push_to_hub**: Whether to push the dataset to the hub

-The `number_of_steps_after_success` parameter is crucial as it allows you to collect more positive examples. When a success is detected, the system will continue recording for the specified number of steps while maintaining the reward=1 label. Otherwise, there won't be enough states in the dataset labeled to 1 to train a good classifier.
+The `processor.reset.number_of_steps_after_success` parameter is crucial as it allows you to collect more positive examples. When a success is detected, the system continues recording for the specified number of steps while keeping the reward=1 label. Without it, the dataset would contain too few states labeled with reward 1 to train a good classifier.

 Example configuration section for data collection:

 ```json
 {
     "mode": "record",
-    "repo_id": "hf_username/dataset_name",
-    "dataset_root": "data/your_dataset",
-    "num_episodes": 20,
-    "push_to_hub": true,
-    "fps": 10,
-    "number_of_steps_after_success": 15
+    "dataset": {
+        "repo_id": "hf_username/dataset_name",
+        "dataset_root": "data/your_dataset",
+        "num_episodes": 20,
+        "push_to_hub": true,
+        "fps": 10
+    },
+    "processor": {
+        "reset": {
+            "number_of_steps_after_success": 15
+        }
+    }
 }
 ```

@@ -422,7 +452,11 @@ To use your trained reward classifier, configure the `HILSerlRobotEnvConfig` to

 ```python
 env_config = HILSerlRobotEnvConfig(
-    reward_classifier_pretrained_path="path_to_your_pretrained_trained_model",
+    processor=HILSerlProcessorConfig(
+        reward_classifier=RewardClassifierConfig(
+            pretrained_path="path_to_your_pretrained_model"
+        )
+    ),
     # Other environment parameters
 )
 ```

@@ -432,7 +466,13 @@ To use your trained reward classifier, configure the `HILSerlRobotEnvConfig` to
 or set the arguments in the JSON config file.

 ```json
 {
-    "reward_classifier_pretrained_path": "path_to_your_pretrained_model"
+    "processor": {
+        "reward_classifier": {
+            "pretrained_path": "path_to_your_pretrained_model",
+            "success_threshold": 0.7,
+            "success_reward": 1.0
+        }
+    }
 }
 ```
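+
+To sanity-check a trained classifier outside the environment loop, you can call it directly. This sketch uses the same calls the new `RewardClassifierProcessor` makes (`Classifier.from_pretrained` and `predict_reward`); the checkpoint path and the image tensor are placeholders:
+
+```python
+import torch
+
+from lerobot.policies.sac.reward_model.modeling_classifier import Classifier
+
+classifier = Classifier.from_pretrained("path_to_your_pretrained_model")
+classifier.to("cuda")
+classifier.eval()
+
+# Dummy batch standing in for a cropped/resized camera frame.
+images = {"observation.images.front": torch.rand(1, 3, 128, 128, device="cuda")}
+
+with torch.inference_mode():
+    success = classifier.predict_reward(images, threshold=0.7)
+print(f"Predicted success: {success}")
+```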
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 8d780f1cd..78a52616d 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -161,33 +161,36 @@ class XarmEnv(EnvConfig):

 @dataclass
-class VideoRecordConfig:
-    """Configuration for video recording in ManiSkill environments."""
-
-    enabled: bool = False
-    record_dir: str = "videos"
-    trajectory_name: str = "trajectory"
+class ImagePreprocessingConfig:
+    """Configuration for image cropping and resizing."""
+
+    crop_params_dict: dict[str, tuple[int, int, int, int]] | None = None
+    resize_size: tuple[int, int] | None = None


 @dataclass
-class HILSerlProcessorConfig:
-    """Configuration for environment wrappers."""
+class DatasetConfig:
+    """Configuration for dataset recording and replay."""

-    # ee_action_space_params: EEActionSpaceConfig = field(default_factory=EEActionSpaceConfig)
-    control_mode: str = "gamepad"
-    display_cameras: bool = False
-    add_joint_velocity_to_observation: bool = False
-    add_current_to_observation: bool = False
-    add_ee_pose_to_observation: bool = False
-    crop_params_dict: dict[str, tuple[int, int, int, int]] | None = None
-    resize_size: tuple[int, int] | None = None
-    control_time_s: float = 20.0
-    fixed_reset_joint_positions: Any | None = None
-    reset_time_s: float = 5.0
-    use_gripper: bool = True
-    gripper_quantization_threshold: float | None = 0.8
-    gripper_penalty: float = 0.0
-    gripper_penalty_in_reward: bool = False
+    repo_id: str | None = None
+    dataset_root: str | None = None
+    task: str | None = ""
+    num_episodes: int = 10
+    episode: int = 0  # for replay mode
+    push_to_hub: bool = True
+    fps: int = 10
+
+
+@dataclass
+class RewardClassifierConfig:
+    """Configuration for reward classification."""
+
+    pretrained_path: str | None = None
+    success_threshold: float = 0.5
+    success_reward: float = 1.0
+
+
+@dataclass
+class InverseKinematicsConfig:
+    """Configuration for inverse kinematics processing."""

     urdf_path: str | None = None
     target_frame_name: str | None = None
@@ -196,6 +199,50 @@ class HILSerlProcessorConfig:
     max_gripper_pos: float | None = None


+@dataclass
+class ObservationConfig:
+    """Configuration for observation processing."""
+
+    add_joint_velocity_to_observation: bool = False
+    add_current_to_observation: bool = False
+    add_ee_pose_to_observation: bool = False
+    display_cameras: bool = False
+
+
+@dataclass
+class GripperConfig:
+    """Configuration for gripper control and penalties."""
+
+    use_gripper: bool = True
+    gripper_quantization_threshold: float | None = 0.8
+    gripper_penalty: float = 0.0
+    gripper_penalty_in_reward: bool = False
+
+
+@dataclass
+class ResetConfig:
+    """Configuration for environment reset behavior."""
+
+    fixed_reset_joint_positions: Any | None = None
+    reset_time_s: float = 5.0
+    control_time_s: float = 20.0
+    terminate_on_success: bool = True
+    number_of_steps_after_success: int = 0
+
+
+@dataclass
+class HILSerlProcessorConfig:
+    """Configuration for environment processing pipeline."""
+
+    control_mode: str = "gamepad"
+    observation: ObservationConfig = field(default_factory=ObservationConfig)
+    image_preprocessing: ImagePreprocessingConfig = field(default_factory=ImagePreprocessingConfig)
+    gripper: GripperConfig = field(default_factory=GripperConfig)
+    reset: ResetConfig = field(default_factory=ResetConfig)
+    inverse_kinematics: InverseKinematicsConfig = field(default_factory=InverseKinematicsConfig)
+    reward_classifier: RewardClassifierConfig = field(default_factory=RewardClassifierConfig)
+
+
 @EnvConfig.register_subclass(name="gym_manipulator")
 @dataclass
 class HILSerlRobotEnvConfig(EnvConfig):
@@ -203,21 +250,12 @@ class HILSerlRobotEnvConfig(EnvConfig):

     robot: RobotConfig | None = None
     teleop: TeleoperatorConfig | None = None
-    processor: HILSerlProcessorConfig | None = None
-    fps: int = 10
+    processor: HILSerlProcessorConfig = field(default_factory=HILSerlProcessorConfig)
+    dataset: DatasetConfig = field(default_factory=DatasetConfig)
+
     name: str = "real_robot"
     mode: str | None = None  # Either "record", "replay", None
-    repo_id: str | None = None
-    dataset_root: str | None = None
-    task: str | None = ""
-    num_episodes: int = 10  # only for record mode
-    episode: int = 0
     device: str = "cuda"
-    push_to_hub: bool = True
-    pretrained_policy_name_or_path: str | None = None
-    reward_classifier_pretrained_path: str | None = None
-    # For the reward classifier, to record more positive examples after a success
-    number_of_steps_after_success: int = 0

     @property
     def gym_kwargs(self) -> dict:
+ """ + + pretrained_path: str = None + device: str = "cpu" + success_threshold: float = 0.5 + success_reward: float = 1.0 + terminate_on_success: bool = True + + reward_classifier: Any = None + + def __post_init__(self): + """Initialize the reward classifier after dataclass initialization.""" + if self.pretrained_path is not None: + from lerobot.policies.sac.reward_model.modeling_classifier import Classifier + + self.reward_classifier = Classifier.from_pretrained(self.pretrained_path) + self.reward_classifier.to(self.device) + self.reward_classifier.eval() + + def __call__(self, transition: EnvTransition) -> EnvTransition: + observation = transition.get(TransitionKey.OBSERVATION) + if observation is None or self.reward_classifier is None: + return transition + + # Extract images from observation + images = {key: value for key, value in observation.items() if "image" in key} + + if not images: + return transition + + # Run reward classifier + start_time = time.perf_counter() + with torch.inference_mode(): + success = self.reward_classifier.predict_reward(images, threshold=self.success_threshold) + + classifier_frequency = 1 / (time.perf_counter() - start_time) + + # Calculate reward and termination + reward = transition.get(TransitionKey.REWARD, 0.0) + terminated = transition.get(TransitionKey.DONE, False) + + if success == 1.0: + reward = self.success_reward + if self.terminate_on_success: + terminated = True + + # Update transition + new_transition = transition.copy() + new_transition[TransitionKey.REWARD] = reward + new_transition[TransitionKey.DONE] = terminated + + # Update info with classifier frequency + info = new_transition.get(TransitionKey.INFO, {}) + info["reward_classifier_frequency"] = classifier_frequency + new_transition[TransitionKey.INFO] = info + + return new_transition + + def get_config(self) -> dict[str, Any]: + return { + "device": self.device, + "success_threshold": self.success_threshold, + "success_reward": self.success_reward, + "terminate_on_success": self.terminate_on_success, + } + + def state_dict(self) -> dict[str, torch.Tensor]: + return {} + + def load_state_dict(self, state: dict[str, torch.Tensor]) -> None: + pass + + def reset(self) -> None: + pass + + def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]: + return features diff --git a/src/lerobot/scripts/rl/gym_manipulator.py b/src/lerobot/scripts/rl/gym_manipulator.py index 9673bb03e..b83718e94 100644 --- a/src/lerobot/scripts/rl/gym_manipulator.py +++ b/src/lerobot/scripts/rl/gym_manipulator.py @@ -36,6 +36,7 @@ from lerobot.processor.hil_processor import ( GripperPenaltyProcessor, ImageCropResizeProcessor, InterventionActionProcessor, + RewardClassifierProcessor, TimeLimitProcessor, ) from lerobot.processor.pipeline import TransitionKey @@ -308,9 +309,9 @@ def make_robot_env(cfg: EnvConfig) -> tuple[gym.Env, Any]: # Create base environment env = RobotEnv( robot=robot, - use_gripper=cfg.processor.use_gripper, - display_cameras=cfg.processor.display_cameras, - reset_pose=cfg.processor.fixed_reset_joint_positions, + use_gripper=cfg.processor.gripper.use_gripper, + display_cameras=cfg.processor.observation.display_cameras, + reset_pose=cfg.processor.reset.fixed_reset_joint_positions, ) return env, teleop_device @@ -330,33 +331,48 @@ def make_processors(env, cfg): env_pipeline_steps = [ ImageProcessor(), StateProcessor(), - JointVelocityProcessor(dt=1.0 / cfg.fps), + JointVelocityProcessor(dt=1.0 / cfg.dataset.fps), MotorCurrentProcessor(env=env), 
+    """
+
+    pretrained_path: str | None = None
+    device: str = "cpu"
+    success_threshold: float = 0.5
+    success_reward: float = 1.0
+    terminate_on_success: bool = True
+
+    reward_classifier: Any = None
+
+    def __post_init__(self):
+        """Initialize the reward classifier after dataclass initialization."""
+        if self.pretrained_path is not None:
+            from lerobot.policies.sac.reward_model.modeling_classifier import Classifier
+
+            self.reward_classifier = Classifier.from_pretrained(self.pretrained_path)
+            self.reward_classifier.to(self.device)
+            self.reward_classifier.eval()
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        observation = transition.get(TransitionKey.OBSERVATION)
+        if observation is None or self.reward_classifier is None:
+            return transition
+
+        # Extract images from observation
+        images = {key: value for key, value in observation.items() if "image" in key}
+
+        if not images:
+            return transition
+
+        # Run reward classifier
+        start_time = time.perf_counter()
+        with torch.inference_mode():
+            success = self.reward_classifier.predict_reward(images, threshold=self.success_threshold)
+
+        classifier_frequency = 1 / (time.perf_counter() - start_time)
+
+        # Calculate reward and termination
+        reward = transition.get(TransitionKey.REWARD, 0.0)
+        terminated = transition.get(TransitionKey.DONE, False)
+
+        if success == 1.0:
+            reward = self.success_reward
+            if self.terminate_on_success:
+                terminated = True
+
+        # Update transition
+        new_transition = transition.copy()
+        new_transition[TransitionKey.REWARD] = reward
+        new_transition[TransitionKey.DONE] = terminated
+
+        # Update info with classifier frequency
+        info = new_transition.get(TransitionKey.INFO, {})
+        info["reward_classifier_frequency"] = classifier_frequency
+        new_transition[TransitionKey.INFO] = info
+
+        return new_transition
+
+    def get_config(self) -> dict[str, Any]:
+        return {
+            "device": self.device,
+            "success_threshold": self.success_threshold,
+            "success_reward": self.success_reward,
+            "terminate_on_success": self.terminate_on_success,
+        }
+
+    def state_dict(self) -> dict[str, torch.Tensor]:
+        return {}
+
+    def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
+        pass
+
+    def reset(self) -> None:
+        pass
+
+    def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
+        return features
diff --git a/src/lerobot/scripts/rl/gym_manipulator.py b/src/lerobot/scripts/rl/gym_manipulator.py
index 9673bb03e..b83718e94 100644
--- a/src/lerobot/scripts/rl/gym_manipulator.py
+++ b/src/lerobot/scripts/rl/gym_manipulator.py
@@ -36,6 +36,7 @@ from lerobot.processor.hil_processor import (
     GripperPenaltyProcessor,
     ImageCropResizeProcessor,
     InterventionActionProcessor,
+    RewardClassifierProcessor,
     TimeLimitProcessor,
 )
 from lerobot.processor.pipeline import TransitionKey
@@ -308,9 +309,9 @@ def make_robot_env(cfg: EnvConfig) -> tuple[gym.Env, Any]:
     # Create base environment
     env = RobotEnv(
         robot=robot,
-        use_gripper=cfg.processor.use_gripper,
-        display_cameras=cfg.processor.display_cameras,
-        reset_pose=cfg.processor.fixed_reset_joint_positions,
+        use_gripper=cfg.processor.gripper.use_gripper,
+        display_cameras=cfg.processor.observation.display_cameras,
+        reset_pose=cfg.processor.reset.fixed_reset_joint_positions,
     )

     return env, teleop_device
@@ -330,33 +331,48 @@ def make_processors(env, cfg):
     env_pipeline_steps = [
         ImageProcessor(),
         StateProcessor(),
-        JointVelocityProcessor(dt=1.0 / cfg.fps),
+        JointVelocityProcessor(dt=1.0 / cfg.dataset.fps),
         MotorCurrentProcessor(env=env),
         ImageCropResizeProcessor(
-            crop_params_dict=cfg.processor.crop_params_dict, resize_size=cfg.processor.resize_size
+            crop_params_dict=cfg.processor.image_preprocessing.crop_params_dict,
+            resize_size=cfg.processor.image_preprocessing.resize_size,
         ),
-        TimeLimitProcessor(max_episode_steps=int(cfg.processor.control_time_s * cfg.fps)),
+        TimeLimitProcessor(max_episode_steps=int(cfg.processor.reset.control_time_s * cfg.dataset.fps)),
     ]

-    if cfg.processor.use_gripper:
+    if cfg.processor.gripper.use_gripper:
         env_pipeline_steps.append(
             GripperPenaltyProcessor(
-                penalty=cfg.processor.gripper_penalty, max_gripper_pos=cfg.processor.max_gripper_pos
+                penalty=cfg.processor.gripper.gripper_penalty,
+                max_gripper_pos=cfg.processor.inverse_kinematics.max_gripper_pos,
             )
         )
+
+    # Add reward classifier processor if configured
+    if cfg.processor.reward_classifier.pretrained_path is not None:
+        env_pipeline_steps.append(
+            RewardClassifierProcessor(
+                pretrained_path=cfg.processor.reward_classifier.pretrained_path,
+                device=cfg.device,
+                success_threshold=cfg.processor.reward_classifier.success_threshold,
+                success_reward=cfg.processor.reward_classifier.success_reward,
+                terminate_on_success=cfg.processor.reward_classifier.terminate_on_success,
+            )
+        )
+
     env_pipeline_steps.append(DeviceProcessor(device=cfg.device))
     env_processor = RobotProcessor(steps=env_pipeline_steps)

     action_pipeline_steps = [
         InterventionActionProcessor(
-            use_gripper=cfg.processor.use_gripper,
+            use_gripper=cfg.processor.gripper.use_gripper,
         ),
         InverseKinematicsProcessor(
-            urdf_path=cfg.processor.urdf_path,
-            target_frame_name=cfg.processor.target_frame_name,
-            end_effector_step_sizes=cfg.processor.end_effector_step_sizes,
-            end_effector_bounds=cfg.processor.end_effector_bounds,
-            max_gripper_pos=cfg.processor.max_gripper_pos,
+            urdf_path=cfg.processor.inverse_kinematics.urdf_path,
+            target_frame_name=cfg.processor.inverse_kinematics.target_frame_name,
+            end_effector_step_sizes=cfg.processor.inverse_kinematics.end_effector_step_sizes,
+            end_effector_bounds=cfg.processor.inverse_kinematics.end_effector_bounds,
+            max_gripper_pos=cfg.processor.inverse_kinematics.max_gripper_pos,
         ),
     ]
     action_processor = RobotProcessor(steps=action_pipeline_steps)
@@ -435,9 +451,9 @@ def step_env_and_process_transition(

 def control_loop(env, env_processor, action_processor, teleop_device, cfg: EnvConfig):
-    dt = 1.0 / cfg.fps
+    dt = 1.0 / cfg.dataset.fps

-    print(f"Starting control loop at {cfg.fps} FPS")
+    print(f"Starting control loop at {cfg.dataset.fps} FPS")
     print("Controls:")
     print("- Use gamepad/teleop device for intervention")
     print("- When not intervening, robot will stay still")
@@ -460,7 +476,7 @@ def control_loop(env, env_processor, action_processor, teleop_device, cfg: EnvCo
         "next.reward": {"dtype": "float32", "shape": (1,), "names": None},
         "next.done": {"dtype": "bool", "shape": (1,), "names": None},
     }
-    if cfg.processor.use_gripper:
+    if cfg.processor.gripper.use_gripper:
         features["complementary_info.discrete_penalty"] = {
             "dtype": "float32",
             "shape": (1,),
@@ -483,9 +499,9 @@ def control_loop(env, env_processor, action_processor, teleop_device, cfg: EnvCo
     # Create dataset
     dataset = LeRobotDataset.create(
-        cfg.repo_id,
-        cfg.fps,
-        root=cfg.dataset_root,
+        cfg.dataset.repo_id,
+        cfg.dataset.fps,
+        root=cfg.dataset.dataset_root,
         use_videos=True,
         image_writer_threads=4,
         image_writer_processes=0,
@@ -496,12 +512,12 @@ def control_loop(env, env_processor, action_processor, teleop_device, cfg: EnvCo
     episode_step = 0
     episode_start_time = time.perf_counter()

-    while episode_idx < cfg.num_episodes:
+    while episode_idx < cfg.dataset.num_episodes:
         step_start_time = time.perf_counter()

         # Create a neutral action (no movement)
         neutral_action = torch.tensor([0.0, 0.0, 0.0], dtype=torch.float32)
-        if hasattr(env, "use_gripper") and env.use_gripper:
+        if cfg.processor.gripper.use_gripper:
             neutral_action = torch.cat([neutral_action, torch.tensor([1.0])])  # Gripper stay

         # Use the new step function
@@ -524,11 +540,11 @@ def control_loop(env, env_processor, action_processor, teleop_device, cfg: EnvCo
             "next.reward": np.array([transition[TransitionKey.REWARD]], dtype=np.float32),
             "next.done": np.array([terminated or truncated], dtype=bool),
         }
-        if cfg.processor.use_gripper:
+        if cfg.processor.gripper.use_gripper:
             frame["complementary_info.discrete_penalty"] = np.array(
                 [transition[TransitionKey.COMPLEMENTARY_DATA]["discrete_penalty"]], dtype=np.float32
             )
-        dataset.add_frame(frame, task=cfg.task)
+        dataset.add_frame(frame, task=cfg.dataset.task)

         episode_step += 1

@@ -562,14 +578,17 @@ def control_loop(env, env_processor, action_processor, teleop_device, cfg: EnvCo
         # Maintain fps timing
         busy_wait(dt - (time.perf_counter() - step_start_time))

-    if cfg.mode == "record" and cfg.push_to_hub:
+    if cfg.mode == "record" and cfg.dataset.push_to_hub:
         logging.info("Pushing dataset to hub")
         dataset.push_to_hub()


 def replay_trajectory(env, action_processor, cfg):
     dataset = LeRobotDataset(
-        cfg.repo_id, root=cfg.dataset_root, episodes=[cfg.episode], download_videos=False
+        cfg.dataset.repo_id,
+        root=cfg.dataset.dataset_root,
+        episodes=[cfg.dataset.episode],
+        download_videos=False,
     )
     dataset_actions = dataset.hf_dataset.select_columns(["action"])
     _, info = env.reset()
@@ -581,7 +600,7 @@ def replay_trajectory(env, action_processor, cfg):
         )
         transition = action_processor(transition)
         env.step(transition[TransitionKey.ACTION])
-        busy_wait(1 / cfg.fps - (time.perf_counter() - start_time))
+        busy_wait(1 / cfg.dataset.fps - (time.perf_counter() - start_time))


 @parser.wrap()