diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py index 7f326b70b..44b013c29 100644 --- a/src/lerobot/configs/policies.py +++ b/src/lerobot/configs/policies.py @@ -45,12 +45,12 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # type: igno Args: n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the current step and additional steps going back). - input_shapes: A dictionary defining the shapes of the input data for the policy. - output_shapes: A dictionary defining the shapes of the output data for the policy. - input_normalization_modes: A dictionary with key representing the modality and the value specifies the - normalization mode to apply. - output_normalization_modes: Similar dictionary as `input_normalization_modes`, but to unnormalize to - the original scale. + input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents + the input data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents + the output data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. 
+ normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to + a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX). """ n_obs_steps: int = 1 diff --git a/src/lerobot/datasets/transforms.py b/src/lerobot/datasets/transforms.py index beacc48d9..5240619cb 100644 --- a/src/lerobot/datasets/transforms.py +++ b/src/lerobot/datasets/transforms.py @@ -216,16 +216,17 @@ class ImageTransformsConfig: def make_transform_from_config(cfg: ImageTransformConfig): - if cfg.type == "Identity": - return v2.Identity(**cfg.kwargs) - elif cfg.type == "ColorJitter": - return v2.ColorJitter(**cfg.kwargs) - elif cfg.type == "SharpnessJitter": + if cfg.type == "SharpnessJitter": return SharpnessJitter(**cfg.kwargs) - elif cfg.type == "RandomAffine": - return v2.RandomAffine(**cfg.kwargs) - else: - raise ValueError(f"Transform '{cfg.type}' is not valid.") + + transform_cls = getattr(v2, cfg.type, None) + if isinstance(transform_cls, type) and issubclass(transform_cls, Transform): + return transform_cls(**cfg.kwargs) + + raise ValueError( + f"Transform '{cfg.type}' is not valid. It must be a class in " + f"torchvision.transforms.v2 or 'SharpnessJitter'." 
+ ) class ImageTransforms(Transform): diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index cd88b37bc..9c1c083a4 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -205,6 +205,7 @@ class ObservationConfig: add_joint_velocity_to_observation: bool = False add_current_to_observation: bool = False + add_ee_pose_to_observation: bool = False display_cameras: bool = False diff --git a/src/lerobot/policies/act/configuration_act.py b/src/lerobot/policies/act/configuration_act.py index 6f6c1c4be..bd89185fd 100644 --- a/src/lerobot/policies/act/configuration_act.py +++ b/src/lerobot/policies/act/configuration_act.py @@ -28,7 +28,7 @@ class ACTConfig(PreTrainedConfig): Defaults are configured for training on bimanual Aloha tasks like "insertion" or "transfer". The parameters you will most likely need to change are the ones which depend on the environment / sensors. - Those are: `input_shapes` and 'output_shapes`. + Those are: `input_features` and `output_features`. Notes on the inputs and outputs: - Either: @@ -48,21 +48,12 @@ class ACTConfig(PreTrainedConfig): This should be no greater than the chunk size. For example, if the chunk size size 100, you may set this to 50. This would mean that the model predicts 100 steps worth of actions, runs 50 in the environment, and throws the other 50 out. - input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents - the input data name, and the value is a list indicating the dimensions of the corresponding data. - For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96], - indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't - include batch dimension or temporal dimension. - output_shapes: A dictionary defining the shapes of the output data for the policy. 
The key represents - the output data name, and the value is a list indicating the dimensions of the corresponding data. - For example, "action" refers to an output shape of [14], indicating 14-dimensional actions. - Importantly, `output_shapes` doesn't include batch dimension or temporal dimension. - input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"), - and the value specifies the normalization mode to apply. The two available modes are "mean_std" - which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a - [-1, 1] range. - output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the - original scale. Note that this is also used for normalizing the training targets. + input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents + the input data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents + the output data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to + a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX). vision_backbone: Name of the torchvision resnet backbone to use for encoding images. pretrained_backbone_weights: Pretrained weights from torchvision to initialize the backbone. `None` means no pretrained weights. 
diff --git a/src/lerobot/policies/diffusion/configuration_diffusion.py b/src/lerobot/policies/diffusion/configuration_diffusion.py index 54569434a..8ac0920dd 100644 --- a/src/lerobot/policies/diffusion/configuration_diffusion.py +++ b/src/lerobot/policies/diffusion/configuration_diffusion.py @@ -30,7 +30,7 @@ class DiffusionConfig(PreTrainedConfig): Defaults are configured for training with PushT providing proprioceptive and single camera observations. The parameters you will most likely need to change are the ones which depend on the environment / sensors. - Those are: `input_shapes` and `output_shapes`. + Those are: `input_features` and `output_features`. Notes on the inputs and outputs: - "observation.state" is required as an input key. @@ -48,21 +48,12 @@ class DiffusionConfig(PreTrainedConfig): horizon: Diffusion model action prediction size as detailed in `DiffusionPolicy.select_action`. n_action_steps: The number of action steps to run in the environment for one invocation of the policy. See `DiffusionPolicy.select_action` for more details. - input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents - the input data name, and the value is a list indicating the dimensions of the corresponding data. - For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96], - indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't - include batch dimension or temporal dimension. - output_shapes: A dictionary defining the shapes of the output data for the policy. The key represents - the output data name, and the value is a list indicating the dimensions of the corresponding data. - For example, "action" refers to an output shape of [14], indicating 14-dimensional actions. - Importantly, `output_shapes` doesn't include batch dimension or temporal dimension. - input_normalization_modes: A dictionary with key representing the modality (e.g. 
"observation.state"), - and the value specifies the normalization mode to apply. The two available modes are "mean_std" - which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a - [-1, 1] range. - output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the - original scale. Note that this is also used for normalizing the training targets. + input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents + the input data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents + the output data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to + a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX). vision_backbone: Name of the torchvision resnet backbone to use for encoding images. crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit within the image size. If None, no cropping is done. @@ -73,7 +64,7 @@ class DiffusionConfig(PreTrainedConfig): use_group_norm: Whether to replace batch normalization with group normalization in the backbone. The group sizes are set to be about 16 (to be precise, feature_dim // 16). spatial_softmax_num_keypoints: Number of keypoints for SpatialSoftmax. - use_separate_rgb_encoders_per_camera: Whether to use a separate RGB encoder for each camera view. + use_separate_rgb_encoder_per_camera: Whether to use a separate RGB encoder for each camera view. down_dims: Feature dimension for each stage of temporal downsampling in the diffusion modeling Unet. You may provide a variable number of dimensions, therefore also controlling the degree of downsampling. 
diff --git a/src/lerobot/policies/tdmpc/configuration_tdmpc.py b/src/lerobot/policies/tdmpc/configuration_tdmpc.py index 3c1a29932..3ec493472 100644 --- a/src/lerobot/policies/tdmpc/configuration_tdmpc.py +++ b/src/lerobot/policies/tdmpc/configuration_tdmpc.py @@ -30,7 +30,7 @@ class TDMPCConfig(PreTrainedConfig): camera observations. The parameters you will most likely need to change are the ones which depend on the environment / sensors. - Those are: `input_shapes`, `output_shapes`, and perhaps `max_random_shift_ratio`. + Those are: `input_features`, `output_features`, and perhaps `max_random_shift_ratio`. Args: n_action_repeats: The number of times to repeat the action returned by the planning. (hint: Google @@ -40,24 +40,12 @@ class TDMPCConfig(PreTrainedConfig): is an alternative to using action repeats. If this is set to more than 1, then we require `n_action_repeats == 1`, `use_mpc == True` and `n_action_steps <= horizon`. Note that this approach of using multiple steps from the plan is not in the original implementation. - input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents - the input data name, and the value is a list indicating the dimensions of the corresponding data. - For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96], - indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't - include batch dimension or temporal dimension. - output_shapes: A dictionary defining the shapes of the output data for the policy. The key represents - the output data name, and the value is a list indicating the dimensions of the corresponding data. - For example, "action" refers to an output shape of [14], indicating 14-dimensional actions. - Importantly, `output_shapes` doesn't include batch dimension or temporal dimension. - input_normalization_modes: A dictionary with key representing the modality (e.g. 
"observation.state"), - and the value specifies the normalization mode to apply. The two available modes are "mean_std" - which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a - [-1, 1] range. Note that here this defaults to None meaning inputs are not normalized. This is to - match the original implementation. - output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the - original scale. Note that this is also used for normalizing the training targets. NOTE: Clipping - to [-1, +1] is used during MPPI/CEM. Therefore, it is recommended that you stick with "min_max" - normalization mode here. + input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents + the input data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents + the output data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to + a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX). image_encoder_hidden_dim: Number of channels for the convolutional layers used for image encoding. state_encoder_hidden_dim: Hidden dimension for MLP used for state vector encoding. latent_dim: Observation's latent embedding dimension. diff --git a/src/lerobot/policies/vqbet/configuration_vqbet.py b/src/lerobot/policies/vqbet/configuration_vqbet.py index 44ada9f17..32906e528 100644 --- a/src/lerobot/policies/vqbet/configuration_vqbet.py +++ b/src/lerobot/policies/vqbet/configuration_vqbet.py @@ -32,7 +32,7 @@ class VQBeTConfig(PreTrainedConfig): Defaults are configured for training with PushT providing proprioceptive and single camera observations. 
The parameters you will most likely need to change are the ones which depend on the environment / sensors. - Those are: `input_shapes` and `output_shapes`. + Those are: `input_features` and `output_features`. Notes on the inputs and outputs: - "observation.state" is required as an input key. @@ -46,21 +46,12 @@ class VQBeTConfig(PreTrainedConfig): current step and additional steps going back). n_action_pred_token: Total number of current token and future tokens that VQ-BeT predicts. action_chunk_size: Action chunk size of each action prediction token. - input_shapes: A dictionary defining the shapes of the input data for the policy. - The key represents the input data name, and the value is a list indicating the dimensions - of the corresponding data. For example, "observation.image" refers to an input from - a camera with dimensions [3, 96, 96], indicating it has three color channels and 96x96 resolution. - Importantly, shapes doesnt include batch dimension or temporal dimension. - output_shapes: A dictionary defining the shapes of the output data for the policy. - The key represents the output data name, and the value is a list indicating the dimensions - of the corresponding data. For example, "action" refers to an output shape of [14], indicating - 14-dimensional actions. Importantly, shapes doesnt include batch dimension or temporal dimension. - input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"), - and the value specifies the normalization mode to apply. The two available modes are "mean_std" - which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a - [-1, 1] range. - output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the - original scale. Note that this is also used for normalizing the training targets. + input_features: A dictionary defining the PolicyFeature of the input data for the policy. 
The key represents + the input data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents + the output data name, and the value is a PolicyFeature, which consists of FeatureType and shape attributes. + normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to + a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX). vision_backbone: Name of the torchvision resnet backbone to use for encoding images. crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit within the image size. If None, no cropping is done. diff --git a/src/lerobot/processor/hil_processor.py b/src/lerobot/processor/hil_processor.py index 6d44ed8cb..24b5628fa 100644 --- a/src/lerobot/processor/hil_processor.py +++ b/src/lerobot/processor/hil_processor.py @@ -314,7 +314,7 @@ class TimeLimitProcessorStep(TruncatedProcessorStep): @dataclass @ProcessorStepRegistry.register("gripper_penalty_processor") -class GripperPenaltyProcessorStep(ComplementaryDataProcessorStep): +class GripperPenaltyProcessorStep(ProcessorStep): """ Applies a penalty for inefficient gripper usage. @@ -329,26 +329,27 @@ class GripperPenaltyProcessorStep(ComplementaryDataProcessorStep): penalty: float = -0.01 max_gripper_pos: float = 30.0 - def complementary_data(self, complementary_data: dict) -> dict: + def __call__(self, transition: EnvTransition) -> EnvTransition: """ Calculates the gripper penalty and adds it to the complementary data. Args: - complementary_data: The incoming complementary data, which should contain - raw joint positions. + transition: The incoming environment transition. Returns: - A new complementary data dictionary with the `discrete_penalty` key added. + The modified transition with the penalty added to complementary data. 
""" - action = self.transition.get(TransitionKey.ACTION) + new_transition = transition.copy() + action = new_transition.get(TransitionKey.ACTION) + complementary_data = new_transition.get(TransitionKey.COMPLEMENTARY_DATA, {}) raw_joint_positions = complementary_data.get("raw_joint_positions") if raw_joint_positions is None: - return complementary_data + return new_transition current_gripper_pos = raw_joint_positions.get(GRIPPER_KEY, None) if current_gripper_pos is None: - return complementary_data + return new_transition # Gripper action is a PolicyAction at this stage gripper_action = action[-1].item() @@ -364,11 +365,12 @@ class GripperPenaltyProcessorStep(ComplementaryDataProcessorStep): gripper_penalty = self.penalty * int(gripper_penalty_bool) - # Create new complementary data with penalty info + # Update complementary data with penalty info new_complementary_data = dict(complementary_data) new_complementary_data[DISCRETE_PENALTY_KEY] = gripper_penalty + new_transition[TransitionKey.COMPLEMENTARY_DATA] = new_complementary_data - return new_complementary_data + return new_transition def get_config(self) -> dict[str, Any]: """ diff --git a/src/lerobot/rl/gym_manipulator.py b/src/lerobot/rl/gym_manipulator.py index 3d58ae18f..1c1cb752f 100644 --- a/src/lerobot/rl/gym_manipulator.py +++ b/src/lerobot/rl/gym_manipulator.py @@ -412,7 +412,10 @@ def make_processors( if cfg.processor.observation.add_current_to_observation: env_pipeline_steps.append(MotorCurrentProcessorStep(robot=env.robot)) - if kinematics_solver is not None: + add_ee_pose = ( + cfg.processor.observation is not None and cfg.processor.observation.add_ee_pose_to_observation + ) + if kinematics_solver is not None and add_ee_pose: env_pipeline_steps.append( ForwardKinematicsJointsToEEObservation( kinematics=kinematics_solver, @@ -435,7 +438,12 @@ def make_processors( ) # Add gripper penalty processor if gripper config exists and enabled - if cfg.processor.gripper is not None and 
cfg.processor.gripper.use_gripper: + # Only add if max_gripper_pos is explicitly configured (required for normalization) + if ( + cfg.processor.gripper is not None + and cfg.processor.gripper.use_gripper + and cfg.processor.max_gripper_pos is not None + ): env_pipeline_steps.append( GripperPenaltyProcessorStep( penalty=cfg.processor.gripper.gripper_penalty, diff --git a/src/lerobot/rl/wandb_utils.py b/src/lerobot/rl/wandb_utils.py index 7b7f8a57b..ee30b75df 100644 --- a/src/lerobot/rl/wandb_utils.py +++ b/src/lerobot/rl/wandb_utils.py @@ -26,8 +26,21 @@ from lerobot.configs.train import TrainPipelineConfig from lerobot.utils.constants import PRETRAINED_MODEL_DIR -def cfg_to_group(cfg: TrainPipelineConfig, return_list: bool = False) -> list[str] | str: +def cfg_to_group( + cfg: TrainPipelineConfig, return_list: bool = False, truncate_tags: bool = False, max_tag_length: int = 64 +) -> list[str] | str: """Return a group name for logging. Optionally returns group name as list.""" + + def _maybe_truncate(tag: str) -> str: + """Truncate tag to max_tag_length characters if required. + + wandb rejects tags longer than 64 characters. 
+ See: https://github.com/wandb/wandb/blob/main/wandb/sdk/wandb_settings.py + """ + if len(tag) <= max_tag_length: + return tag + return tag[:max_tag_length] + lst = [ f"policy:{cfg.policy.type}", f"seed:{cfg.seed}", @@ -36,6 +49,8 @@ def cfg_to_group(cfg: TrainPipelineConfig, return_list: bool = False) -> list[st lst.append(f"dataset:{cfg.dataset.repo_id}") if cfg.env is not None: lst.append(f"env:{cfg.env.type}") + if truncate_tags: + lst = [_maybe_truncate(tag) for tag in lst] return lst if return_list else "-".join(lst) @@ -83,7 +98,7 @@ class WandBLogger: entity=self.cfg.entity, name=self.job_name, notes=self.cfg.notes, - tags=cfg_to_group(cfg, return_list=True), + tags=cfg_to_group(cfg, return_list=True, truncate_tags=True), dir=self.log_dir, config=cfg.to_dict(), # TODO(rcadene): try set to True diff --git a/tests/datasets/test_image_transforms.py b/tests/datasets/test_image_transforms.py index 8a66ceb24..ef7e8c395 100644 --- a/tests/datasets/test_image_transforms.py +++ b/tests/datasets/test_image_transforms.py @@ -390,6 +390,30 @@ def test_sharpness_jitter_invalid_range_max_smaller(): SharpnessJitter((2.0, 0.1)) +def test_make_transform_from_config_with_v2_resize(img_tensor_factory): + img_tensor = img_tensor_factory() + tf_cfg = ImageTransformConfig(type="Resize", kwargs={"size": (32, 32)}) + tf = make_transform_from_config(tf_cfg) + assert isinstance(tf, v2.Resize) + output = tf(img_tensor) + assert output.shape[-2:] == (32, 32) + + +def test_make_transform_from_config_with_v2_identity(img_tensor_factory): + img_tensor = img_tensor_factory() + tf_cfg = ImageTransformConfig(type="Identity", kwargs={}) + tf = make_transform_from_config(tf_cfg) + assert isinstance(tf, v2.Identity) + output = tf(img_tensor) + assert output.shape == img_tensor.shape + + +def test_make_transform_from_config_invalid_type(): + tf_cfg = ImageTransformConfig(type="NotARealTransform", kwargs={}) + with pytest.raises(ValueError, match="not valid"): + 
make_transform_from_config(tf_cfg) + + def test_save_all_transforms(img_tensor_factory, tmp_path): img_tensor = img_tensor_factory() tf_cfg = ImageTransformsConfig(enable=True)