Mirror of https://github.com/huggingface/lerobot.git (synced 2026-05-16 17:20:05 +00:00)

Compare commits (5 commits):

- e07bc8fd97
- 0f39248445
- a6370dd783
- 14a15f90e7
- 9c24a09665
```diff
@@ -45,12 +45,12 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # type: igno
     Args:
         n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the
            current step and additional steps going back).
-        input_shapes: A dictionary defining the shapes of the input data for the policy.
-        output_shapes: A dictionary defining the shapes of the output data for the policy.
-        input_normalization_modes: A dictionary with key representing the modality and the value specifies the
-            normalization mode to apply.
-        output_normalization_modes: Similar dictionary as `input_normalization_modes`, but to unnormalize to
-            the original scale.
+        input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents
+            the input data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents
+            the output data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to
+            a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX)
     """
 
     n_obs_steps: int = 1
```
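This rename from raw shape/normalization dicts to typed features recurs in the ACT, Diffusion, TDMPC, and VQBeT docstrings below. As a hedged sketch of the new-style values (the exact `PolicyFeature` constructor should be checked against `lerobot.configs.types`):

```python
# Hedged sketch of the new-style config values described in the docstring above.
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature

input_features = {
    # key = input data name; value bundles FeatureType and shape
    "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 96, 96)),
    "observation.state": PolicyFeature(type=FeatureType.STATE, shape=(14,)),
}
output_features = {
    "action": PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
}
# str value of FeatureType -> NormalizationMode, replacing the two old
# input/output_normalization_modes dicts
normalization_mapping = {
    "VISUAL": NormalizationMode.MEAN_STD,
    "STATE": NormalizationMode.MIN_MAX,
    "ACTION": NormalizationMode.MIN_MAX,
}
```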
```diff
@@ -205,6 +205,7 @@ class ObservationConfig:
 
     add_joint_velocity_to_observation: bool = False
     add_current_to_observation: bool = False
+    add_ee_pose_to_observation: bool = False
     display_cameras: bool = False
 
 
```
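The new flag is consumed by `make_processors` further down, which gates the `ForwardKinematicsJointsToEEObservation` step on it. A minimal illustration of enabling it (the import path below is hypothetical; only the field name comes from this diff):

```python
# Illustration only: the ObservationConfig import path below is a guess.
from lerobot.scripts.rl.gym_manipulator import ObservationConfig  # hypothetical path

obs_cfg = ObservationConfig(add_ee_pose_to_observation=True)  # new flag from this commit
```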
```diff
@@ -28,7 +28,7 @@ class ACTConfig(PreTrainedConfig):
     Defaults are configured for training on bimanual Aloha tasks like "insertion" or "transfer".
 
     The parameters you will most likely need to change are the ones which depend on the environment / sensors.
-    Those are: `input_shapes` and 'output_shapes`.
+    Those are: `input_features` and `output_features`.
 
     Notes on the inputs and outputs:
         - Either:
@@ -48,21 +48,12 @@ class ACTConfig(PreTrainedConfig):
         This should be no greater than the chunk size. For example, if the chunk size size 100, you may
         set this to 50. This would mean that the model predicts 100 steps worth of actions, runs 50 in the
         environment, and throws the other 50 out.
-        input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents
-            the input data name, and the value is a list indicating the dimensions of the corresponding data.
-            For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96],
-            indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't
-            include batch dimension or temporal dimension.
-        output_shapes: A dictionary defining the shapes of the output data for the policy. The key represents
-            the output data name, and the value is a list indicating the dimensions of the corresponding data.
-            For example, "action" refers to an output shape of [14], indicating 14-dimensional actions.
-            Importantly, `output_shapes` doesn't include batch dimension or temporal dimension.
-        input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"),
-            and the value specifies the normalization mode to apply. The two available modes are "mean_std"
-            which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a
-            [-1, 1] range.
-        output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the
-            original scale. Note that this is also used for normalizing the training targets.
+        input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents
+            the input data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents
+            the output data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to
+            a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX)
         vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
         pretrained_backbone_weights: Pretrained weights from torchvision to initialize the backbone.
             `None` means no pretrained weights.
```
```diff
@@ -30,7 +30,7 @@ class DiffusionConfig(PreTrainedConfig):
     Defaults are configured for training with PushT providing proprioceptive and single camera observations.
 
     The parameters you will most likely need to change are the ones which depend on the environment / sensors.
-    Those are: `input_shapes` and `output_shapes`.
+    Those are: `input_features` and `output_features`.
 
     Notes on the inputs and outputs:
         - "observation.state" is required as an input key.
@@ -48,21 +48,12 @@ class DiffusionConfig(PreTrainedConfig):
         horizon: Diffusion model action prediction size as detailed in `DiffusionPolicy.select_action`.
         n_action_steps: The number of action steps to run in the environment for one invocation of the policy.
            See `DiffusionPolicy.select_action` for more details.
-        input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents
-            the input data name, and the value is a list indicating the dimensions of the corresponding data.
-            For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96],
-            indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't
-            include batch dimension or temporal dimension.
-        output_shapes: A dictionary defining the shapes of the output data for the policy. The key represents
-            the output data name, and the value is a list indicating the dimensions of the corresponding data.
-            For example, "action" refers to an output shape of [14], indicating 14-dimensional actions.
-            Importantly, `output_shapes` doesn't include batch dimension or temporal dimension.
-        input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"),
-            and the value specifies the normalization mode to apply. The two available modes are "mean_std"
-            which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a
-            [-1, 1] range.
-        output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the
-            original scale. Note that this is also used for normalizing the training targets.
+        input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents
+            the input data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents
+            the output data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to
+            a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX)
         vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
         crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit
             within the image size. If None, no cropping is done.
@@ -73,7 +64,7 @@ class DiffusionConfig(PreTrainedConfig):
         use_group_norm: Whether to replace batch normalization with group normalization in the backbone.
             The group sizes are set to be about 16 (to be precise, feature_dim // 16).
         spatial_softmax_num_keypoints: Number of keypoints for SpatialSoftmax.
-        use_separate_rgb_encoders_per_camera: Whether to use a separate RGB encoder for each camera view.
+        use_separate_rgb_encoder_per_camera: Whether to use a separate RGB encoder for each camera view.
         down_dims: Feature dimension for each stage of temporal downsampling in the diffusion modeling Unet.
             You may provide a variable number of dimensions, therefore also controlling the degree of
             downsampling.
```
```diff
@@ -30,7 +30,7 @@ class TDMPCConfig(PreTrainedConfig):
     camera observations.
 
     The parameters you will most likely need to change are the ones which depend on the environment / sensors.
-    Those are: `input_shapes`, `output_shapes`, and perhaps `max_random_shift_ratio`.
+    Those are: `input_features`, `output_features`, and perhaps `max_random_shift_ratio`.
 
     Args:
         n_action_repeats: The number of times to repeat the action returned by the planning. (hint: Google
@@ -40,24 +40,12 @@ class TDMPCConfig(PreTrainedConfig):
            is an alternative to using action repeats. If this is set to more than 1, then we require
            `n_action_repeats == 1`, `use_mpc == True` and `n_action_steps <= horizon`. Note that this
            approach of using multiple steps from the plan is not in the original implementation.
-        input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents
-            the input data name, and the value is a list indicating the dimensions of the corresponding data.
-            For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96],
-            indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't
-            include batch dimension or temporal dimension.
-        output_shapes: A dictionary defining the shapes of the output data for the policy. The key represents
-            the output data name, and the value is a list indicating the dimensions of the corresponding data.
-            For example, "action" refers to an output shape of [14], indicating 14-dimensional actions.
-            Importantly, `output_shapes` doesn't include batch dimension or temporal dimension.
-        input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"),
-            and the value specifies the normalization mode to apply. The two available modes are "mean_std"
-            which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a
-            [-1, 1] range. Note that here this defaults to None meaning inputs are not normalized. This is to
-            match the original implementation.
-        output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the
-            original scale. Note that this is also used for normalizing the training targets. NOTE: Clipping
-            to [-1, +1] is used during MPPI/CEM. Therefore, it is recommended that you stick with "min_max"
-            normalization mode here.
+        input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents
+            the input data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents
+            the output data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to
+            a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX)
         image_encoder_hidden_dim: Number of channels for the convolutional layers used for image encoding.
         state_encoder_hidden_dim: Hidden dimension for MLP used for state vector encoding.
         latent_dim: Observation's latent embedding dimension.
```
```diff
@@ -32,7 +32,7 @@ class VQBeTConfig(PreTrainedConfig):
     Defaults are configured for training with PushT providing proprioceptive and single camera observations.
 
     The parameters you will most likely need to change are the ones which depend on the environment / sensors.
-    Those are: `input_shapes` and `output_shapes`.
+    Those are: `input_features` and `output_features`.
 
     Notes on the inputs and outputs:
         - "observation.state" is required as an input key.
@@ -46,21 +46,12 @@ class VQBeTConfig(PreTrainedConfig):
            current step and additional steps going back).
        n_action_pred_token: Total number of current token and future tokens that VQ-BeT predicts.
        action_chunk_size: Action chunk size of each action prediction token.
-        input_shapes: A dictionary defining the shapes of the input data for the policy.
-            The key represents the input data name, and the value is a list indicating the dimensions
-            of the corresponding data. For example, "observation.image" refers to an input from
-            a camera with dimensions [3, 96, 96], indicating it has three color channels and 96x96 resolution.
-            Importantly, shapes doesnt include batch dimension or temporal dimension.
-        output_shapes: A dictionary defining the shapes of the output data for the policy.
-            The key represents the output data name, and the value is a list indicating the dimensions
-            of the corresponding data. For example, "action" refers to an output shape of [14], indicating
-            14-dimensional actions. Importantly, shapes doesnt include batch dimension or temporal dimension.
-        input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"),
-            and the value specifies the normalization mode to apply. The two available modes are "mean_std"
-            which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a
-            [-1, 1] range.
-        output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the
-            original scale. Note that this is also used for normalizing the training targets.
+        input_features: A dictionary defining the PolicyFeature of the input data for the policy. The key represents
+            the input data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        output_features: A dictionary defining the PolicyFeature of the output data for the policy. The key represents
+            the output data name, and the value is PolicyFeature, which consists of FeatureType and shape attributes.
+        normalization_mapping: A dictionary that maps from a str value of FeatureType (e.g., "STATE", "VISUAL") to
+            a corresponding NormalizationMode (e.g., NormalizationMode.MIN_MAX)
         vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
         crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit
             within the image size. If None, no cropping is done.
```
```diff
@@ -17,7 +17,7 @@ from dataclasses import dataclass
 
 import torch
 
-from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature
+from lerobot.configs.types import PipelineFeatureType, PolicyFeature
 from lerobot.utils.constants import OBS_IMAGES, OBS_PREFIX, OBS_STATE, OBS_STR
 
 from .pipeline import ObservationProcessorStep, ProcessorStepRegistry
@@ -92,7 +92,7 @@ class LiberoProcessorStep(ObservationProcessorStep):
 
         # copy over non-STATE features
         for ft, feats in features.items():
-            if ft != FeatureType.STATE:
+            if ft != PipelineFeatureType.STATE:
                 new_features[ft] = feats.copy()
 
         # rebuild STATE features
@@ -100,11 +100,13 @@ class LiberoProcessorStep(ObservationProcessorStep):
 
         # add our new flattened state
         state_feats[OBS_STATE] = PolicyFeature(
-            type=FeatureType.STATE,
+            key=OBS_STATE,
             shape=(8,),  # [eef_pos(3), axis_angle(3), gripper(2)]
+            dtype="float32",
+            description=("Concatenated end-effector position (3), axis-angle (3), and gripper qpos (2)."),
         )
 
-        new_features[FeatureType.STATE] = state_feats
+        new_features[PipelineFeatureType.STATE] = state_feats
 
         return new_features
 
```
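Note the two separate changes here: the features dict handled by processor steps is now keyed by `PipelineFeatureType` (with `FeatureType` reserved for the features themselves), and the rebuilt state feature carries `key`, `dtype`, and `description` metadata. A hedged sketch of the resulting transform, with the extended `PolicyFeature` signature assumed from this diff:

```python
# Hedged sketch: enum members and keyword names are taken from the diff above;
# the full PolicyFeature signature in this branch is assumed.
from lerobot.configs.types import PipelineFeatureType, PolicyFeature

def rebuild_state_features(features: dict) -> dict:
    # keep every non-STATE bucket as-is
    new_features = {ft: feats.copy() for ft, feats in features.items() if ft != PipelineFeatureType.STATE}
    # replace the STATE bucket with a single flattened 8-dim feature
    new_features[PipelineFeatureType.STATE] = {
        "observation.state": PolicyFeature(
            key="observation.state",
            shape=(8,),  # eef_pos(3) + axis_angle(3) + gripper(2)
            dtype="float32",
            description="Concatenated end-effector position, axis-angle, and gripper qpos.",
        )
    }
    return new_features
```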
```diff
@@ -314,7 +314,7 @@ class TimeLimitProcessorStep(TruncatedProcessorStep):
 
 @dataclass
 @ProcessorStepRegistry.register("gripper_penalty_processor")
-class GripperPenaltyProcessorStep(ComplementaryDataProcessorStep):
+class GripperPenaltyProcessorStep(ProcessorStep):
     """
     Applies a penalty for inefficient gripper usage.
 
@@ -329,26 +329,27 @@ class GripperPenaltyProcessorStep(ComplementaryDataProcessorStep):
     penalty: float = -0.01
     max_gripper_pos: float = 30.0
 
-    def complementary_data(self, complementary_data: dict) -> dict:
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
         """
         Calculates the gripper penalty and adds it to the complementary data.
 
         Args:
-            complementary_data: The incoming complementary data, which should contain
-                raw joint positions.
+            transition: The incoming environment transition.
 
         Returns:
-            A new complementary data dictionary with the `discrete_penalty` key added.
+            The modified transition with the penalty added to complementary data.
         """
-        action = self.transition.get(TransitionKey.ACTION)
+        new_transition = transition.copy()
+        action = new_transition.get(TransitionKey.ACTION)
+        complementary_data = new_transition.get(TransitionKey.COMPLEMENTARY_DATA, {})
 
         raw_joint_positions = complementary_data.get("raw_joint_positions")
         if raw_joint_positions is None:
-            return complementary_data
+            return new_transition
 
         current_gripper_pos = raw_joint_positions.get(GRIPPER_KEY, None)
         if current_gripper_pos is None:
-            return complementary_data
+            return new_transition
 
         # Gripper action is a PolicyAction at this stage
         gripper_action = action[-1].item()
@@ -364,11 +365,12 @@ class GripperPenaltyProcessorStep(ComplementaryDataProcessorStep):
 
         gripper_penalty = self.penalty * int(gripper_penalty_bool)
 
-        # Create new complementary data with penalty info
+        # Update complementary data with penalty info
         new_complementary_data = dict(complementary_data)
         new_complementary_data[DISCRETE_PENALTY_KEY] = gripper_penalty
+        new_transition[TransitionKey.COMPLEMENTARY_DATA] = new_complementary_data
 
-        return new_complementary_data
+        return new_transition
 
     def get_config(self) -> dict[str, Any]:
         """
```
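After this refactor the step is no longer a `complementary_data` hook: it overrides `__call__`, transforms a whole `EnvTransition`, and returns a copy on early exits instead of a bare dict. A minimal usage sketch, assuming the `EnvTransition`/`TransitionKey` API shown in the diff:

```python
# Minimal usage sketch (assumed API): the step now composes like any other
# callable pipeline stage over EnvTransition.
step = GripperPenaltyProcessorStep(penalty=-0.01, max_gripper_pos=30.0)
new_transition = step(transition)  # copies the transition, never mutates in place
penalty = new_transition.get(TransitionKey.COMPLEMENTARY_DATA, {}).get(DISCRETE_PENALTY_KEY, 0.0)
```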
```diff
@@ -412,7 +412,10 @@ def make_processors(
     if cfg.processor.observation.add_current_to_observation:
         env_pipeline_steps.append(MotorCurrentProcessorStep(robot=env.robot))
 
-    if kinematics_solver is not None:
+    add_ee_pose = (
+        cfg.processor.observation is not None and cfg.processor.observation.add_ee_pose_to_observation
+    )
+    if kinematics_solver is not None and add_ee_pose:
         env_pipeline_steps.append(
             ForwardKinematicsJointsToEEObservation(
                 kinematics=kinematics_solver,
@@ -435,7 +438,12 @@ def make_processors(
     )
 
     # Add gripper penalty processor if gripper config exists and enabled
-    if cfg.processor.gripper is not None and cfg.processor.gripper.use_gripper:
+    # Only add if max_gripper_pos is explicitly configured (required for normalization)
+    if (
+        cfg.processor.gripper is not None
+        and cfg.processor.gripper.use_gripper
+        and cfg.processor.max_gripper_pos is not None
+    ):
         env_pipeline_steps.append(
             GripperPenaltyProcessorStep(
                 penalty=cfg.processor.gripper.gripper_penalty,
```
```diff
@@ -26,8 +26,21 @@ from lerobot.configs.train import TrainPipelineConfig
 from lerobot.utils.constants import PRETRAINED_MODEL_DIR
 
 
-def cfg_to_group(cfg: TrainPipelineConfig, return_list: bool = False) -> list[str] | str:
+def cfg_to_group(
+    cfg: TrainPipelineConfig, return_list: bool = False, truncate_tags: bool = False, max_tag_length: int = 64
+) -> list[str] | str:
     """Return a group name for logging. Optionally returns group name as list."""
+
+    def _maybe_truncate(tag: str) -> str:
+        """Truncate tag to max_tag_length characters if required.
+
+        wandb rejects tags longer than 64 characters.
+        See: https://github.com/wandb/wandb/blob/main/wandb/sdk/wandb_settings.py
+        """
+        if len(tag) <= max_tag_length:
+            return tag
+        return tag[:max_tag_length]
+
     lst = [
         f"policy:{cfg.policy.type}",
         f"seed:{cfg.seed}",
@@ -36,6 +49,8 @@ def cfg_to_group(cfg: TrainPipelineConfig, return_list: bool = False) -> list[st
         lst.append(f"dataset:{cfg.dataset.repo_id}")
     if cfg.env is not None:
         lst.append(f"env:{cfg.env.type}")
+    if truncate_tags:
+        lst = [_maybe_truncate(tag) for tag in lst]
     return lst if return_list else "-".join(lst)
 
 
@@ -83,7 +98,7 @@ class WandBLogger:
             entity=self.cfg.entity,
             name=self.job_name,
             notes=self.cfg.notes,
-            tags=cfg_to_group(cfg, return_list=True),
+            tags=cfg_to_group(cfg, return_list=True, truncate_tags=True),
             dir=self.log_dir,
             config=cfg.to_dict(),
             # TODO(rcadene): try set to True
```
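The helper itself is just a length-guarded slice. Standalone, the behavior added here is:

```python
# Standalone restatement of the _maybe_truncate logic from the diff: wandb
# rejects tags longer than 64 characters, so each tag is clipped to that limit.
def _maybe_truncate(tag: str, max_tag_length: int = 64) -> str:
    return tag if len(tag) <= max_tag_length else tag[:max_tag_length]

long_tag = "dataset:" + "x" * 100
assert len(_maybe_truncate(long_tag)) == 64
```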
```diff
@@ -184,6 +184,9 @@ class DatasetRecordConfig:
     vcodec: str = "libsvtav1"
     # Rename map for the observation to override the image and state keys
     rename_map: dict[str, str] = field(default_factory=dict)
+    # Expert noise injection scale. Noise is added to robot actions but not recorded in dataset.
+    # This forces recovery behavior for more robust learned policies. 0.0 means no noise. #https://arxiv.org/pdf/1703.09327, https://arxiv.org/abs/2507.09061
+    noise_scale: float = 0.0
 
     def __post_init__(self):
         if self.single_task is None:
@@ -283,6 +286,7 @@ def record_loop(
     single_task: str | None = None,
     display_data: bool = False,
     display_compressed_images: bool = False,
+    noise_scale: float = 0.0,
 ):
     if dataset is not None and dataset.fps != fps:
         raise ValueError(f"The dataset fps should be equal to requested fps ({dataset.fps} != {fps}).")
@@ -380,18 +384,27 @@ def record_loop(
             action_values = act_processed_teleop
             robot_action_to_send = robot_action_processor((act_processed_teleop, obs))
 
+        # Write clean action to dataset (before noise injection)
+        if dataset is not None:
+            action_frame = build_dataset_frame(dataset.features, action_values, prefix=ACTION)
+            frame = {**observation_frame, **action_frame, "task": single_task}
+            dataset.add_frame(frame)
+
+        # Expert noise injection: add noise to motor commands but not to recorded labels
+        if noise_scale > 0:
+            import torch
+
+            for key in robot_action_to_send:
+                if isinstance(robot_action_to_send[key], torch.Tensor):
+                    noise = torch.randn_like(robot_action_to_send[key]) * noise_scale
+                    robot_action_to_send[key] = robot_action_to_send[key] + noise
+
         # Send action to robot
         # Action can eventually be clipped using `max_relative_target`,
         # so action actually sent is saved in the dataset. action = postprocessor.process(action)
         # TODO(steven, pepijn, adil): we should use a pipeline step to clip the action, so the sent action is the action that we input to the robot.
         _sent_action = robot.send_action(robot_action_to_send)
 
-        # Write to dataset
-        if dataset is not None:
-            action_frame = build_dataset_frame(dataset.features, action_values, prefix=ACTION)
-            frame = {**observation_frame, **action_frame, "task": single_task}
-            dataset.add_frame(frame)
-
         if display_data:
             log_rerun_data(
                 observation=obs_processed, action=action_values, compress_images=display_compressed_images
@@ -510,6 +523,7 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
             single_task=cfg.dataset.single_task,
             display_data=cfg.display_data,
             display_compressed_images=display_compressed_images,
+            noise_scale=cfg.dataset.noise_scale,
         )
 
         # Execute a few seconds without recording to give time to manually reset the environment
```