diff --git a/pyproject.toml b/pyproject.toml index 6339d5607..98ccc7b9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,8 +119,7 @@ phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"] # ] # TODO: Currently not supported # Policies -pi0 = ["lerobot[transformers-dep]"] -pi05 = ["lerobot[transformers-dep]"] +pi = ["lerobot[transformers-dep]"] smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"] hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] diff --git a/src/lerobot/policies/pi05_openpi/README.md b/src/lerobot/policies/pi05_openpi/README.md index e4b2f5460..2ae69d978 100644 --- a/src/lerobot/policies/pi05_openpi/README.md +++ b/src/lerobot/policies/pi05_openpi/README.md @@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model with open-world generalizatio --- -### ⚠️ WARNING ⚠️ - -This project requires **patching the Hugging Face `transformers` library**. - -1. Make sure you have the exact version installed: - -```bash - pip show transformers -``` - -It must be version **4.53.2**. - -2. Apply the custom patches by copying the modified files into your environment: - - ```bash - cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \ - $(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))") - ``` - - These patches overwrite parts of `transformers` to: - - Support the **AdaRMS optimizer**, - - Correctly control the precision of activations, - - Allow the KV cache to be used without updates. - -**Important:** - -- This permanently modifies your `transformers` installation. -- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment. 
- -To undo and restore a clean state: - -```bash -pip uninstall transformers -pip install transformers==4.53.2 -``` - ---- - ## Model Overview | Feature | π₀ | π₀.₅ | diff --git a/src/lerobot/policies/pi05_openpi/configuration_pi05openpi.py b/src/lerobot/policies/pi05_openpi/configuration_pi05openpi.py index 8f0ca3fd7..3b8b4779e 100644 --- a/src/lerobot/policies/pi05_openpi/configuration_pi05openpi.py +++ b/src/lerobot/policies/pi05_openpi/configuration_pi05openpi.py @@ -17,7 +17,7 @@ from dataclasses import dataclass, field from lerobot.configs.policies import PreTrainedConfig -from lerobot.configs.types import NormalizationMode +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig @@ -43,12 +43,19 @@ class PI05OpenPIConfig(PreTrainedConfig): num_inference_steps: int = 10 # Number of denoising steps during inference time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling + time_sampling_scale: float = 0.999 # Scale factor for time sampling + time_sampling_offset: float = 0.001 # Offset for time sampling min_period: float = 4e-3 # Min period for sinusoidal positional encoding max_period: float = 4.0 # Max period for sinusoidal positional encoding + attention_mask_value: float = -2.3819763e38 + # Image preprocessing image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py` + # Add empty images. Used to add empty cameras when no image features are present. 
+ empty_cameras: int = 0 + # Normalization normalization_mapping: dict[str, NormalizationMode] = field( default_factory=lambda: { @@ -64,7 +71,7 @@ class PI05OpenPIConfig(PreTrainedConfig): compile_mode: str = "max-autotune" # Torch compile mode device: str | None = None # Device to use for the model (None = auto-detect) - # Optimizer settings: see openpi `AdamW` and + # Optimizer settings: see openpi `AdamW` optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr` optimizer_betas: tuple[float, float] = (0.9, 0.95) optimizer_eps: float = 1e-8 @@ -98,12 +105,27 @@ class PI05OpenPIConfig(PreTrainedConfig): def validate_features(self) -> None: """Validate and set up input/output features.""" - # Image features are now handled dynamically through dataset configuration - # No need to auto-add hardcoded image keys + for i in range(self.empty_cameras): + key = f"observation.images.empty_camera_{i}" + empty_camera = PolicyFeature( + type=FeatureType.VISUAL, + shape=(3, *self.image_resolution), # Use configured image resolution + ) + self.input_features[key] = empty_camera - # State and action features are also handled dynamically through dataset configuration - # The actual dimensions come from the feature shapes, max dimensions are used for padding only - pass + if "observation.state" not in self.input_features: + state_feature = PolicyFeature( + type=FeatureType.STATE, + shape=(self.max_state_dim,), # Will be padded to max_state_dim + ) + self.input_features["observation.state"] = state_feature + + if "action" not in self.output_features: + action_feature = PolicyFeature( + type=FeatureType.ACTION, + shape=(self.max_action_dim,), # Will be padded to max_action_dim + ) + self.output_features["action"] = action_feature def get_optimizer_preset(self) -> AdamWConfig: return AdamWConfig( diff --git a/src/lerobot/policies/pi05_openpi/modeling_pi05openpi.py b/src/lerobot/policies/pi05_openpi/modeling_pi05openpi.py index 6780fdfc3..6b6d328e1 100644 --- 
a/src/lerobot/policies/pi05_openpi/modeling_pi05openpi.py +++ b/src/lerobot/policies/pi05_openpi/modeling_pi05openpi.py @@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_ def _prepare_attention_masks_4d(self, att_2d_masks): """Helper method to prepare 4D attention masks for transformer.""" att_2d_masks_4d = att_2d_masks[:, None, :, :] - return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38) + return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value) def sample_noise(self, shape, device): return torch.normal( @@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_ time_beta = sample_beta( self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device ) - time = time_beta * 0.999 + 0.001 + time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset return time.to(dtype=torch.float32, device=device) def embed_prefix( @@ -661,6 +661,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_ action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device) pad_masks.append(action_time_mask) + # Set attention masks so that image, language and state inputs do not attend to action tokens att_masks += [1] + ([0] * (self.config.chunk_size - 1)) embs = torch.cat(embs, dim=1) diff --git a/src/lerobot/policies/pi0_openpi/README.md b/src/lerobot/policies/pi0_openpi/README.md index 791a7e5bd..65b331e51 100644 --- a/src/lerobot/policies/pi0_openpi/README.md +++ b/src/lerobot/policies/pi0_openpi/README.md @@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model for general robot control**. --- -### ⚠️ WARNING ⚠️ - -This project requires **patching the Hugging Face `transformers` library**. - -1. Make sure you have the exact version installed: - -```bash - pip show transformers -``` - -It must be version **4.53.2**. - -2. 
Apply the custom patches by copying the modified files into your environment: - - ```bash - cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \ - $(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))") - ``` - - These patches overwrite parts of `transformers` to: - - Support the **AdaRMS optimizer**, - - Correctly control the precision of activations, - - Allow the KV cache to be used without updates. - -**Important:** - -- This permanently modifies your `transformers` installation. -- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment. - -To undo and restore a clean state: - -```bash -pip uninstall transformers -pip install transformers==4.53.2 -``` - ---- - ## Model Overview | Feature | π₀ | π₀.₅ | diff --git a/src/lerobot/policies/pi0_openpi/configuration_pi0openpi.py b/src/lerobot/policies/pi0_openpi/configuration_pi0openpi.py index 4c6a19909..0bef9c5a1 100644 --- a/src/lerobot/policies/pi0_openpi/configuration_pi0openpi.py +++ b/src/lerobot/policies/pi0_openpi/configuration_pi0openpi.py @@ -17,7 +17,7 @@ from dataclasses import dataclass, field from lerobot.configs.policies import PreTrainedConfig -from lerobot.configs.types import NormalizationMode +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig @@ -43,12 +43,19 @@ class PI0OpenPIConfig(PreTrainedConfig): num_inference_steps: int = 10 # Number of denoising steps during inference time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling + time_sampling_scale: float = 0.999 # Scale factor for time sampling + time_sampling_offset: float = 0.001 # Offset for time sampling min_period: float = 4e-3 # Min period for sinusoidal positional 
encoding max_period: float = 4.0 # Max period for sinusoidal positional encoding + attention_mask_value: float = -2.3819763e38 + # Image preprocessing image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py` + # Add empty images. Used to add empty cameras when no image features are present. + empty_cameras: int = 0 + # Normalization normalization_mapping: dict[str, NormalizationMode] = field( default_factory=lambda: { @@ -64,7 +71,7 @@ class PI0OpenPIConfig(PreTrainedConfig): compile_mode: str = "max-autotune" # Torch compile mode device: str | None = None # Device to use for the model (None = auto-detect) - # Optimizer settings: see openpi `AdamW` and + # Optimizer settings: see openpi `AdamW` optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr` optimizer_betas: tuple[float, float] = (0.9, 0.95) optimizer_eps: float = 1e-8 @@ -98,12 +105,27 @@ class PI0OpenPIConfig(PreTrainedConfig): def validate_features(self) -> None: """Validate and set up input/output features.""" - # Image features are now handled dynamically through dataset configuration - # No need to auto-add hardcoded image keys + for i in range(self.empty_cameras): + key = f"observation.images.empty_camera_{i}" + empty_camera = PolicyFeature( + type=FeatureType.VISUAL, + shape=(3, *self.image_resolution), # Use configured image resolution + ) + self.input_features[key] = empty_camera - # State and action features are also handled dynamically through dataset configuration - # The actual dimensions come from the feature shapes, max dimensions are used for padding only - pass + if "observation.state" not in self.input_features: + state_feature = PolicyFeature( + type=FeatureType.STATE, + shape=(self.max_state_dim,), # Will be padded to max_state_dim + ) + self.input_features["observation.state"] = state_feature + + if "action" not in self.output_features: + action_feature = PolicyFeature( + type=FeatureType.ACTION, + shape=(self.max_action_dim,), # Will be 
padded to max_action_dim + ) + self.output_features["action"] = action_feature def get_optimizer_preset(self) -> AdamWConfig: return AdamWConfig( diff --git a/src/lerobot/policies/pi0_openpi/modeling_pi0openpi.py b/src/lerobot/policies/pi0_openpi/modeling_pi0openpi.py index e493a9a2a..4db15a3d3 100644 --- a/src/lerobot/policies/pi0_openpi/modeling_pi0openpi.py +++ b/src/lerobot/policies/pi0_openpi/modeling_pi0openpi.py @@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_ def _prepare_attention_masks_4d(self, att_2d_masks): """Helper method to prepare 4D attention masks for transformer.""" att_2d_masks_4d = att_2d_masks[:, None, :, :] - return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38) + return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value) def sample_noise(self, shape, device): return torch.normal( @@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_ time_beta = sample_beta( self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device ) - time = time_beta * 0.999 + 0.001 + time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset return time.to(dtype=torch.float32, device=device) def embed_prefix( @@ -677,6 +677,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_ action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device) pad_masks.append(action_time_mask) + # Set attention masks so that image, language and state inputs do not attend to action tokens att_masks += [1] + ([0] * (self.config.chunk_size - 1)) embs = torch.cat(embs, dim=1)