feedback pr

This commit is contained in:
Pepijn
2025-09-22 10:26:49 +02:00
parent 8951e6034f
commit 8ac060124d
7 changed files with 65 additions and 96 deletions
+1 -2
View File
@@ -119,8 +119,7 @@ phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"]
# ] # TODO: Currently not supported # ] # TODO: Currently not supported
# Policies # Policies
pi0 = ["lerobot[transformers-dep]"] pi = ["lerobot[transformers-dep]"]
pi05 = ["lerobot[transformers-dep]"]
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"] smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"]
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model with open-world generalizatio
--- ---
### ⚠️ WARNING ⚠️
This project requires **patching the Hugging Face `transformers` library**.
1. Make sure you have the exact version installed:
```bash
pip show transformers
```
It must be version **4.53.2**.
2. Apply the custom patches by copying the modified files into your environment:
```bash
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
```
These patches overwrite parts of `transformers` to:
- Support **AdaRMS** (adaptive RMS normalization),
- Correctly control the precision of activations,
- Allow the KV cache to be used without updates.
**Important:**
- This permanently modifies your `transformers` installation.
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
To undo and restore a clean state:
```bash
pip uninstall transformers
pip install transformers==4.53.2
```
---
## Model Overview ## Model Overview
| Feature | π₀ | π₀.₅ | | Feature | π₀ | π₀.₅ |
@@ -17,7 +17,7 @@
from dataclasses import dataclass, field from dataclasses import dataclass, field
from lerobot.configs.policies import PreTrainedConfig from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import NormalizationMode from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.optimizers import AdamWConfig
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
@@ -43,12 +43,19 @@ class PI05OpenPIConfig(PreTrainedConfig):
num_inference_steps: int = 10 # Number of denoising steps during inference num_inference_steps: int = 10 # Number of denoising steps during inference
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
time_sampling_scale: float = 0.999 # Scale factor for time sampling
time_sampling_offset: float = 0.001 # Offset for time sampling
min_period: float = 4e-3 # Min period for sinusoidal positional encoding min_period: float = 4e-3 # Min period for sinusoidal positional encoding
max_period: float = 4.0 # Max period for sinusoidal positional encoding max_period: float = 4.0 # Max period for sinusoidal positional encoding
attention_mask_value: float = -2.3819763e38
# Image preprocessing # Image preprocessing
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py` image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
# Add empty images. Used to add empty cameras when no image features are present.
empty_cameras: int = 0
# Normalization # Normalization
normalization_mapping: dict[str, NormalizationMode] = field( normalization_mapping: dict[str, NormalizationMode] = field(
default_factory=lambda: { default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI05OpenPIConfig(PreTrainedConfig):
compile_mode: str = "max-autotune" # Torch compile mode compile_mode: str = "max-autotune" # Torch compile mode
device: str | None = None # Device to use for the model (None = auto-detect) device: str | None = None # Device to use for the model (None = auto-detect)
# Optimizer settings: see openpi `AdamW` and # Optimizer settings: see openpi `AdamW`
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr` optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
optimizer_betas: tuple[float, float] = (0.9, 0.95) optimizer_betas: tuple[float, float] = (0.9, 0.95)
optimizer_eps: float = 1e-8 optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI05OpenPIConfig(PreTrainedConfig):
def validate_features(self) -> None: def validate_features(self) -> None:
"""Validate and set up input/output features.""" """Validate and set up input/output features."""
# Image features are now handled dynamically through dataset configuration for i in range(self.empty_cameras):
# No need to auto-add hardcoded image keys key = f"observation.images.empty_camera_{i}"
empty_camera = PolicyFeature(
type=FeatureType.VISUAL,
shape=(3, *self.image_resolution), # Use configured image resolution
)
self.input_features[key] = empty_camera
# State and action features are also handled dynamically through dataset configuration if "observation.state" not in self.input_features:
# The actual dimensions come from the feature shapes, max dimensions are used for padding only state_feature = PolicyFeature(
pass type=FeatureType.STATE,
shape=(self.max_state_dim,), # Will be padded to max_state_dim
)
self.input_features["observation.state"] = state_feature
if "action" not in self.output_features:
action_feature = PolicyFeature(
type=FeatureType.ACTION,
shape=(self.max_action_dim,), # Will be padded to max_action_dim
)
self.output_features["action"] = action_feature
def get_optimizer_preset(self) -> AdamWConfig: def get_optimizer_preset(self) -> AdamWConfig:
return AdamWConfig( return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
def _prepare_attention_masks_4d(self, att_2d_masks): def _prepare_attention_masks_4d(self, att_2d_masks):
"""Helper method to prepare 4D attention masks for transformer.""" """Helper method to prepare 4D attention masks for transformer."""
att_2d_masks_4d = att_2d_masks[:, None, :, :] att_2d_masks_4d = att_2d_masks[:, None, :, :]
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38) return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
def sample_noise(self, shape, device): def sample_noise(self, shape, device):
return torch.normal( return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
time_beta = sample_beta( time_beta = sample_beta(
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
) )
time = time_beta * 0.999 + 0.001 time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
return time.to(dtype=torch.float32, device=device) return time.to(dtype=torch.float32, device=device)
def embed_prefix( def embed_prefix(
@@ -661,6 +661,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device) action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
pad_masks.append(action_time_mask) pad_masks.append(action_time_mask)
# Set attention masks so that image, language and state inputs do not attend to action tokens
att_masks += [1] + ([0] * (self.config.chunk_size - 1)) att_masks += [1] + ([0] * (self.config.chunk_size - 1))
embs = torch.cat(embs, dim=1) embs = torch.cat(embs, dim=1)
-38
View File
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model for general robot control**.
--- ---
### ⚠️ WARNING ⚠️
This project requires **patching the Hugging Face `transformers` library**.
1. Make sure you have the exact version installed:
```bash
pip show transformers
```
It must be version **4.53.2**.
2. Apply the custom patches by copying the modified files into your environment:
```bash
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
```
These patches overwrite parts of `transformers` to:
- Support **AdaRMS** (adaptive RMS normalization),
- Correctly control the precision of activations,
- Allow the KV cache to be used without updates.
**Important:**
- This permanently modifies your `transformers` installation.
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
To undo and restore a clean state:
```bash
pip uninstall transformers
pip install transformers==4.53.2
```
---
## Model Overview ## Model Overview
| Feature | π₀ | π₀.₅ | | Feature | π₀ | π₀.₅ |
@@ -17,7 +17,7 @@
from dataclasses import dataclass, field from dataclasses import dataclass, field
from lerobot.configs.policies import PreTrainedConfig from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import NormalizationMode from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.optimizers import AdamWConfig
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
@@ -43,12 +43,19 @@ class PI0OpenPIConfig(PreTrainedConfig):
num_inference_steps: int = 10 # Number of denoising steps during inference num_inference_steps: int = 10 # Number of denoising steps during inference
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
time_sampling_scale: float = 0.999 # Scale factor for time sampling
time_sampling_offset: float = 0.001 # Offset for time sampling
min_period: float = 4e-3 # Min period for sinusoidal positional encoding min_period: float = 4e-3 # Min period for sinusoidal positional encoding
max_period: float = 4.0 # Max period for sinusoidal positional encoding max_period: float = 4.0 # Max period for sinusoidal positional encoding
attention_mask_value: float = -2.3819763e38
# Image preprocessing # Image preprocessing
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py` image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
# Add empty images. Used to add empty cameras when no image features are present.
empty_cameras: int = 0
# Normalization # Normalization
normalization_mapping: dict[str, NormalizationMode] = field( normalization_mapping: dict[str, NormalizationMode] = field(
default_factory=lambda: { default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI0OpenPIConfig(PreTrainedConfig):
compile_mode: str = "max-autotune" # Torch compile mode compile_mode: str = "max-autotune" # Torch compile mode
device: str | None = None # Device to use for the model (None = auto-detect) device: str | None = None # Device to use for the model (None = auto-detect)
# Optimizer settings: see openpi `AdamW` and # Optimizer settings: see openpi `AdamW`
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr` optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
optimizer_betas: tuple[float, float] = (0.9, 0.95) optimizer_betas: tuple[float, float] = (0.9, 0.95)
optimizer_eps: float = 1e-8 optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI0OpenPIConfig(PreTrainedConfig):
def validate_features(self) -> None: def validate_features(self) -> None:
"""Validate and set up input/output features.""" """Validate and set up input/output features."""
# Image features are now handled dynamically through dataset configuration for i in range(self.empty_cameras):
# No need to auto-add hardcoded image keys key = f"observation.images.empty_camera_{i}"
empty_camera = PolicyFeature(
type=FeatureType.VISUAL,
shape=(3, *self.image_resolution), # Use configured image resolution
)
self.input_features[key] = empty_camera
# State and action features are also handled dynamically through dataset configuration if "observation.state" not in self.input_features:
# The actual dimensions come from the feature shapes, max dimensions are used for padding only state_feature = PolicyFeature(
pass type=FeatureType.STATE,
shape=(self.max_state_dim,), # Will be padded to max_state_dim
)
self.input_features["observation.state"] = state_feature
if "action" not in self.output_features:
action_feature = PolicyFeature(
type=FeatureType.ACTION,
shape=(self.max_action_dim,), # Will be padded to max_action_dim
)
self.output_features["action"] = action_feature
def get_optimizer_preset(self) -> AdamWConfig: def get_optimizer_preset(self) -> AdamWConfig:
return AdamWConfig( return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
def _prepare_attention_masks_4d(self, att_2d_masks): def _prepare_attention_masks_4d(self, att_2d_masks):
"""Helper method to prepare 4D attention masks for transformer.""" """Helper method to prepare 4D attention masks for transformer."""
att_2d_masks_4d = att_2d_masks[:, None, :, :] att_2d_masks_4d = att_2d_masks[:, None, :, :]
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38) return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
def sample_noise(self, shape, device): def sample_noise(self, shape, device):
return torch.normal( return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
time_beta = sample_beta( time_beta = sample_beta(
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
) )
time = time_beta * 0.999 + 0.001 time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
return time.to(dtype=torch.float32, device=device) return time.to(dtype=torch.float32, device=device)
def embed_prefix( def embed_prefix(
@@ -677,6 +677,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device) action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
pad_masks.append(action_time_mask) pad_masks.append(action_time_mask)
# Set attention masks so that image, language and state inputs do not attend to action tokens
att_masks += [1] + ([0] * (self.config.chunk_size - 1)) att_masks += [1] + ([0] * (self.config.chunk_size - 1))
embs = torch.cat(embs, dim=1) embs = torch.cat(embs, dim=1)