feedback pr

This commit is contained in:
Pepijn
2025-09-22 10:26:49 +02:00
parent 8951e6034f
commit 8ac060124d
7 changed files with 65 additions and 96 deletions
+1 -2
View File
@@ -119,8 +119,7 @@ phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"]
# ] # TODO: Currently not supported
# Policies
pi0 = ["lerobot[transformers-dep]"]
pi05 = ["lerobot[transformers-dep]"]
pi = ["lerobot[transformers-dep]"]
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"]
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model with open-world generalizatio
---
### ⚠️ WARNING ⚠️
This project requires **patching the Hugging Face `transformers` library**.
1. Make sure you have the exact version installed:
```bash
pip show transformers
```
It must be version **4.53.2**.
2. Apply the custom patches by copying the modified files into your environment:
```bash
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
```
These patches overwrite parts of `transformers` to:
- Support **AdaRMS** (adaptive RMS normalization),
- Correctly control the precision of activations,
- Allow the KV cache to be used without updates.
**Important:**
- This permanently modifies your `transformers` installation.
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
To undo and restore a clean state:
```bash
pip uninstall transformers
pip install transformers==4.53.2
```
---
## Model Overview
| Feature | π₀ | π₀.₅ |
@@ -17,7 +17,7 @@
from dataclasses import dataclass, field
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import NormalizationMode
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.optim.optimizers import AdamWConfig
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
@@ -43,12 +43,19 @@ class PI05OpenPIConfig(PreTrainedConfig):
num_inference_steps: int = 10 # Number of denoising steps during inference
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
time_sampling_scale: float = 0.999 # Scale factor for time sampling
time_sampling_offset: float = 0.001 # Offset for time sampling
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
max_period: float = 4.0 # Max period for sinusoidal positional encoding
attention_mask_value: float = -2.3819763e38
# Image preprocessing
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
# Number of empty placeholder cameras to add as image inputs, e.g. when no image features are present.
empty_cameras: int = 0
# Normalization
normalization_mapping: dict[str, NormalizationMode] = field(
default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI05OpenPIConfig(PreTrainedConfig):
compile_mode: str = "max-autotune" # Torch compile mode
device: str | None = None # Device to use for the model (None = auto-detect)
# Optimizer settings: see openpi `AdamW` and
# Optimizer settings: see openpi `AdamW`
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
optimizer_betas: tuple[float, float] = (0.9, 0.95)
optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI05OpenPIConfig(PreTrainedConfig):
def validate_features(self) -> None:
"""Validate and set up input/output features."""
# Image features are now handled dynamically through dataset configuration
# No need to auto-add hardcoded image keys
for i in range(self.empty_cameras):
key = f"observation.images.empty_camera_{i}"
empty_camera = PolicyFeature(
type=FeatureType.VISUAL,
shape=(3, *self.image_resolution), # Use configured image resolution
)
self.input_features[key] = empty_camera
# State and action features are also handled dynamically through dataset configuration
# The actual dimensions come from the feature shapes, max dimensions are used for padding only
pass
if "observation.state" not in self.input_features:
state_feature = PolicyFeature(
type=FeatureType.STATE,
shape=(self.max_state_dim,), # Will be padded to max_state_dim
)
self.input_features["observation.state"] = state_feature
if "action" not in self.output_features:
action_feature = PolicyFeature(
type=FeatureType.ACTION,
shape=(self.max_action_dim,), # Will be padded to max_action_dim
)
self.output_features["action"] = action_feature
def get_optimizer_preset(self) -> AdamWConfig:
return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
def _prepare_attention_masks_4d(self, att_2d_masks):
"""Helper method to prepare 4D attention masks for transformer."""
att_2d_masks_4d = att_2d_masks[:, None, :, :]
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
def sample_noise(self, shape, device):
return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
time_beta = sample_beta(
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
)
time = time_beta * 0.999 + 0.001
time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
return time.to(dtype=torch.float32, device=device)
def embed_prefix(
@@ -661,6 +661,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
pad_masks.append(action_time_mask)
# Set attention masks so that image, language and state inputs do not attend to action tokens
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
embs = torch.cat(embs, dim=1)
-38
View File
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model for general robot control**.
---
### ⚠️ WARNING ⚠️
This project requires **patching the Hugging Face `transformers` library**.
1. Make sure you have the exact version installed:
```bash
pip show transformers
```
It must be version **4.53.2**.
2. Apply the custom patches by copying the modified files into your environment:
```bash
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
```
These patches overwrite parts of `transformers` to:
- Support **AdaRMS** (adaptive RMS normalization),
- Correctly control the precision of activations,
- Allow the KV cache to be used without updates.
**Important:**
- This permanently modifies your `transformers` installation.
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
To undo and restore a clean state:
```bash
pip uninstall transformers
pip install transformers==4.53.2
```
---
## Model Overview
| Feature | π₀ | π₀.₅ |
@@ -17,7 +17,7 @@
from dataclasses import dataclass, field
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import NormalizationMode
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.optim.optimizers import AdamWConfig
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
@@ -43,12 +43,19 @@ class PI0OpenPIConfig(PreTrainedConfig):
num_inference_steps: int = 10 # Number of denoising steps during inference
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
time_sampling_scale: float = 0.999 # Scale factor for time sampling
time_sampling_offset: float = 0.001 # Offset for time sampling
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
max_period: float = 4.0 # Max period for sinusoidal positional encoding
attention_mask_value: float = -2.3819763e38
# Image preprocessing
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
# Number of empty placeholder cameras to add as image inputs, e.g. when no image features are present.
empty_cameras: int = 0
# Normalization
normalization_mapping: dict[str, NormalizationMode] = field(
default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI0OpenPIConfig(PreTrainedConfig):
compile_mode: str = "max-autotune" # Torch compile mode
device: str | None = None # Device to use for the model (None = auto-detect)
# Optimizer settings: see openpi `AdamW` and
# Optimizer settings: see openpi `AdamW`
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
optimizer_betas: tuple[float, float] = (0.9, 0.95)
optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI0OpenPIConfig(PreTrainedConfig):
def validate_features(self) -> None:
"""Validate and set up input/output features."""
# Image features are now handled dynamically through dataset configuration
# No need to auto-add hardcoded image keys
for i in range(self.empty_cameras):
key = f"observation.images.empty_camera_{i}"
empty_camera = PolicyFeature(
type=FeatureType.VISUAL,
shape=(3, *self.image_resolution), # Use configured image resolution
)
self.input_features[key] = empty_camera
# State and action features are also handled dynamically through dataset configuration
# The actual dimensions come from the feature shapes, max dimensions are used for padding only
pass
if "observation.state" not in self.input_features:
state_feature = PolicyFeature(
type=FeatureType.STATE,
shape=(self.max_state_dim,), # Will be padded to max_state_dim
)
self.input_features["observation.state"] = state_feature
if "action" not in self.output_features:
action_feature = PolicyFeature(
type=FeatureType.ACTION,
shape=(self.max_action_dim,), # Will be padded to max_action_dim
)
self.output_features["action"] = action_feature
def get_optimizer_preset(self) -> AdamWConfig:
return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
def _prepare_attention_masks_4d(self, att_2d_masks):
"""Helper method to prepare 4D attention masks for transformer."""
att_2d_masks_4d = att_2d_masks[:, None, :, :]
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
def sample_noise(self, shape, device):
return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
time_beta = sample_beta(
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
)
time = time_beta * 0.999 + 0.001
time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
return time.to(dtype=torch.float32, device=device)
def embed_prefix(
@@ -677,6 +677,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
pad_masks.append(action_time_mask)
# Set attention masks so that image, language and state inputs do not attend to action tokens
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
embs = torch.cat(embs, dim=1)