mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-27 06:29:47 +00:00
feedback pr
This commit is contained in:
+1
-2
@@ -119,8 +119,7 @@ phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"]
|
||||
# ] # TODO: Currently not supported
|
||||
|
||||
# Policies
|
||||
pi0 = ["lerobot[transformers-dep]"]
|
||||
pi05 = ["lerobot[transformers-dep]"]
|
||||
pi = ["lerobot[transformers-dep]"]
|
||||
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"]
|
||||
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
|
||||
|
||||
|
||||
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model with open-world generalizatio
|
||||
|
||||
---
|
||||
|
||||
### ⚠️ WARNING ⚠️
|
||||
|
||||
This project requires **patching the Hugging Face `transformers` library**.
|
||||
|
||||
1. Make sure you have the exact version installed:
|
||||
|
||||
```bash
|
||||
pip show transformers
|
||||
```
|
||||
|
||||
It must be version **4.53.2**.
|
||||
|
||||
2. Apply the custom patches by copying the modified files into your environment:
|
||||
|
||||
```bash
|
||||
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
|
||||
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
|
||||
```
|
||||
|
||||
These patches overwrite parts of `transformers` to:
|
||||
- Support **AdaRMS** (adaptive RMS normalization),
|
||||
- Correctly control the precision of activations,
|
||||
- Allow the KV cache to be used without updates.
|
||||
|
||||
**Important:**
|
||||
|
||||
- This permanently modifies your `transformers` installation.
|
||||
- The changes survive reinstalls of `lerobot`; they remain until you reinstall `transformers` itself, explicitly remove the patched files, or recreate the environment.
|
||||
|
||||
To undo and restore a clean state:
|
||||
|
||||
```bash
|
||||
pip uninstall transformers
|
||||
pip install transformers==4.53.2
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Model Overview
|
||||
|
||||
| Feature | π₀ | π₀.₅ |
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import NormalizationMode
|
||||
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
|
||||
from lerobot.optim.optimizers import AdamWConfig
|
||||
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
||||
|
||||
@@ -43,12 +43,19 @@ class PI05OpenPIConfig(PreTrainedConfig):
|
||||
num_inference_steps: int = 10 # Number of denoising steps during inference
|
||||
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
|
||||
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
|
||||
time_sampling_scale: float = 0.999 # Scale factor for time sampling
|
||||
time_sampling_offset: float = 0.001 # Offset for time sampling
|
||||
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
|
||||
max_period: float = 4.0 # Max period for sinusoidal positional encoding
|
||||
|
||||
attention_mask_value: float = -2.3819763e38
|
||||
|
||||
# Image preprocessing
|
||||
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
|
||||
|
||||
# Add empty images. Used to add empty cameras when no image features are present.
|
||||
empty_cameras: int = 0
|
||||
|
||||
# Normalization
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
@@ -64,7 +71,7 @@ class PI05OpenPIConfig(PreTrainedConfig):
|
||||
compile_mode: str = "max-autotune" # Torch compile mode
|
||||
device: str | None = None # Device to use for the model (None = auto-detect)
|
||||
|
||||
# Optimizer settings: see openpi `AdamW` and
|
||||
# Optimizer settings: see openpi `AdamW`
|
||||
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
|
||||
optimizer_betas: tuple[float, float] = (0.9, 0.95)
|
||||
optimizer_eps: float = 1e-8
|
||||
@@ -98,12 +105,27 @@ class PI05OpenPIConfig(PreTrainedConfig):
|
||||
|
||||
def validate_features(self) -> None:
|
||||
"""Validate and set up input/output features."""
|
||||
# Image features are now handled dynamically through dataset configuration
|
||||
# No need to auto-add hardcoded image keys
|
||||
for i in range(self.empty_cameras):
|
||||
key = f"observation.images.empty_camera_{i}"
|
||||
empty_camera = PolicyFeature(
|
||||
type=FeatureType.VISUAL,
|
||||
shape=(3, *self.image_resolution), # Use configured image resolution
|
||||
)
|
||||
self.input_features[key] = empty_camera
|
||||
|
||||
# State and action features are also handled dynamically through dataset configuration
|
||||
# The actual dimensions come from the feature shapes, max dimensions are used for padding only
|
||||
pass
|
||||
if "observation.state" not in self.input_features:
|
||||
state_feature = PolicyFeature(
|
||||
type=FeatureType.STATE,
|
||||
shape=(self.max_state_dim,), # Will be padded to max_state_dim
|
||||
)
|
||||
self.input_features["observation.state"] = state_feature
|
||||
|
||||
if "action" not in self.output_features:
|
||||
action_feature = PolicyFeature(
|
||||
type=FeatureType.ACTION,
|
||||
shape=(self.max_action_dim,), # Will be padded to max_action_dim
|
||||
)
|
||||
self.output_features["action"] = action_feature
|
||||
|
||||
def get_optimizer_preset(self) -> AdamWConfig:
|
||||
return AdamWConfig(
|
||||
|
||||
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
||||
def _prepare_attention_masks_4d(self, att_2d_masks):
|
||||
"""Helper method to prepare 4D attention masks for transformer."""
|
||||
att_2d_masks_4d = att_2d_masks[:, None, :, :]
|
||||
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
|
||||
return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
|
||||
|
||||
def sample_noise(self, shape, device):
|
||||
return torch.normal(
|
||||
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
||||
time_beta = sample_beta(
|
||||
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
|
||||
)
|
||||
time = time_beta * 0.999 + 0.001
|
||||
time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
|
||||
return time.to(dtype=torch.float32, device=device)
|
||||
|
||||
def embed_prefix(
|
||||
@@ -661,6 +661,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
||||
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
|
||||
pad_masks.append(action_time_mask)
|
||||
|
||||
# Set attention masks so that image, language and state inputs do not attend to action tokens
|
||||
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
|
||||
|
||||
embs = torch.cat(embs, dim=1)
|
||||
|
||||
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model for general robot control**.
|
||||
|
||||
---
|
||||
|
||||
### ⚠️ WARNING ⚠️
|
||||
|
||||
This project requires **patching the Hugging Face `transformers` library**.
|
||||
|
||||
1. Make sure you have the exact version installed:
|
||||
|
||||
```bash
|
||||
pip show transformers
|
||||
```
|
||||
|
||||
It must be version **4.53.2**.
|
||||
|
||||
2. Apply the custom patches by copying the modified files into your environment:
|
||||
|
||||
```bash
|
||||
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
|
||||
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
|
||||
```
|
||||
|
||||
These patches overwrite parts of `transformers` to:
|
||||
- Support **AdaRMS** (adaptive RMS normalization),
|
||||
- Correctly control the precision of activations,
|
||||
- Allow the KV cache to be used without updates.
|
||||
|
||||
**Important:**
|
||||
|
||||
- This permanently modifies your `transformers` installation.
|
||||
- The changes survive reinstalls of `lerobot`; they remain until you reinstall `transformers` itself, explicitly remove the patched files, or recreate the environment.
|
||||
|
||||
To undo and restore a clean state:
|
||||
|
||||
```bash
|
||||
pip uninstall transformers
|
||||
pip install transformers==4.53.2
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Model Overview
|
||||
|
||||
| Feature | π₀ | π₀.₅ |
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.configs.types import NormalizationMode
|
||||
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
|
||||
from lerobot.optim.optimizers import AdamWConfig
|
||||
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
||||
|
||||
@@ -43,12 +43,19 @@ class PI0OpenPIConfig(PreTrainedConfig):
|
||||
num_inference_steps: int = 10 # Number of denoising steps during inference
|
||||
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
|
||||
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
|
||||
time_sampling_scale: float = 0.999 # Scale factor for time sampling
|
||||
time_sampling_offset: float = 0.001 # Offset for time sampling
|
||||
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
|
||||
max_period: float = 4.0 # Max period for sinusoidal positional encoding
|
||||
|
||||
attention_mask_value: float = -2.3819763e38
|
||||
|
||||
# Image preprocessing
|
||||
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
|
||||
|
||||
# Add empty images. Used to add empty cameras when no image features are present.
|
||||
empty_cameras: int = 0
|
||||
|
||||
# Normalization
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
@@ -64,7 +71,7 @@ class PI0OpenPIConfig(PreTrainedConfig):
|
||||
compile_mode: str = "max-autotune" # Torch compile mode
|
||||
device: str | None = None # Device to use for the model (None = auto-detect)
|
||||
|
||||
# Optimizer settings: see openpi `AdamW` and
|
||||
# Optimizer settings: see openpi `AdamW`
|
||||
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
|
||||
optimizer_betas: tuple[float, float] = (0.9, 0.95)
|
||||
optimizer_eps: float = 1e-8
|
||||
@@ -98,12 +105,27 @@ class PI0OpenPIConfig(PreTrainedConfig):
|
||||
|
||||
def validate_features(self) -> None:
|
||||
"""Validate and set up input/output features."""
|
||||
# Image features are now handled dynamically through dataset configuration
|
||||
# No need to auto-add hardcoded image keys
|
||||
for i in range(self.empty_cameras):
|
||||
key = f"observation.images.empty_camera_{i}"
|
||||
empty_camera = PolicyFeature(
|
||||
type=FeatureType.VISUAL,
|
||||
shape=(3, *self.image_resolution), # Use configured image resolution
|
||||
)
|
||||
self.input_features[key] = empty_camera
|
||||
|
||||
# State and action features are also handled dynamically through dataset configuration
|
||||
# The actual dimensions come from the feature shapes, max dimensions are used for padding only
|
||||
pass
|
||||
if "observation.state" not in self.input_features:
|
||||
state_feature = PolicyFeature(
|
||||
type=FeatureType.STATE,
|
||||
shape=(self.max_state_dim,), # Will be padded to max_state_dim
|
||||
)
|
||||
self.input_features["observation.state"] = state_feature
|
||||
|
||||
if "action" not in self.output_features:
|
||||
action_feature = PolicyFeature(
|
||||
type=FeatureType.ACTION,
|
||||
shape=(self.max_action_dim,), # Will be padded to max_action_dim
|
||||
)
|
||||
self.output_features["action"] = action_feature
|
||||
|
||||
def get_optimizer_preset(self) -> AdamWConfig:
|
||||
return AdamWConfig(
|
||||
|
||||
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
||||
def _prepare_attention_masks_4d(self, att_2d_masks):
|
||||
"""Helper method to prepare 4D attention masks for transformer."""
|
||||
att_2d_masks_4d = att_2d_masks[:, None, :, :]
|
||||
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
|
||||
return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
|
||||
|
||||
def sample_noise(self, shape, device):
|
||||
return torch.normal(
|
||||
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
||||
time_beta = sample_beta(
|
||||
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
|
||||
)
|
||||
time = time_beta * 0.999 + 0.001
|
||||
time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
|
||||
return time.to(dtype=torch.float32, device=device)
|
||||
|
||||
def embed_prefix(
|
||||
@@ -677,6 +677,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
||||
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
|
||||
pad_masks.append(action_time_mask)
|
||||
|
||||
# Set attention masks so that image, language and state inputs do not attend to action tokens
|
||||
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
|
||||
|
||||
embs = torch.cat(embs, dim=1)
|
||||
|
||||
Reference in New Issue
Block a user