mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-24 13:09:43 +00:00
feedback pr
This commit is contained in:
+1
-2
@@ -119,8 +119,7 @@ phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"]
|
|||||||
# ] # TODO: Currently not supported
|
# ] # TODO: Currently not supported
|
||||||
|
|
||||||
# Policies
|
# Policies
|
||||||
pi0 = ["lerobot[transformers-dep]"]
|
pi = ["lerobot[transformers-dep]"]
|
||||||
pi05 = ["lerobot[transformers-dep]"]
|
|
||||||
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"]
|
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"]
|
||||||
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
|
hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
|
||||||
|
|
||||||
|
|||||||
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model with open-world generalizatio
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### ⚠️ WARNING ⚠️
|
|
||||||
|
|
||||||
This project requires **patching the Hugging Face `transformers` library**.
|
|
||||||
|
|
||||||
1. Make sure you have the exact version installed:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip show transformers
|
|
||||||
```
|
|
||||||
|
|
||||||
It must be version **4.53.2**.
|
|
||||||
|
|
||||||
2. Apply the custom patches by copying the modified files into your environment:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
|
|
||||||
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
|
|
||||||
```
|
|
||||||
|
|
||||||
These patches overwrite parts of `transformers` to:
|
|
||||||
- Support **AdaRMS** (adaptive RMS normalization),
|
|
||||||
- Correctly control the precision of activations,
|
|
||||||
- Allow the KV cache to be used without updates.
|
|
||||||
|
|
||||||
**Important:**
|
|
||||||
|
|
||||||
- This permanently modifies your `transformers` installation.
|
|
||||||
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
|
|
||||||
|
|
||||||
To undo and restore a clean state:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip uninstall transformers
|
|
||||||
pip install transformers==4.53.2
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Model Overview
|
## Model Overview
|
||||||
|
|
||||||
| Feature | π₀ | π₀.₅ |
|
| Feature | π₀ | π₀.₅ |
|
||||||
|
|||||||
@@ -17,7 +17,7 @@
|
|||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from lerobot.configs.policies import PreTrainedConfig
|
from lerobot.configs.policies import PreTrainedConfig
|
||||||
from lerobot.configs.types import NormalizationMode
|
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
|
||||||
from lerobot.optim.optimizers import AdamWConfig
|
from lerobot.optim.optimizers import AdamWConfig
|
||||||
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
||||||
|
|
||||||
@@ -43,12 +43,19 @@ class PI05OpenPIConfig(PreTrainedConfig):
|
|||||||
num_inference_steps: int = 10 # Number of denoising steps during inference
|
num_inference_steps: int = 10 # Number of denoising steps during inference
|
||||||
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
|
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
|
||||||
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
|
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
|
||||||
|
time_sampling_scale: float = 0.999 # Scale factor for time sampling
|
||||||
|
time_sampling_offset: float = 0.001 # Offset for time sampling
|
||||||
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
|
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
|
||||||
max_period: float = 4.0 # Max period for sinusoidal positional encoding
|
max_period: float = 4.0 # Max period for sinusoidal positional encoding
|
||||||
|
|
||||||
|
attention_mask_value: float = -2.3819763e38
|
||||||
|
|
||||||
# Image preprocessing
|
# Image preprocessing
|
||||||
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
|
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
|
||||||
|
|
||||||
|
# Add empty images. Used to add empty cameras when no image features are present.
|
||||||
|
empty_cameras: int = 0
|
||||||
|
|
||||||
# Normalization
|
# Normalization
|
||||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||||
default_factory=lambda: {
|
default_factory=lambda: {
|
||||||
@@ -64,7 +71,7 @@ class PI05OpenPIConfig(PreTrainedConfig):
|
|||||||
compile_mode: str = "max-autotune" # Torch compile mode
|
compile_mode: str = "max-autotune" # Torch compile mode
|
||||||
device: str | None = None # Device to use for the model (None = auto-detect)
|
device: str | None = None # Device to use for the model (None = auto-detect)
|
||||||
|
|
||||||
# Optimizer settings: see openpi `AdamW` and
|
# Optimizer settings: see openpi `AdamW`
|
||||||
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
|
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
|
||||||
optimizer_betas: tuple[float, float] = (0.9, 0.95)
|
optimizer_betas: tuple[float, float] = (0.9, 0.95)
|
||||||
optimizer_eps: float = 1e-8
|
optimizer_eps: float = 1e-8
|
||||||
@@ -98,12 +105,27 @@ class PI05OpenPIConfig(PreTrainedConfig):
|
|||||||
|
|
||||||
def validate_features(self) -> None:
|
def validate_features(self) -> None:
|
||||||
"""Validate and set up input/output features."""
|
"""Validate and set up input/output features."""
|
||||||
# Image features are now handled dynamically through dataset configuration
|
for i in range(self.empty_cameras):
|
||||||
# No need to auto-add hardcoded image keys
|
key = f"observation.images.empty_camera_{i}"
|
||||||
|
empty_camera = PolicyFeature(
|
||||||
|
type=FeatureType.VISUAL,
|
||||||
|
shape=(3, *self.image_resolution), # Use configured image resolution
|
||||||
|
)
|
||||||
|
self.input_features[key] = empty_camera
|
||||||
|
|
||||||
# State and action features are also handled dynamically through dataset configuration
|
if "observation.state" not in self.input_features:
|
||||||
# The actual dimensions come from the feature shapes, max dimensions are used for padding only
|
state_feature = PolicyFeature(
|
||||||
pass
|
type=FeatureType.STATE,
|
||||||
|
shape=(self.max_state_dim,), # Will be padded to max_state_dim
|
||||||
|
)
|
||||||
|
self.input_features["observation.state"] = state_feature
|
||||||
|
|
||||||
|
if "action" not in self.output_features:
|
||||||
|
action_feature = PolicyFeature(
|
||||||
|
type=FeatureType.ACTION,
|
||||||
|
shape=(self.max_action_dim,), # Will be padded to max_action_dim
|
||||||
|
)
|
||||||
|
self.output_features["action"] = action_feature
|
||||||
|
|
||||||
def get_optimizer_preset(self) -> AdamWConfig:
|
def get_optimizer_preset(self) -> AdamWConfig:
|
||||||
return AdamWConfig(
|
return AdamWConfig(
|
||||||
|
|||||||
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
|||||||
def _prepare_attention_masks_4d(self, att_2d_masks):
|
def _prepare_attention_masks_4d(self, att_2d_masks):
|
||||||
"""Helper method to prepare 4D attention masks for transformer."""
|
"""Helper method to prepare 4D attention masks for transformer."""
|
||||||
att_2d_masks_4d = att_2d_masks[:, None, :, :]
|
att_2d_masks_4d = att_2d_masks[:, None, :, :]
|
||||||
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
|
return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
|
||||||
|
|
||||||
def sample_noise(self, shape, device):
|
def sample_noise(self, shape, device):
|
||||||
return torch.normal(
|
return torch.normal(
|
||||||
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
|||||||
time_beta = sample_beta(
|
time_beta = sample_beta(
|
||||||
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
|
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
|
||||||
)
|
)
|
||||||
time = time_beta * 0.999 + 0.001
|
time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
|
||||||
return time.to(dtype=torch.float32, device=device)
|
return time.to(dtype=torch.float32, device=device)
|
||||||
|
|
||||||
def embed_prefix(
|
def embed_prefix(
|
||||||
@@ -661,6 +661,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
|||||||
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
|
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
|
||||||
pad_masks.append(action_time_mask)
|
pad_masks.append(action_time_mask)
|
||||||
|
|
||||||
|
# Set attention masks so that image, language and state inputs do not attend to action tokens
|
||||||
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
|
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
|
||||||
|
|
||||||
embs = torch.cat(embs, dim=1)
|
embs = torch.cat(embs, dim=1)
|
||||||
|
|||||||
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model for general robot control**.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### ⚠️ WARNING ⚠️
|
|
||||||
|
|
||||||
This project requires **patching the Hugging Face `transformers` library**.
|
|
||||||
|
|
||||||
1. Make sure you have the exact version installed:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip show transformers
|
|
||||||
```
|
|
||||||
|
|
||||||
It must be version **4.53.2**.
|
|
||||||
|
|
||||||
2. Apply the custom patches by copying the modified files into your environment:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
|
|
||||||
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
|
|
||||||
```
|
|
||||||
|
|
||||||
These patches overwrite parts of `transformers` to:
|
|
||||||
- Support **AdaRMS** (adaptive RMS normalization),
|
|
||||||
- Correctly control the precision of activations,
|
|
||||||
- Allow the KV cache to be used without updates.
|
|
||||||
|
|
||||||
**Important:**
|
|
||||||
|
|
||||||
- This permanently modifies your `transformers` installation.
|
|
||||||
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
|
|
||||||
|
|
||||||
To undo and restore a clean state:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip uninstall transformers
|
|
||||||
pip install transformers==4.53.2
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Model Overview
|
## Model Overview
|
||||||
|
|
||||||
| Feature | π₀ | π₀.₅ |
|
| Feature | π₀ | π₀.₅ |
|
||||||
|
|||||||
@@ -17,7 +17,7 @@
|
|||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from lerobot.configs.policies import PreTrainedConfig
|
from lerobot.configs.policies import PreTrainedConfig
|
||||||
from lerobot.configs.types import NormalizationMode
|
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
|
||||||
from lerobot.optim.optimizers import AdamWConfig
|
from lerobot.optim.optimizers import AdamWConfig
|
||||||
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
|
||||||
|
|
||||||
@@ -43,12 +43,19 @@ class PI0OpenPIConfig(PreTrainedConfig):
|
|||||||
num_inference_steps: int = 10 # Number of denoising steps during inference
|
num_inference_steps: int = 10 # Number of denoising steps during inference
|
||||||
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
|
time_sampling_beta_alpha: float = 1.5 # Beta distribution alpha parameter for time sampling
|
||||||
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
|
time_sampling_beta_beta: float = 1.0 # Beta distribution beta parameter for time sampling
|
||||||
|
time_sampling_scale: float = 0.999 # Scale factor for time sampling
|
||||||
|
time_sampling_offset: float = 0.001 # Offset for time sampling
|
||||||
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
|
min_period: float = 4e-3 # Min period for sinusoidal positional encoding
|
||||||
max_period: float = 4.0 # Max period for sinusoidal positional encoding
|
max_period: float = 4.0 # Max period for sinusoidal positional encoding
|
||||||
|
|
||||||
|
attention_mask_value: float = -2.3819763e38
|
||||||
|
|
||||||
# Image preprocessing
|
# Image preprocessing
|
||||||
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
|
image_resolution: tuple[int, int] = (224, 224) # see openpi `preprocessing_pytorch.py`
|
||||||
|
|
||||||
|
# Add empty images. Used to add empty cameras when no image features are present.
|
||||||
|
empty_cameras: int = 0
|
||||||
|
|
||||||
# Normalization
|
# Normalization
|
||||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||||
default_factory=lambda: {
|
default_factory=lambda: {
|
||||||
@@ -64,7 +71,7 @@ class PI0OpenPIConfig(PreTrainedConfig):
|
|||||||
compile_mode: str = "max-autotune" # Torch compile mode
|
compile_mode: str = "max-autotune" # Torch compile mode
|
||||||
device: str | None = None # Device to use for the model (None = auto-detect)
|
device: str | None = None # Device to use for the model (None = auto-detect)
|
||||||
|
|
||||||
# Optimizer settings: see openpi `AdamW` and
|
# Optimizer settings: see openpi `AdamW`
|
||||||
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
|
optimizer_lr: float = 2.5e-5 # see openpi `CosineDecaySchedule: peak_lr`
|
||||||
optimizer_betas: tuple[float, float] = (0.9, 0.95)
|
optimizer_betas: tuple[float, float] = (0.9, 0.95)
|
||||||
optimizer_eps: float = 1e-8
|
optimizer_eps: float = 1e-8
|
||||||
@@ -98,12 +105,27 @@ class PI0OpenPIConfig(PreTrainedConfig):
|
|||||||
|
|
||||||
def validate_features(self) -> None:
|
def validate_features(self) -> None:
|
||||||
"""Validate and set up input/output features."""
|
"""Validate and set up input/output features."""
|
||||||
# Image features are now handled dynamically through dataset configuration
|
for i in range(self.empty_cameras):
|
||||||
# No need to auto-add hardcoded image keys
|
key = f"observation.images.empty_camera_{i}"
|
||||||
|
empty_camera = PolicyFeature(
|
||||||
|
type=FeatureType.VISUAL,
|
||||||
|
shape=(3, *self.image_resolution), # Use configured image resolution
|
||||||
|
)
|
||||||
|
self.input_features[key] = empty_camera
|
||||||
|
|
||||||
# State and action features are also handled dynamically through dataset configuration
|
if "observation.state" not in self.input_features:
|
||||||
# The actual dimensions come from the feature shapes, max dimensions are used for padding only
|
state_feature = PolicyFeature(
|
||||||
pass
|
type=FeatureType.STATE,
|
||||||
|
shape=(self.max_state_dim,), # Will be padded to max_state_dim
|
||||||
|
)
|
||||||
|
self.input_features["observation.state"] = state_feature
|
||||||
|
|
||||||
|
if "action" not in self.output_features:
|
||||||
|
action_feature = PolicyFeature(
|
||||||
|
type=FeatureType.ACTION,
|
||||||
|
shape=(self.max_action_dim,), # Will be padded to max_action_dim
|
||||||
|
)
|
||||||
|
self.output_features["action"] = action_feature
|
||||||
|
|
||||||
def get_optimizer_preset(self) -> AdamWConfig:
|
def get_optimizer_preset(self) -> AdamWConfig:
|
||||||
return AdamWConfig(
|
return AdamWConfig(
|
||||||
|
|||||||
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
|||||||
def _prepare_attention_masks_4d(self, att_2d_masks):
|
def _prepare_attention_masks_4d(self, att_2d_masks):
|
||||||
"""Helper method to prepare 4D attention masks for transformer."""
|
"""Helper method to prepare 4D attention masks for transformer."""
|
||||||
att_2d_masks_4d = att_2d_masks[:, None, :, :]
|
att_2d_masks_4d = att_2d_masks[:, None, :, :]
|
||||||
return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
|
return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
|
||||||
|
|
||||||
def sample_noise(self, shape, device):
|
def sample_noise(self, shape, device):
|
||||||
return torch.normal(
|
return torch.normal(
|
||||||
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
|||||||
time_beta = sample_beta(
|
time_beta = sample_beta(
|
||||||
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
|
self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
|
||||||
)
|
)
|
||||||
time = time_beta * 0.999 + 0.001
|
time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
|
||||||
return time.to(dtype=torch.float32, device=device)
|
return time.to(dtype=torch.float32, device=device)
|
||||||
|
|
||||||
def embed_prefix(
|
def embed_prefix(
|
||||||
@@ -677,6 +677,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
|
|||||||
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
|
action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
|
||||||
pad_masks.append(action_time_mask)
|
pad_masks.append(action_time_mask)
|
||||||
|
|
||||||
|
# Set attention masks so that image, language and state inputs do not attend to action tokens
|
||||||
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
|
att_masks += [1] + ([0] * (self.config.chunk_size - 1))
|
||||||
|
|
||||||
embs = torch.cat(embs, dim=1)
|
embs = torch.cat(embs, dim=1)
|
||||||
|
|||||||
Reference in New Issue
Block a user