feedback pr

2026-07-25 02:36:11 +00:00 · 2025-09-22 10:26:49 +02:00
parent 8951e6034f
commit 8ac060124d
7 changed files with 65 additions and 96 deletions
@@ -119,8 +119,7 @@ phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"]
 # ] # TODO: Currently not supported

 # Policies
-pi0 = ["lerobot[transformers-dep]"]
-pi05 = ["lerobot[transformers-dep]"]
+pi = ["lerobot[transformers-dep]"]
 smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"]
 hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]

@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model with open-world generalizatio

 ---

-### ⚠️ WARNING ⚠️
-
-This project requires **patching the Hugging Face `transformers` library**.
-
-1. Make sure you have the exact version installed:
-
-```bash
-   pip show transformers
-```
-
-It must be version **4.53.2**.
-
-2. Apply the custom patches by copying the modified files into your environment:
-
-   ```bash
-   cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
-     $(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
-   ```
-
-   These patches overwrite parts of `transformers` to:
-   - Support the **AdaRMS optimizer**,
-   - Correctly control the precision of activations,
-   - Allow the KV cache to be used without updates.
-
-**Important:**
-
- This permanently modifies your `transformers` installation.
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
-
-To undo and restore a clean state:
-
-```bash
-pip uninstall transformers
-pip install transformers==4.53.2
-```
-
---
-
 ## Model Overview

 | Feature              | π₀                                                     | π₀.₅                                      |
@@ -17,7 +17,7 @@
 from dataclasses import dataclass, field

 from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.types import NormalizationMode
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
 from lerobot.optim.optimizers import AdamWConfig
 from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig

@@ -43,12 +43,19 @@ class PI05OpenPIConfig(PreTrainedConfig):
    num_inference_steps: int = 10  # Number of denoising steps during inference
    time_sampling_beta_alpha: float = 1.5  # Beta distribution alpha parameter for time sampling
    time_sampling_beta_beta: float = 1.0  # Beta distribution beta parameter for time sampling
+    time_sampling_scale: float = 0.999  # Scale factor for time sampling
+    time_sampling_offset: float = 0.001  # Offset for time sampling
    min_period: float = 4e-3  # Min period for sinusoidal positional encoding
    max_period: float = 4.0  # Max period for sinusoidal positional encoding

+    attention_mask_value: float = -2.3819763e38
+
    # Image preprocessing
    image_resolution: tuple[int, int] = (224, 224)  # see openpi `preprocessing_pytorch.py`

+    # Number of empty camera image features to add to the inputs. Used when no image features are present.
+    empty_cameras: int = 0
+
    # Normalization
    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI05OpenPIConfig(PreTrainedConfig):
    compile_mode: str = "max-autotune"  # Torch compile mode
    device: str | None = None  # Device to use for the model (None = auto-detect)

-    # Optimizer settings: see openpi `AdamW` and
+    # Optimizer settings: see openpi `AdamW`
    optimizer_lr: float = 2.5e-5  # see openpi `CosineDecaySchedule: peak_lr`
    optimizer_betas: tuple[float, float] = (0.9, 0.95)
    optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI05OpenPIConfig(PreTrainedConfig):

    def validate_features(self) -> None:
        """Validate and set up input/output features."""
-        # Image features are now handled dynamically through dataset configuration
-        # No need to auto-add hardcoded image keys
+        for i in range(self.empty_cameras):
+            key = f"observation.images.empty_camera_{i}"
+            empty_camera = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(3, *self.image_resolution),  # Use configured image resolution
+            )
+            self.input_features[key] = empty_camera

-        # State and action features are also handled dynamically through dataset configuration
-        # The actual dimensions come from the feature shapes, max dimensions are used for padding only
-        pass
+        if "observation.state" not in self.input_features:
+            state_feature = PolicyFeature(
+                type=FeatureType.STATE,
+                shape=(self.max_state_dim,),  # Will be padded to max_state_dim
+            )
+            self.input_features["observation.state"] = state_feature
+
+        if "action" not in self.output_features:
+            action_feature = PolicyFeature(
+                type=FeatureType.ACTION,
+                shape=(self.max_action_dim,),  # Will be padded to max_action_dim
+            )
+            self.output_features["action"] = action_feature

    def get_optimizer_preset(self) -> AdamWConfig:
        return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
    def _prepare_attention_masks_4d(self, att_2d_masks):
        """Helper method to prepare 4D attention masks for transformer."""
        att_2d_masks_4d = att_2d_masks[:, None, :, :]
-        return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
+        return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)

    def sample_noise(self, shape, device):
        return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        time_beta = sample_beta(
            self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
        )
-        time = time_beta * 0.999 + 0.001
+        time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
        return time.to(dtype=torch.float32, device=device)

    def embed_prefix(
@@ -661,6 +661,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
        pad_masks.append(action_time_mask)

+        # Set attention masks so that image, language and state inputs do not attend to action tokens
        att_masks += [1] + ([0] * (self.config.chunk_size - 1))

        embs = torch.cat(embs, dim=1)
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model for general robot control**.

 ---

-### ⚠️ WARNING ⚠️
-
-This project requires **patching the Hugging Face `transformers` library**.
-
-1. Make sure you have the exact version installed:
-
-```bash
-   pip show transformers
-```
-
-It must be version **4.53.2**.
-
-2. Apply the custom patches by copying the modified files into your environment:
-
-   ```bash
-   cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
-     $(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
-   ```
-
-   These patches overwrite parts of `transformers` to:
-   - Support the **AdaRMS optimizer**,
-   - Correctly control the precision of activations,
-   - Allow the KV cache to be used without updates.
-
-**Important:**
-
- This permanently modifies your `transformers` installation.
- The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
-
-To undo and restore a clean state:
-
-```bash
-pip uninstall transformers
-pip install transformers==4.53.2
-```
-
---
-
 ## Model Overview

 | Feature              | π₀                                                     | π₀.₅                                      |
@@ -17,7 +17,7 @@
 from dataclasses import dataclass, field

 from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.types import NormalizationMode
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
 from lerobot.optim.optimizers import AdamWConfig
 from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig

@@ -43,12 +43,19 @@ class PI0OpenPIConfig(PreTrainedConfig):
    num_inference_steps: int = 10  # Number of denoising steps during inference
    time_sampling_beta_alpha: float = 1.5  # Beta distribution alpha parameter for time sampling
    time_sampling_beta_beta: float = 1.0  # Beta distribution beta parameter for time sampling
+    time_sampling_scale: float = 0.999  # Scale factor for time sampling
+    time_sampling_offset: float = 0.001  # Offset for time sampling
    min_period: float = 4e-3  # Min period for sinusoidal positional encoding
    max_period: float = 4.0  # Max period for sinusoidal positional encoding

+    attention_mask_value: float = -2.3819763e38
+
    # Image preprocessing
    image_resolution: tuple[int, int] = (224, 224)  # see openpi `preprocessing_pytorch.py`

+    # Number of empty camera image features to add to the inputs. Used when no image features are present.
+    empty_cameras: int = 0
+
    # Normalization
    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI0OpenPIConfig(PreTrainedConfig):
    compile_mode: str = "max-autotune"  # Torch compile mode
    device: str | None = None  # Device to use for the model (None = auto-detect)

-    # Optimizer settings: see openpi `AdamW` and
+    # Optimizer settings: see openpi `AdamW`
    optimizer_lr: float = 2.5e-5  # see openpi `CosineDecaySchedule: peak_lr`
    optimizer_betas: tuple[float, float] = (0.9, 0.95)
    optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI0OpenPIConfig(PreTrainedConfig):

    def validate_features(self) -> None:
        """Validate and set up input/output features."""
-        # Image features are now handled dynamically through dataset configuration
-        # No need to auto-add hardcoded image keys
+        for i in range(self.empty_cameras):
+            key = f"observation.images.empty_camera_{i}"
+            empty_camera = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(3, *self.image_resolution),  # Use configured image resolution
+            )
+            self.input_features[key] = empty_camera

-        # State and action features are also handled dynamically through dataset configuration
-        # The actual dimensions come from the feature shapes, max dimensions are used for padding only
-        pass
+        if "observation.state" not in self.input_features:
+            state_feature = PolicyFeature(
+                type=FeatureType.STATE,
+                shape=(self.max_state_dim,),  # Will be padded to max_state_dim
+            )
+            self.input_features["observation.state"] = state_feature
+
+        if "action" not in self.output_features:
+            action_feature = PolicyFeature(
+                type=FeatureType.ACTION,
+                shape=(self.max_action_dim,),  # Will be padded to max_action_dim
+            )
+            self.output_features["action"] = action_feature

    def get_optimizer_preset(self) -> AdamWConfig:
        return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
    def _prepare_attention_masks_4d(self, att_2d_masks):
        """Helper method to prepare 4D attention masks for transformer."""
        att_2d_masks_4d = att_2d_masks[:, None, :, :]
-        return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
+        return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)

    def sample_noise(self, shape, device):
        return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        time_beta = sample_beta(
            self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
        )
-        time = time_beta * 0.999 + 0.001
+        time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
        return time.to(dtype=torch.float32, device=device)

    def embed_prefix(
@@ -677,6 +677,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
        pad_masks.append(action_time_mask)

+        # Set attention masks so that image, language and state inputs do not attend to action tokens
        att_masks += [1] + ([0] * (self.config.chunk_size - 1))

        embs = torch.cat(embs, dim=1)