feedback pr

2026-07-25 10:46:01 +00:00 · 2025-09-22 10:26:49 +02:00
parent 8951e6034f
commit 8ac060124d
7 changed files with 65 additions and 96 deletions
@@ -119,8 +119,7 @@ phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"]
 # ] # TODO: Currently not supported
 # Policies
-pi0 = ["lerobot[transformers-dep]"]
+pi = ["lerobot[transformers-dep]"]
 pi05 = ["lerobot[transformers-dep]"]
 smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14", "accelerate>=1.7.0", "safetensors>=0.4.3"]
 hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.9", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model with open-world generalizatio
 ---
 ### ⚠️ WARNING ⚠️
 This project requires **patching the Hugging Face `transformers` library**.
 1. Make sure you have the exact version installed:
 ```bash
   pip show transformers
 ```
 It must be version **4.53.2**.
 2. Apply the custom patches by copying the modified files into your environment:
   ```bash
   cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
     $(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
   ```
   These patches overwrite parts of `transformers` to:
   - Support **AdaRMS** (adaptive RMS normalization),
   - Correctly control the precision of activations,
   - Allow the KV cache to be used without updates.
 **Important:**
 - This permanently modifies your `transformers` installation.
 - The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
 To undo and restore a clean state:
 ```bash
 pip uninstall transformers
 pip install transformers==4.53.2
 ```
 ---
 ## Model Overview
 | Feature              | π₀                                                     | π₀.₅                                      |
@@ -17,7 +17,7 @@
 from dataclasses import dataclass, field
 from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.types import NormalizationMode
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
 from lerobot.optim.optimizers import AdamWConfig
 from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
@@ -43,12 +43,19 @@ class PI05OpenPIConfig(PreTrainedConfig):
    num_inference_steps: int = 10  # Number of denoising steps during inference
    time_sampling_beta_alpha: float = 1.5  # Beta distribution alpha parameter for time sampling
    time_sampling_beta_beta: float = 1.0  # Beta distribution beta parameter for time sampling
    time_sampling_scale: float = 0.999  # Scale factor for time sampling
    time_sampling_offset: float = 0.001  # Offset for time sampling
    min_period: float = 4e-3  # Min period for sinusoidal positional encoding
    max_period: float = 4.0  # Max period for sinusoidal positional encoding
    attention_mask_value: float = -2.3819763e38
    # Image preprocessing
    image_resolution: tuple[int, int] = (224, 224)  # see openpi `preprocessing_pytorch.py`
    # Add empty images. Used to add empty cameras when no image features are present.
    empty_cameras: int = 0
    # Normalization
    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI05OpenPIConfig(PreTrainedConfig):
    compile_mode: str = "max-autotune"  # Torch compile mode
    device: str | None = None  # Device to use for the model (None = auto-detect)
-    # Optimizer settings: see openpi `AdamW` and
+    # Optimizer settings: see openpi `AdamW`
    optimizer_lr: float = 2.5e-5  # see openpi `CosineDecaySchedule: peak_lr`
    optimizer_betas: tuple[float, float] = (0.9, 0.95)
    optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI05OpenPIConfig(PreTrainedConfig):
    def validate_features(self) -> None:
        """Validate and set up input/output features."""
-        # Image features are now handled dynamically through dataset configuration
+        for i in range(self.empty_cameras):
-        # No need to auto-add hardcoded image keys
+            key = f"observation.images.empty_camera_{i}"
            empty_camera = PolicyFeature(
                type=FeatureType.VISUAL,
                shape=(3, *self.image_resolution),  # Use configured image resolution
            )
            self.input_features[key] = empty_camera
-        # State and action features are also handled dynamically through dataset configuration
+        if "observation.state" not in self.input_features:
-        # The actual dimensions come from the feature shapes, max dimensions are used for padding only
+            state_feature = PolicyFeature(
-        pass
+                type=FeatureType.STATE,
                shape=(self.max_state_dim,),  # Will be padded to max_state_dim
            )
            self.input_features["observation.state"] = state_feature
        if "action" not in self.output_features:
            action_feature = PolicyFeature(
                type=FeatureType.ACTION,
                shape=(self.max_action_dim,),  # Will be padded to max_action_dim
            )
            self.output_features["action"] = action_feature
    def get_optimizer_preset(self) -> AdamWConfig:
        return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
    def _prepare_attention_masks_4d(self, att_2d_masks):
        """Helper method to prepare 4D attention masks for transformer."""
        att_2d_masks_4d = att_2d_masks[:, None, :, :]
-        return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
+        return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
    def sample_noise(self, shape, device):
        return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        time_beta = sample_beta(
            self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
        )
-        time = time_beta * 0.999 + 0.001
+        time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
        return time.to(dtype=torch.float32, device=device)
    def embed_prefix(
@@ -661,6 +661,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
        pad_masks.append(action_time_mask)
        # Set attention masks so that image, language and state inputs do not attend to action tokens
        att_masks += [1] + ([0] * (self.config.chunk_size - 1))
        embs = torch.cat(embs, dim=1)
@@ -5,44 +5,6 @@ It is designed as a **Vision-Language-Action model for general robot control**.
 ---
 ### ⚠️ WARNING ⚠️
 This project requires **patching the Hugging Face `transformers` library**.
 1. Make sure you have the exact version installed:
 ```bash
   pip show transformers
 ```
 It must be version **4.53.2**.
 2. Apply the custom patches by copying the modified files into your environment:
   ```bash
   cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
     $(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
   ```
   These patches overwrite parts of `transformers` to:
   - Support **AdaRMS** (adaptive RMS normalization),
   - Correctly control the precision of activations,
   - Allow the KV cache to be used without updates.
 **Important:**
 - This permanently modifies your `transformers` installation.
 - The changes survive reinstalls unless you explicitly remove the patched files or recreate the environment.
 To undo and restore a clean state:
 ```bash
 pip uninstall transformers
 pip install transformers==4.53.2
 ```
 ---
 ## Model Overview
 | Feature              | π₀                                                     | π₀.₅                                      |
@@ -17,7 +17,7 @@
 from dataclasses import dataclass, field
 from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.types import NormalizationMode
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
 from lerobot.optim.optimizers import AdamWConfig
 from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
@@ -43,12 +43,19 @@ class PI0OpenPIConfig(PreTrainedConfig):
    num_inference_steps: int = 10  # Number of denoising steps during inference
    time_sampling_beta_alpha: float = 1.5  # Beta distribution alpha parameter for time sampling
    time_sampling_beta_beta: float = 1.0  # Beta distribution beta parameter for time sampling
    time_sampling_scale: float = 0.999  # Scale factor for time sampling
    time_sampling_offset: float = 0.001  # Offset for time sampling
    min_period: float = 4e-3  # Min period for sinusoidal positional encoding
    max_period: float = 4.0  # Max period for sinusoidal positional encoding
    attention_mask_value: float = -2.3819763e38
    # Image preprocessing
    image_resolution: tuple[int, int] = (224, 224)  # see openpi `preprocessing_pytorch.py`
    # Add empty images. Used to add empty cameras when no image features are present.
    empty_cameras: int = 0
    # Normalization
    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
@@ -64,7 +71,7 @@ class PI0OpenPIConfig(PreTrainedConfig):
    compile_mode: str = "max-autotune"  # Torch compile mode
    device: str | None = None  # Device to use for the model (None = auto-detect)
-    # Optimizer settings: see openpi `AdamW` and
+    # Optimizer settings: see openpi `AdamW`
    optimizer_lr: float = 2.5e-5  # see openpi `CosineDecaySchedule: peak_lr`
    optimizer_betas: tuple[float, float] = (0.9, 0.95)
    optimizer_eps: float = 1e-8
@@ -98,12 +105,27 @@ class PI0OpenPIConfig(PreTrainedConfig):
    def validate_features(self) -> None:
        """Validate and set up input/output features."""
-        # Image features are now handled dynamically through dataset configuration
+        for i in range(self.empty_cameras):
-        # No need to auto-add hardcoded image keys
+            key = f"observation.images.empty_camera_{i}"
            empty_camera = PolicyFeature(
                type=FeatureType.VISUAL,
                shape=(3, *self.image_resolution),  # Use configured image resolution
            )
            self.input_features[key] = empty_camera
-        # State and action features are also handled dynamically through dataset configuration
+        if "observation.state" not in self.input_features:
-        # The actual dimensions come from the feature shapes, max dimensions are used for padding only
+            state_feature = PolicyFeature(
-        pass
+                type=FeatureType.STATE,
                shape=(self.max_state_dim,),  # Will be padded to max_state_dim
            )
            self.input_features["observation.state"] = state_feature
        if "action" not in self.output_features:
            action_feature = PolicyFeature(
                type=FeatureType.ACTION,
                shape=(self.max_action_dim,),  # Will be padded to max_action_dim
            )
            self.output_features["action"] = action_feature
    def get_optimizer_preset(self) -> AdamWConfig:
        return AdamWConfig(
@@ -563,7 +563,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
    def _prepare_attention_masks_4d(self, att_2d_masks):
        """Helper method to prepare 4D attention masks for transformer."""
        att_2d_masks_4d = att_2d_masks[:, None, :, :]
-        return torch.where(att_2d_masks_4d, 0.0, -2.3819763e38)
+        return torch.where(att_2d_masks_4d, 0.0, self.config.attention_mask_value)
    def sample_noise(self, shape, device):
        return torch.normal(
@@ -578,7 +578,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        time_beta = sample_beta(
            self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
        )
-        time = time_beta * 0.999 + 0.001
+        time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
        return time.to(dtype=torch.float32, device=device)
    def embed_prefix(
@@ -677,6 +677,7 @@ $(python -c "import transformers, os; print(os.path.dirname(transformers.__file_
        action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
        pad_masks.append(action_time_mask)
        # Set attention masks so that image, language and state inputs do not attend to action tokens
        att_masks += [1] + ([0] * (self.config.chunk_size - 1))
        embs = torch.cat(embs, dim=1)