upgrade transformers version

2026-07-24 18:26:11 +00:00 · 2025-11-25 14:18:26 +01:00
parent 936a6728f0
commit 0e21f3fdf7
6 changed files with 116 additions and 318 deletions
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import gymnasium as gym
 from gymnasium.envs.registration import registry as gym_registry
@@ -22,10 +22,16 @@ from gymnasium.envs.registration import registry as gym_registry
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.envs.configs import AlohaEnv, EnvConfig, LiberoEnv, PushtEnv
 from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result
 from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 from lerobot.processor import ProcessorStep
 from lerobot.processor.env_processor import LiberoProcessorStep
 from lerobot.processor.pipeline import PolicyProcessorPipeline
 from lerobot.utils.import_utils import _transformers_available
 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _transformers_available:
    from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 else:
    XVLAConfig = None
 def make_env_config(env_type: str, **kwargs) -> EnvConfig:
@@ -49,7 +49,7 @@ class XVLAConfig(PreTrainedConfig):
    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
            "VISUAL": NormalizationMode.IDENTITY,
-            "STATE": NormalizationMode.MEAN_STD,
+            "STATE": NormalizationMode.IDENTITY,
            "ACTION": NormalizationMode.MEAN_STD,
        }
    )
@@ -2350,22 +2350,8 @@ class Florence2PreTrainedModel(PreTrainedModel):
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
-
+    _supports_flash_attn_2 = True
-    @property
+    _supports_sdpa = True
    def _supports_flash_attn_2(self):
        """
        Retrieve language_model's attribute to check whether the model supports
        Flash Attention 2 or not.
        """
        return self.language_model._supports_flash_attn_2
    @property
    def _supports_sdpa(self):
        """
        Retrieve language_model's attribute to check whether the model supports
        SDPA or not.
        """
        return self.language_model._supports_sdpa
 FLORENCE2_INPUTS_DOCSTRING = r"""
@@ -18,6 +18,7 @@
 from __future__ import annotations
 import builtins
 import os
 from collections import deque
 from pathlib import Path
@@ -26,13 +27,14 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn
-from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.configs.policies import PreTrainedConfig
 from lerobot.policies.pretrained import PreTrainedPolicy, T
 from lerobot.policies.utils import populate_queues
 from lerobot.utils.constants import ACTION, OBS_LANGUAGE_TOKENS, OBS_STATE
 from .action_hub import build_action_space
 from .configuration_florence2 import Florence2Config
-from .configuration_xvla import XVLAConfig, XVLAConfig as PreTrainedConfig
+from .configuration_xvla import XVLAConfig
 from .modeling_florence2 import Florence2ForConditionalGeneration
 from .soft_transformer import SoftPromptedTransformer
@@ -400,7 +402,7 @@ class XVLAPolicy(PreTrainedPolicy):
    @classmethod
    def from_pretrained(
-        cls,
+        cls: builtins.type[T],
        pretrained_name_or_path: str | Path,
        *,
        config: PreTrainedConfig | None = None,
@@ -26,6 +26,10 @@ import numpy as np
 import pytest
 import torch
 from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 from lerobot.policies.xvla.modeling_xvla import XVLAPolicy
 from lerobot.policies.xvla.processor_xvla import make_xvla_pre_post_processors
 # Skip if transformers is not available
 pytest.importorskip("transformers")
@@ -37,9 +41,6 @@ pytestmark = pytest.mark.skipif(
 from transformers import AutoModel, AutoProcessor  # noqa: E402
 from lerobot.configs.policies import PreTrainedConfig  # noqa: E402
 from lerobot.envs.factory import make_env_config  # noqa: E402
 from lerobot.policies.factory import make_policy, make_pre_post_processors  # noqa: E402
 from lerobot.processor import PolicyAction, PolicyProcessorPipeline  # noqa: E402
 from lerobot.utils.constants import OBS_IMAGES, OBS_STATE  # noqa: E402
@@ -50,8 +51,8 @@ IMAGE_HEIGHT = 224
 IMAGE_WIDTH = 224
 NUM_VIEWS = 2  # Number of camera views
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_PATH_LEROBOT = "lerobot/xvla-base"
+MODEL_PATH_LEROBOT = "lerobot/xvla-widowx"
-MODEL_PATH_ORIGINAL = "2toINF/X-VLA-Pt"
+MODEL_PATH_ORIGINAL = "2toINF/X-VLA-WidowX"
 LIBERO_DOMAIN_ID = 0  # Domain ID for examples purposes
@@ -93,31 +94,25 @@ def instantiate_lerobot_xvla(
 ]:
    """Instantiate LeRobot XVLA policy with preprocessor and postprocessor."""
    if from_pretrained:
-        cfg = PreTrainedConfig.from_pretrained(model_path)
+        policy = XVLAPolicy.from_pretrained(
-        cfg.pretrained_path = model_path
+            pretrained_name_or_path=model_path,
-    else:
+            strict=False,
        # For non-pretrained, we'd need to create a config from scratch
        raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
    cfg.device = DEVICE
    env_cfg = make_env_config("libero", task="libero_spatial")
    policy = make_policy(
        cfg=cfg,
        env_cfg=env_cfg,
        )
    else:
        config = XVLAConfig(
            base_model_path=model_path,
            n_action_steps=DUMMY_ACTION_DIM,
            chunk_size=DUMMY_ACTION_DIM,
            device=DEVICE,
            num_image_views=NUM_VIEWS,
        )  # add resize_imgs_with_padding=IMAGE_SIZE, IMAGE_SIZE?
        policy = XVLAPolicy(config)
    policy.to(DEVICE)
-    policy.eval()
+    policy.config.device = DEVICE
-    
+    preprocessor, postprocessor = make_xvla_pre_post_processors(
-    preprocessor_overrides = {
+        config=policy.config,
-        "device_processor": {"device": str(cfg.device)},
+        dataset_stats=None,  # Pass None for dataset_stats to disable normalization (original XVLA doesn't normalize)
    }
    preprocessor, postprocessor = make_pre_post_processors(
        policy_cfg=cfg,
        pretrained_path=cfg.pretrained_path,
        preprocessor_overrides=preprocessor_overrides,
    )
    return policy, preprocessor, postprocessor
@@ -129,15 +124,8 @@ def instantiate_original_xvla(
 ):
    """Instantiate original XVLA policy from the original implementation."""
    if from_pretrained:
-        processor = AutoProcessor.from_pretrained(
+        processor = AutoProcessor.from_pretrained(model_path, num_views=NUM_VIEWS, trust_remote_code=True)
-            model_path, 
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
            num_views=NUM_VIEWS, 
            trust_remote_code=True
        )
        model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True
        )
    else:
        raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
@@ -149,20 +137,23 @@ def instantiate_original_xvla(
 def create_dummy_data(device=DEVICE):
    """Create dummy data for testing both implementations."""
-    batch_size = 2
+    batch_size = 1
    prompt = "Pick up the red block and place it in the bin"
    # Create random RGB images in [0, 255] uint8 range (as PIL images would be)
    # Then convert to [0, 1] float32 range for LeRobot
-    def fake_rgb(H, W):
+    def fake_rgb(h, w):
-        arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8)
+        arr = np.random.randint(0, 255, (h, w, 3), dtype=np.uint8)
        t = torch.from_numpy(arr).permute(2, 0, 1)  # CHW
        t = t.float() / 255.0  # Normalize to [0, 1]
        return t
    batch = {
-        f"{OBS_IMAGES}.image": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device),
+        f"{OBS_IMAGES}.image": torch.stack(
-        f"{OBS_IMAGES}.image2": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device),
+            [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
        ).to(device),
        f"{OBS_IMAGES}.image2": torch.stack(
            [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
        ).to(device),
        OBS_STATE: torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device),
        "task": [prompt for _ in range(batch_size)],
    }
@@ -172,11 +163,9 @@ def create_dummy_data(device=DEVICE):
 def prepare_original_inputs(batch, processor, device=DEVICE):
    """Prepare inputs for the original XVLA model."""
    batch_size = batch[OBS_STATE].shape[0]
    # Convert images from [0, 1] to [0, 255] uint8 for processor
-    image1 = (batch[f"{OBS_IMAGES}.image"] * 255).byte()
+    image1 = (batch[f"{OBS_IMAGES}.image"]).byte()
-    image2 = (batch[f"{OBS_IMAGES}.image2"] * 255).byte()
+    image2 = (batch[f"{OBS_IMAGES}.image2"]).byte()
    # Get task instruction (use first one if batch)
    task_instruction = batch["task"][0] if isinstance(batch["task"], list) else batch["task"]
@@ -185,19 +174,23 @@ def prepare_original_inputs(batch, processor, device=DEVICE):
    # The processor expects a list of images per sample
    processed_inputs = processor(
        [image1[0], image2[0]],  # Process first sample only for now
-        task_instruction
+        task_instruction,
    )
    # Move to correct device and dtype
    dtype = torch.float32
-    inputs = {k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device) 
+    inputs = {
-              for k, v in processed_inputs.items()}
+        k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device)
        for k, v in processed_inputs.items()
    }
    # Add proprio and domain_id
-    inputs.update({
+    inputs.update(
        {
            "proprio": batch[OBS_STATE][:1].to(device),  # First sample only
            "domain_id": torch.tensor([LIBERO_DOMAIN_ID], dtype=torch.long, device=device),
-    })
+        }
    )
    return inputs
@@ -319,7 +312,9 @@ def test_xvla_original_vs_lerobot_pretrained():
        print(f"Max absolute difference: {max_diff:.6e}")
        print(f"Mean absolute difference: {mean_diff:.6e}")
-        print(f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%")
+        print(
            f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%"
        )
        # Check with different tolerances
        tolerances = [1e-5, 1e-4, 1e-3, 1e-2]
@@ -379,7 +374,7 @@ def test_xvla_inference_reproducibility():
        print("✔️ Inference is perfectly reproducible!")
    else:
        diff = torch.abs(actions_1 - actions_2)
-        print(f"⚠️ Small differences detected:")
+        print("⚠️ Small differences detected:")
        print(f"  Max diff: {diff.max().item():.6e}")
        print(f"  Mean diff: {diff.mean().item():.6e}")
@@ -406,4 +401,3 @@ if __name__ == "__main__":
        print(f"❌ Test failed with error: {e}")
        print("=" * 80)
        raise
@@ -1,190 +0,0 @@
 import random
 import numpy as np
 import torch
 from xvla.models.modeling_xvla import XVLA
 # from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.envs.factory import make_env_config
 from lerobot.policies.factory import make_policy, make_pre_post_processors
 from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
 torch.manual_seed(42)
 random.seed(42)
 np.random.seed(42)
 observation_height: int = 224
 observation_width: int = 224  # todo: jadechoghari, image size is different for the two models
 # create an observation dict
 OBS = {
    f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width),
    f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width),
    OBS_STATE: torch.randn(1, 20),  # ONLY if OBS_STATE is already a string
    "task": "put the object in the box",
 }
 IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
 IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
 def fake_rgb(H, W):
    arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8)
    t = torch.from_numpy(arr).permute(2, 0, 1)  # CHW
    t = t.unsqueeze(0).float()
    # NOTE: no ImageNet normalization is applied here; pixels are returned as unnormalized floats
    return t
 OBS[f"{OBS_IMAGES}.image"] = fake_rgb(observation_height, observation_width)
 OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width)
 cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
 cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
 env_cfg = make_env_config("libero", task="libero_spatial")
 policy = make_policy(
    cfg=cfg,
    env_cfg=env_cfg,
 )
 policy.eval()
 preprocessor_overrides = {
    "device_processor": {"device": str(cfg.device)},
 }
 preprocessor, postprocessor = make_pre_post_processors(
    policy_cfg=cfg,
    pretrained_path=cfg.pretrained_path,
    preprocessor_overrides=preprocessor_overrides,
 )
 observation = preprocessor(OBS)
 inputs = policy._build_model_inputs(observation)
 #### now the og model ###########################################################
 from xvla.models.processing_xvla import XVLAProcessor
 processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2)
 inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"])
 domain_id = torch.tensor([3], dtype=torch.long)
 inputs.update(
    {
        "proprio": OBS[OBS_STATE].to("cuda"),
        "domain_id": domain_id.to("cuda"),
    }
 )
 # check the preprocessor
 for k in inputs.keys() & inputs_1.keys():  # intersection of keys
    a = inputs[k]
    b = inputs_1[k].to("cuda")
    print(f"\n🔎 Key: {k}")
    # Check shape
    print("  shape:", a.shape, b.shape)
    # Check if close
    if torch.allclose(a, b, atol=1e-5, rtol=1e-5):
        print("  ✔️ tensors are equal (allclose)")
    else:
        diff = torch.abs(a - b)
        print("  ❌ tensors differ")
        print("  max diff:", diff.max().item())
        print("  mean diff:", diff.mean().item())
 model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
 model.eval()
 model.to("cuda")
 action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
 action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
 # np all close
 print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2))
 print("max diff:", np.max(np.abs(action - action_1)))
 print("mean diff:", np.mean(np.abs(action - action_1)))
 import random
 import numpy as np
 import torch
 from PIL import Image
 from xvla.models.configuration_xvla import XVLAConfig
 from xvla.models.modeling_xvla import XVLA
 from xvla.models.processor_xvla import XVLAProcessor
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.envs.factory import make_env_config
 from lerobot.policies.factory import make_policy
 cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero")
 model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
 model.eval()
 model.to("cuda")
 processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero")
 # /raid/jade/models/xvla-libero
 # set seed
 torch.manual_seed(42)
 random.seed(42)
 np.random.seed(42)
 def make_random_pil_images(num_images=3, H=480, W=640):
    images = []
    for _ in range(num_images):
        # Random RGB image
        arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
        img = Image.fromarray(arr)
        images.append(img)
    return images
 # Example:
 images = make_random_pil_images()
 language_instruction = "This is a random image"
 # Multimodal preprocessing by processor
 inputs = processor(images, language_instruction)
 if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
    raise ValueError("Processor did not return the expected keys.")
 proprio = torch.randn(1, 20)
 domain_id = torch.tensor([0], dtype=torch.long)
 # Align to model's device/dtype
 device = model.device
 dtype = next(model.parameters()).dtype
 def to_model(t: torch.Tensor) -> torch.Tensor:
    if not isinstance(t, torch.Tensor):
        t = torch.as_tensor(t)
    # cast floats to model dtype, keep integral/bool as-is
    return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device)
 inputs = {k: to_model(v) for k, v in inputs.items()}
 inputs.update(
    {
        "proprio": to_model(proprio),
        "domain_id": domain_id.to(device),
    }
 )
 # Inference
 action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
 #### now for lerobot model #####################################################
 cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
 env_cfg = make_env_config("libero", task="libero_spatial")
 cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
 policy = make_policy(cfg=cfg, env_cfg=env_cfg)
 policy.eval()
 policy.to("cuda")
 action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()