From 0e21f3fdf7d9418470eb7d0f357d2399a4de2161 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Tue, 25 Nov 2025 14:18:26 +0100
Subject: [PATCH] upgrade transformers version

---
 src/lerobot/envs/factory.py                   |  10 +-
 .../policies/xvla/configuration_xvla.py       |   2 +-
 .../policies/xvla/modeling_florence2.py       |  18 +-
 src/lerobot/policies/xvla/modeling_xvla.py    |   8 +-
 .../xvla/test_xvla_original_vs_lerobot.py     | 206 +++++++++---------
 tests/policies/xvla/tester_xvla.py            | 190 ----------------
 6 files changed, 116 insertions(+), 318 deletions(-)
 delete mode 100644 tests/policies/xvla/tester_xvla.py

diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py
index b39cfee71..3290a0f43 100644
--- a/src/lerobot/envs/factory.py
+++ b/src/lerobot/envs/factory.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import gymnasium as gym
 from gymnasium.envs.registration import registry as gym_registry
@@ -22,10 +22,16 @@ from gymnasium.envs.registration import registry as gym_registry
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.envs.configs import AlohaEnv, EnvConfig, LiberoEnv, PushtEnv
 from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result
-from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 from lerobot.processor import ProcessorStep
 from lerobot.processor.env_processor import LiberoProcessorStep
 from lerobot.processor.pipeline import PolicyProcessorPipeline
+from lerobot.utils.import_utils import _transformers_available
+
+# Import XVLAConfig only when transformers is installed (or for type checkers); else fall back to None.
+if TYPE_CHECKING or _transformers_available:
+    from lerobot.policies.xvla.configuration_xvla import XVLAConfig
+else:
+    XVLAConfig = None
 
 
 def make_env_config(env_type: str, **kwargs) -> EnvConfig:
diff --git a/src/lerobot/policies/xvla/configuration_xvla.py b/src/lerobot/policies/xvla/configuration_xvla.py
index 2ecba245f..64e2c20f5 100644
--- a/src/lerobot/policies/xvla/configuration_xvla.py
+++ b/src/lerobot/policies/xvla/configuration_xvla.py
@@ -49,7 +49,7 @@ class XVLAConfig(PreTrainedConfig):
     normalization_mapping: dict[str, NormalizationMode] = field(
         default_factory=lambda: {
             "VISUAL": NormalizationMode.IDENTITY,
-            "STATE": NormalizationMode.MEAN_STD,
+            "STATE": NormalizationMode.IDENTITY,
             "ACTION": NormalizationMode.MEAN_STD,
         }
     )
diff --git a/src/lerobot/policies/xvla/modeling_florence2.py b/src/lerobot/policies/xvla/modeling_florence2.py
index f783d0818..e65e15967 100644
--- a/src/lerobot/policies/xvla/modeling_florence2.py
+++ b/src/lerobot/policies/xvla/modeling_florence2.py
@@ -2350,22 +2350,8 @@ class Florence2PreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _skip_keys_device_placement = "past_key_values"
-
-    @property
-    def _supports_flash_attn_2(self):
-        """
-        Retrieve language_model's attribute to check whether the model supports
-        Flash Attention 2 or not.
-        """
-        return self.language_model._supports_flash_attn_2
-
-    @property
-    def _supports_sdpa(self):
-        """
-        Retrieve language_model's attribute to check whether the model supports
-        SDPA or not.
-        """
-        return self.language_model._supports_sdpa
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
 
 
 FLORENCE2_INPUTS_DOCSTRING = r"""
diff --git a/src/lerobot/policies/xvla/modeling_xvla.py b/src/lerobot/policies/xvla/modeling_xvla.py
index fd5dc6401..cb1784e8f 100644
--- a/src/lerobot/policies/xvla/modeling_xvla.py
+++ b/src/lerobot/policies/xvla/modeling_xvla.py
@@ -18,6 +18,7 @@
 
 from __future__ import annotations
 
+import builtins
 import os
 from collections import deque
 from pathlib import Path
@@ -26,13 +27,14 @@ import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn
 
-from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.policies.pretrained import PreTrainedPolicy, T
 from lerobot.policies.utils import populate_queues
 from lerobot.utils.constants import ACTION, OBS_LANGUAGE_TOKENS, OBS_STATE
 
 from .action_hub import build_action_space
 from .configuration_florence2 import Florence2Config
-from .configuration_xvla import XVLAConfig, XVLAConfig as PreTrainedConfig
+from .configuration_xvla import XVLAConfig
 from .modeling_florence2 import Florence2ForConditionalGeneration
 from .soft_transformer import SoftPromptedTransformer
 
@@ -400,7 +402,7 @@ class XVLAPolicy(PreTrainedPolicy):
 
     @classmethod
     def from_pretrained(
-        cls,
+        cls: builtins.type[T],
         pretrained_name_or_path: str | Path,
         *,
         config: PreTrainedConfig | None = None,
diff --git a/tests/policies/xvla/test_xvla_original_vs_lerobot.py b/tests/policies/xvla/test_xvla_original_vs_lerobot.py
index 51ecb6357..579c0a1cc 100644
--- a/tests/policies/xvla/test_xvla_original_vs_lerobot.py
+++ b/tests/policies/xvla/test_xvla_original_vs_lerobot.py
@@ -26,6 +26,10 @@ import numpy as np
 import pytest
 import torch
 
+from lerobot.policies.xvla.configuration_xvla import XVLAConfig
+from lerobot.policies.xvla.modeling_xvla import XVLAPolicy
+from lerobot.policies.xvla.processor_xvla import make_xvla_pre_post_processors
+
 # Skip if transformers is not available
 pytest.importorskip("transformers")
 
@@ -37,9 +41,6 @@ pytestmark = pytest.mark.skipif(
 
 from transformers import AutoModel, AutoProcessor  # noqa: E402
 
-from lerobot.configs.policies import PreTrainedConfig  # noqa: E402
-from lerobot.envs.factory import make_env_config  # noqa: E402
-from lerobot.policies.factory import make_policy, make_pre_post_processors  # noqa: E402
 from lerobot.processor import PolicyAction, PolicyProcessorPipeline  # noqa: E402
 from lerobot.utils.constants import OBS_IMAGES, OBS_STATE  # noqa: E402
 
@@ -50,8 +51,8 @@ IMAGE_HEIGHT = 224
 IMAGE_WIDTH = 224
 NUM_VIEWS = 2  # Number of camera views
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_PATH_LEROBOT = "lerobot/xvla-base"
-MODEL_PATH_ORIGINAL = "2toINF/X-VLA-Pt"
+MODEL_PATH_LEROBOT = "lerobot/xvla-widowx"
+MODEL_PATH_ORIGINAL = "2toINF/X-VLA-WidowX"
 LIBERO_DOMAIN_ID = 0  # Domain ID for examples purposes
 
 
@@ -93,33 +94,27 @@ def instantiate_lerobot_xvla(
 ]:
     """Instantiate LeRobot XVLA policy with preprocessor and postprocessor."""
     if from_pretrained:
-        cfg = PreTrainedConfig.from_pretrained(model_path)
-        cfg.pretrained_path = model_path
+        policy = XVLAPolicy.from_pretrained(
+            pretrained_name_or_path=model_path,
+            strict=False,
+        )
     else:
-        # For non-pretrained, we'd need to create a config from scratch
-        raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
+        config = XVLAConfig(
+            base_model_path=model_path,
+            n_action_steps=DUMMY_ACTION_DIM,
+            chunk_size=DUMMY_ACTION_DIM,
+            device=DEVICE,
+            num_image_views=NUM_VIEWS,
+        )  # TODO: should resize_imgs_with_padding=(IMAGE_HEIGHT, IMAGE_WIDTH) be passed here as well?
+        policy = XVLAPolicy(config)
 
-    cfg.device = DEVICE
-    env_cfg = make_env_config("libero", task="libero_spatial")
-    
-    policy = make_policy(
-        cfg=cfg,
-        env_cfg=env_cfg,
-    )
-    
     policy.to(DEVICE)
-    policy.eval()
-    
-    preprocessor_overrides = {
-        "device_processor": {"device": str(cfg.device)},
-    }
-    
-    preprocessor, postprocessor = make_pre_post_processors(
-        policy_cfg=cfg,
-        pretrained_path=cfg.pretrained_path,
-        preprocessor_overrides=preprocessor_overrides,
+    policy.config.device = DEVICE
+    preprocessor, postprocessor = make_xvla_pre_post_processors(
+        config=policy.config,
+        dataset_stats=None,  # Pass None for dataset_stats to disable normalization (original XVLA doesn't normalize)
     )
-    
+
     return policy, preprocessor, postprocessor
 
 
@@ -129,76 +124,74 @@ def instantiate_original_xvla(
 ):
     """Instantiate original XVLA policy from the original implementation."""
     if from_pretrained:
-        processor = AutoProcessor.from_pretrained(
-            model_path, 
-            num_views=NUM_VIEWS, 
-            trust_remote_code=True
-        )
-        model = AutoModel.from_pretrained(
-            model_path,
-            trust_remote_code=True
-        )
+        processor = AutoProcessor.from_pretrained(model_path, num_views=NUM_VIEWS, trust_remote_code=True)
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
     else:
         raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
-    
+
     model.to(DEVICE)
     model.eval()
-    
+
     return model, processor
 
 
 def create_dummy_data(device=DEVICE):
     """Create dummy data for testing both implementations."""
-    batch_size = 2
+    batch_size = 1
     prompt = "Pick up the red block and place it in the bin"
-    
+
     # Create random RGB images in [0, 255] uint8 range (as PIL images would be)
-    # Then convert to [0, 1] float32 range for LeRobot
+    # and keep them as uint8 CHW tensors (no [0, 1] float conversion is applied here)
-    def fake_rgb(H, W):
-        arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8)
+    def fake_rgb(h, w):
+        arr = np.random.randint(0, 255, (h, w, 3), dtype=np.uint8)
         t = torch.from_numpy(arr).permute(2, 0, 1)  # CHW
-        t = t.float() / 255.0  # Normalize to [0, 1]
         return t
-    
+
     batch = {
-        f"{OBS_IMAGES}.image": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device),
-        f"{OBS_IMAGES}.image2": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device),
+        f"{OBS_IMAGES}.image": torch.stack(
+            [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
+        ).to(device),
+        f"{OBS_IMAGES}.image2": torch.stack(
+            [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
+        ).to(device),
         OBS_STATE: torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device),
         "task": [prompt for _ in range(batch_size)],
     }
-    
+
     return batch
 
 
 def prepare_original_inputs(batch, processor, device=DEVICE):
     """Prepare inputs for the original XVLA model."""
-    batch_size = batch[OBS_STATE].shape[0]
-    
-    # Convert images from [0, 1] to [0, 255] uint8 for processor
+    # Images already hold raw 0-255 pixel values; cast to uint8 for the processor
-    image1 = (batch[f"{OBS_IMAGES}.image"] * 255).byte()
-    image2 = (batch[f"{OBS_IMAGES}.image2"] * 255).byte()
-    
+    image1 = (batch[f"{OBS_IMAGES}.image"]).byte()
+    image2 = (batch[f"{OBS_IMAGES}.image2"]).byte()
+
     # Get task instruction (use first one if batch)
     task_instruction = batch["task"][0] if isinstance(batch["task"], list) else batch["task"]
-    
+
     # Process images and text through original processor
     # The processor expects a list of images per sample
     processed_inputs = processor(
         [image1[0], image2[0]],  # Process first sample only for now
-        task_instruction
+        task_instruction,
     )
-    
+
     # Move to correct device and dtype
     dtype = torch.float32
-    inputs = {k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device) 
-              for k, v in processed_inputs.items()}
-    
+    inputs = {
+        k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device)
+        for k, v in processed_inputs.items()
+    }
+
     # Add proprio and domain_id
-    inputs.update({
-        "proprio": batch[OBS_STATE][:1].to(device),  # First sample only
-        "domain_id": torch.tensor([LIBERO_DOMAIN_ID], dtype=torch.long, device=device),
-    })
-    
+    inputs.update(
+        {
+            "proprio": batch[OBS_STATE][:1].to(device),  # First sample only
+            "domain_id": torch.tensor([LIBERO_DOMAIN_ID], dtype=torch.long, device=device),
+        }
+    )
+
     return inputs
 
 
@@ -207,46 +200,46 @@ def test_xvla_preprocessor_alignment():
     print("\n" + "=" * 80)
     print("Test: XVLA Preprocessor Alignment")
     print("=" * 80)
-    
+
     set_seed_all(42)
-    
+
     print("\n[LeRobot] Instantiating policy and preprocessor...")
     lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla(
         from_pretrained=True
     )
-    
+
     print("\n[Original] Instantiating model and processor...")
     original_model, original_processor = instantiate_original_xvla(from_pretrained=True)
-    
+
     print("\nCreating dummy data...")
     batch = create_dummy_data()
-    
+
     print("\n[LeRobot] Preprocessing...")
     lerobot_observation = lerobot_preprocessor(deepcopy(batch))
     lerobot_inputs = lerobot_policy._build_model_inputs(lerobot_observation)
-    
+
     print("\n[Original] Preprocessing...")
     original_inputs = prepare_original_inputs(batch, original_processor)
-    
+
     print("\nComparing preprocessor outputs:")
     print("-" * 80)
-    
+
     # Compare common keys
     common_keys = set(lerobot_inputs.keys()) & set(original_inputs.keys())
     print(f"Common keys: {common_keys}")
-    
+
     for key in common_keys:
         lerobot_tensor = lerobot_inputs[key]
         original_tensor = original_inputs[key]
-        
+
         print(f"\n🔎 Key: {key}")
         print(f"  LeRobot shape: {lerobot_tensor.shape}")
         print(f"  Original shape: {original_tensor.shape}")
-        
+
         # Handle batch size difference (we only process first sample for original)
         if lerobot_tensor.shape[0] > original_tensor.shape[0]:
             lerobot_tensor = lerobot_tensor[:1]
-        
+
         if lerobot_tensor.shape == original_tensor.shape:
             if torch.allclose(lerobot_tensor, original_tensor, atol=1e-5, rtol=1e-5):
                 print("  ✔️ Tensors are equal (allclose with atol=1e-5)")
@@ -258,7 +251,7 @@ def test_xvla_preprocessor_alignment():
                 print(f"  Std diff: {diff.std().item():.6e}")
         else:
             print("  ⚠️ Shapes don't match after alignment")
-    
+
     cleanup_memory()
 
 
@@ -267,67 +260,69 @@ def test_xvla_original_vs_lerobot_pretrained():
     print("\n" + "=" * 80)
     print("Test: XVLA Original vs LeRobot with Pretrained Weights (Inference)")
     print("=" * 80)
-    
+
     set_seed_all(42)
-    
+
     print("\n[LeRobot] Instantiating policy...")
     lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla(
         from_pretrained=True
     )
-    
+
     print("\n[Original] Instantiating model...")
     original_model, original_processor = instantiate_original_xvla(from_pretrained=True)
-    
+
     print("\nCreating dummy data...")
     batch = create_dummy_data()
-    
+
     print("\n[LeRobot] Running inference...")
     lerobot_observation = lerobot_preprocessor(deepcopy(batch))
     lerobot_inputs = lerobot_policy._build_model_inputs(lerobot_observation)
-    
+
     # Reset seed for inference
     torch.manual_seed(42)
     with torch.no_grad():
         lerobot_actions = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10)
         lerobot_actions = lerobot_actions.squeeze(0).float().cpu()
-    
+
     print(f"LeRobot actions shape: {lerobot_actions.shape}")
     print(f"LeRobot actions mean: {lerobot_actions.mean().item():.6f}")
     print(f"LeRobot actions std: {lerobot_actions.std().item():.6f}")
-    
+
     print("\n[Original] Running inference...")
     original_inputs = prepare_original_inputs(batch, original_processor)
-    
+
     # Reset seed for inference
     torch.manual_seed(42)
     with torch.no_grad():
         original_actions = original_model.generate_actions(**original_inputs, steps=10)
         original_actions = original_actions.squeeze(0).float().cpu()
-    
+
     print(f"Original actions shape: {original_actions.shape}")
     print(f"Original actions mean: {original_actions.mean().item():.6f}")
     print(f"Original actions std: {original_actions.std().item():.6f}")
-    
+
     print("\nAction Comparison:")
     print("-" * 80)
-    
+
     # Compare actions
     if lerobot_actions.shape == original_actions.shape:
         diff = torch.abs(lerobot_actions - original_actions)
         max_diff = diff.max().item()
         mean_diff = diff.mean().item()
-        
+
         print(f"Max absolute difference: {max_diff:.6e}")
         print(f"Mean absolute difference: {mean_diff:.6e}")
-        print(f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%")
-        
+        print(
+            f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%"
+        )
+
         # Check with different tolerances
         tolerances = [1e-5, 1e-4, 1e-3, 1e-2]
         for tol in tolerances:
             is_close = torch.allclose(lerobot_actions, original_actions, atol=tol)
             status = "✔️" if is_close else "❌"
             print(f"{status} Actions close (atol={tol}): {is_close}")
-        
+
         # Assert with reasonable tolerance
         tolerance = 1e-3
         assert torch.allclose(lerobot_actions, original_actions, atol=tolerance), (
@@ -336,7 +331,7 @@ def test_xvla_original_vs_lerobot_pretrained():
         print(f"\n✅ Success: Actions match within tolerance ({tolerance})!")
     else:
         print(f"⚠️ Shape mismatch: LeRobot {lerobot_actions.shape} vs Original {original_actions.shape}")
-    
+
     cleanup_memory()
 
 
@@ -345,15 +340,15 @@ def test_xvla_inference_reproducibility():
     print("\n" + "=" * 80)
     print("Test: XVLA Inference Reproducibility")
     print("=" * 80)
-    
+
     print("\n[LeRobot] Instantiating policy...")
     lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla(
         from_pretrained=True
     )
-    
+
     print("\nCreating dummy data...")
     batch = create_dummy_data()
-    
+
     # First inference
     print("\n[Run 1] Running inference...")
     set_seed_all(42)
@@ -362,7 +357,7 @@ def test_xvla_inference_reproducibility():
     with torch.no_grad():
         actions_1 = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10)
         actions_1 = actions_1.squeeze(0).float().cpu()
-    
+
     # Second inference with same seed
     print("\n[Run 2] Running inference with same seed...")
     set_seed_all(42)
@@ -371,20 +366,20 @@ def test_xvla_inference_reproducibility():
     with torch.no_grad():
         actions_2 = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10)
         actions_2 = actions_2.squeeze(0).float().cpu()
-    
+
     print("\nComparing two runs:")
     print("-" * 80)
-    
+
     if torch.allclose(actions_1, actions_2, atol=1e-8):
         print("✔️ Inference is perfectly reproducible!")
     else:
         diff = torch.abs(actions_1 - actions_2)
-        print(f"⚠️ Small differences detected:")
+        print("⚠️ Small differences detected:")
         print(f"  Max diff: {diff.max().item():.6e}")
         print(f"  Mean diff: {diff.mean().item():.6e}")
-    
+
     assert torch.allclose(actions_1, actions_2, atol=1e-6), "Inference should be reproducible!"
-    
+
     cleanup_memory()
 
 
@@ -392,12 +387,12 @@ if __name__ == "__main__":
     print("\n" + "=" * 80)
     print("XVLA Original vs LeRobot Comparison Test Suite")
     print("=" * 80)
-    
+
     try:
         test_xvla_preprocessor_alignment()
         test_xvla_original_vs_lerobot_pretrained()
         test_xvla_inference_reproducibility()
-        
+
         print("\n" + "=" * 80)
         print("✅ All tests passed!")
         print("=" * 80)
@@ -406,4 +401,3 @@ if __name__ == "__main__":
         print(f"❌ Test failed with error: {e}")
         print("=" * 80)
         raise
-
diff --git a/tests/policies/xvla/tester_xvla.py b/tests/policies/xvla/tester_xvla.py
deleted file mode 100644
index 53e65cb40..000000000
--- a/tests/policies/xvla/tester_xvla.py
+++ /dev/null
@@ -1,190 +0,0 @@
-import random
-
-import numpy as np
-import torch
-from xvla.models.modeling_xvla import XVLA
-
-# from lerobot.policies.xvla.configuration_xvla import XVLAConfig
-from lerobot.configs.policies import PreTrainedConfig
-from lerobot.envs.factory import make_env_config
-from lerobot.policies.factory import make_policy, make_pre_post_processors
-from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
-
-torch.manual_seed(42)
-random.seed(42)
-np.random.seed(42)
-observation_height: int = 224
-observation_width: int = 224  # todo: jadechoghari, image size is different for the two models
-# create an observation dict
-OBS = {
-    f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width),
-    f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width),
-    OBS_STATE: torch.randn(1, 20),  # ONLY if OBS_STATE is already a string
-    "task": "put the object in the box",
-}
-
-IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
-IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
-
-
-def fake_rgb(H, W):
-    arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8)
-    t = torch.from_numpy(arr).permute(2, 0, 1)  # CHW
-    t = t.unsqueeze(0).float()
-    # normalize pixel to imagenet
-    return t
-
-
-OBS[f"{OBS_IMAGES}.image"] = fake_rgb(observation_height, observation_width)
-OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width)
-
-cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
-cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
-env_cfg = make_env_config("libero", task="libero_spatial")
-policy = make_policy(
-    cfg=cfg,
-    env_cfg=env_cfg,
-)
-
-policy.eval()
-
-preprocessor_overrides = {
-    "device_processor": {"device": str(cfg.device)},
-}
-
-preprocessor, postprocessor = make_pre_post_processors(
-    policy_cfg=cfg,
-    pretrained_path=cfg.pretrained_path,
-    preprocessor_overrides=preprocessor_overrides,
-)
-
-observation = preprocessor(OBS)
-inputs = policy._build_model_inputs(observation)
-
-
-#### now the og model ###########################################################
-from xvla.models.processing_xvla import XVLAProcessor
-
-processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2)
-inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"])
-domain_id = torch.tensor([3], dtype=torch.long)
-inputs.update(
-    {
-        "proprio": OBS[OBS_STATE].to("cuda"),
-        "domain_id": domain_id.to("cuda"),
-    }
-)
-
-# check the preprocessor
-for k in inputs.keys() & inputs_1.keys():  # intersection of keys
-    a = inputs[k]
-    b = inputs_1[k].to("cuda")
-
-    print(f"\n🔎 Key: {k}")
-
-    # Check shape
-    print("  shape:", a.shape, b.shape)
-
-    # Check if close
-    if torch.allclose(a, b, atol=1e-5, rtol=1e-5):
-        print("  ✔️ tensors are equal (allclose)")
-    else:
-        diff = torch.abs(a - b)
-        print("  ❌ tensors differ")
-        print("  max diff:", diff.max().item())
-        print("  mean diff:", diff.mean().item())
-
-
-model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
-model.eval()
-model.to("cuda")
-
-action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
-action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
-
-# np all close
-print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2))
-print("max diff:", np.max(np.abs(action - action_1)))
-print("mean diff:", np.mean(np.abs(action - action_1)))
-
-
-import random
-
-import numpy as np
-import torch
-from PIL import Image
-from xvla.models.configuration_xvla import XVLAConfig
-from xvla.models.modeling_xvla import XVLA
-from xvla.models.processor_xvla import XVLAProcessor
-
-from lerobot.configs.policies import PreTrainedConfig
-from lerobot.envs.factory import make_env_config
-from lerobot.policies.factory import make_policy
-
-cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero")
-model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
-model.eval()
-model.to("cuda")
-processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero")
-# /raid/jade/models/xvla-libero
-# seet seed
-torch.manual_seed(42)
-random.seed(42)
-np.random.seed(42)
-
-
-def make_random_pil_images(num_images=3, H=480, W=640):
-    images = []
-    for _ in range(num_images):
-        # Random RGB image
-        arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
-        img = Image.fromarray(arr)
-        images.append(img)
-    return images
-
-
-# Example:
-images = make_random_pil_images()
-language_instruction = "This is a random image"
-# Multimodal preprocessing by processor
-inputs = processor(images, language_instruction)
-if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
-    raise ValueError("Processor did not return the expected keys.")
-
-proprio = torch.randn(1, 20)
-domain_id = torch.tensor([0], dtype=torch.long)
-
-# Align to model's device/dtype
-device = model.device
-dtype = next(model.parameters()).dtype
-
-
-def to_model(t: torch.Tensor) -> torch.Tensor:
-    if not isinstance(t, torch.Tensor):
-        t = torch.as_tensor(t)
-    # cast floats to model dtype, keep integral/bool as-is
-    return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device)
-
-
-inputs = {k: to_model(v) for k, v in inputs.items()}
-inputs.update(
-    {
-        "proprio": to_model(proprio),
-        "domain_id": domain_id.to(device),
-    }
-)
-
-# Inference
-action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
-
-
-#### now for lerobot model #####################################################
-
-cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
-env_cfg = make_env_config("libero", task="libero_spatial")
-cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
-policy = make_policy(cfg=cfg, env_cfg=env_cfg)
-policy.eval()
-policy.to("cuda")
-
-action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
\ No newline at end of file