From 39260a581a66e57c3a2061f82f76d7857a3d80a0 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Sat, 15 Nov 2025 16:41:23 +0100
Subject: [PATCH] update files

---
 convert.sh                                    |   4 +
 eval.sh                                       |   7 ++
 output.txt                                    |   0
 prepend.py                                    |  28 +++++
 src/lerobot/envs/libero.py                    |  11 +-
 src/lerobot/policies/factory.py               |   2 -
 .../policies/xvla/configuration_xvla.py       | 107 +++++++++---------
 src/lerobot/policies/xvla/inference.py        |  54 +++++++++
 src/lerobot/policies/xvla/inference_pipe.py   |  59 ++++++++++
 src/lerobot/policies/xvla/modeling_xvla.py    |  15 +--
 src/lerobot/policies/xvla/utils.py            |  48 ++++++++
 .../processor/migrate_policy_normalization.py |   1 +
 src/lerobot/processor/normalize_processor.py  |   1 -
 src/lerobot/scripts/lerobot_eval.py           |  61 +++++++++-
 14 files changed, 326 insertions(+), 72 deletions(-)
 create mode 100644 convert.sh
 create mode 100644 eval.sh
 create mode 100644 output.txt
 create mode 100644 prepend.py
 create mode 100644 src/lerobot/policies/xvla/inference.py
 create mode 100644 src/lerobot/policies/xvla/inference_pipe.py
 create mode 100644 src/lerobot/policies/xvla/utils.py

diff --git a/convert.sh b/convert.sh
new file mode 100644
index 000000000..c7d4998dc
--- /dev/null
+++ b/convert.sh
@@ -0,0 +1,4 @@
+python src/lerobot/processor/migrate_policy_normalization.py \
+        --pretrained-path /raid/jade/models/xvla-libero-og \
+        --output-dir /raid/jade/models/xvla-libero-og-migrated \
+        --branch main
\ No newline at end of file
diff --git a/eval.sh b/eval.sh
new file mode 100644
index 000000000..6b69ebd98
--- /dev/null
+++ b/eval.sh
@@ -0,0 +1,7 @@
+lerobot-eval \
+  --policy.path="/raid/jade/models/xvla-libero-og_migrated" \
+  --env.type=libero \
+  --env.task=libero_spatial \
+  --eval.batch_size=1 \
+  --eval.n_episodes=1
+
diff --git a/output.txt b/output.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/prepend.py b/prepend.py
new file mode 100644
index 000000000..8e587c431
--- /dev/null
+++ b/prepend.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+import safetensors.torch as st
+import torch
+import argparse
+import os
+
+def prefix_state_dict(input_path, output_path, prefix="model."):
+    # Load original checkpoint
+    state_dict = st.load_file(input_path)
+
+    print(f"Loaded {len(state_dict)} tensors from {input_path}")
+
+    # Add prefix to every key
+    new_state_dict = {f"{prefix}{k}": v for k, v in state_dict.items()}
+
+    print(f"Writing prefixed checkpoint with {len(new_state_dict)} keys...")
+    st.save_file(new_state_dict, output_path)
+
+    print(f"Saved to {output_path}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", type=str, required=True, help="Path to model.safetensors")
+    parser.add_argument("--output", type=str, required=True, help="Output prefixed model.safetensors")
+    parser.add_argument("--prefix", type=str, default="model.", help="Prefix to add to each key")
+    args = parser.parse_args()
+
+    prefix_state_dict(args.input, args.output, args.prefix)
diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py
index 94b08e991..9075f0577 100644
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -29,7 +29,7 @@ from gymnasium import spaces
 from libero.libero import benchmark, get_libero_path
 from libero.libero.envs import OffScreenRenderEnv
 from robosuite.utils.transform_utils import quat2axisangle
-
+from lerobot.policies.xvla.utils import Mat_to_Rotate6D
 
 def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
     """Normalize camera_name into a non-empty list of strings."""
@@ -81,14 +81,14 @@ def get_libero_dummy_action():
     return [0, 0, 0, 0, 0, 0, -1]
 
 
-OBS_STATE_DIM = 8
+OBS_STATE_DIM = 20
 ACTION_DIM = 7
 AGENT_POS_LOW = -1000.0
 AGENT_POS_HIGH = 1000.0
 ACTION_LOW = -1.0
 ACTION_HIGH = 1.0
 TASK_SUITE_MAX_STEPS: dict[str, int] = {
-    "libero_spatial": 280,  # longest training demo has 193 steps
+    "libero_spatial": 800,  # longest training demo has 193 steps
     "libero_object": 280,  # longest training demo has 254 steps
     "libero_goal": 300,  # longest training demo has 270 steps
     "libero_10": 520,  # longest training demo has 505 steps
@@ -221,6 +221,11 @@ class LiberoEnv(gym.Env):
                 raw_obs["robot0_gripper_qpos"],
             )
         )
+        # add new obs for XVLA: jadechoghari
+        robo_ori = Mat_to_Rotate6D(self._env.robots[0].controller.ee_ori_mat)
+        robo_pos = self._env.robots[0].controller.ee_pos
+        proprio = np.concatenate([robo_pos, robo_ori, np.array([0.0])], axis=-1)
+        state = np.concatenate([proprio, np.zeros_like(proprio)], axis=-1)
         agent_pos = state
         if self.obs_type == "pixels":
             return {"pixels": images.copy()}
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index 0aec74811..0f8afafe4 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -428,8 +428,6 @@ def make_policy(
     else:
         # Make a fresh policy.
         policy = policy_cls(**kwargs)
-        kwargs["pretrained_name_or_path"] = "/fsx/jade_choghari/.cache/huggingface/model/xvla-libero"
-        policy = policy_cls.from_pretrained(**kwargs)
 
     policy.to(cfg.device)
     assert isinstance(policy, torch.nn.Module)
diff --git a/src/lerobot/policies/xvla/configuration_xvla.py b/src/lerobot/policies/xvla/configuration_xvla.py
index ed3badc75..865896277 100644
--- a/src/lerobot/policies/xvla/configuration_xvla.py
+++ b/src/lerobot/policies/xvla/configuration_xvla.py
@@ -28,6 +28,8 @@ from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
 from lerobot.utils.constants import OBS_IMAGES
 
 from .configuration_florence2 import Florence2Config
+from .configuration_florence2 import Florence2VisionConfig
+from .configuration_florence2 import Florence2LanguageConfig
 
 
 @PreTrainedConfig.register_subclass("xvla")
@@ -56,7 +58,7 @@ class XVLAConfig(PreTrainedConfig):
     )
 
     # Florence2 backbone and tokenizer configuration
-    florence_config: dict[str, Any] | Florence2Config = field(default_factory=dict)
+    florence_config: dict[str, Any] = field(default_factory=dict)
     tokenizer_name: str = "facebook/bart-large"
     tokenizer_max_length: int = 64
     tokenizer_padding_side: str = "right"
@@ -81,7 +83,7 @@ class XVLAConfig(PreTrainedConfig):
     domain_feature_key: str | None = None
 
     # Vision preprocessing
-    resize_imgs_with_padding: tuple[int, int] | None = (518, 518)
+    resize_imgs_with_padding: tuple[int, int] | None = None
     num_image_views: int | None = None
     empty_cameras: int = 0
 
@@ -107,8 +109,6 @@ class XVLAConfig(PreTrainedConfig):
             raise ValueError(
                 f"`n_action_steps` ({self.n_action_steps}) must be <= `chunk_size` ({self.chunk_size})."
             )
-        if isinstance(self.florence_config, Florence2Config):
-            self.florence_config = self.florence_config.to_dict()
         if self.num_image_views is not None and self.num_image_views <= 0:
             raise ValueError("`num_image_views` must be > 0 when specified.")
         self._florence_config_obj: Florence2Config | None = None
@@ -118,56 +118,55 @@ class XVLAConfig(PreTrainedConfig):
         Build (and cache) the Florence2 transformer config that should back the VLM.
         """
         if self._florence_config_obj is None:
-            if isinstance(self.florence_config, Florence2Config):
-                self._florence_config_obj = self.florence_config
-            else:
-                # TODO: jadechoghari: provide default way, and do not hardcode
-                # Ensure vision_config and text_config are provided with defaults if not specified
-                config_dict = dict(self.florence_config)
-                if "vision_config" not in config_dict or config_dict["vision_config"] is None:
-                    # Provide default vision config
-                    config_dict["vision_config"] = {
-                        "model_type": "davit",
-                        "drop_path_rate": 0.1,
-                        "patch_size": [7, 3, 3, 3],
-                        "patch_stride": [4, 2, 2, 2],
-                        "patch_padding": [3, 1, 1, 1],
-                        "patch_prenorm": [False, True, True, True],
-                        "enable_checkpoint": False,
-                        "dim_embed": [256, 512, 1024, 2048],
-                        "num_heads": [8, 16, 32, 64],
-                        "num_groups": [8, 16, 32, 64],
-                        "depths": [1, 1, 9, 1],
-                        "window_size": 12,
-                        "projection_dim": 1024,
-                        "visual_temporal_embedding": {
-                            "type": "COSINE",
-                            "max_temporal_embeddings": 100
-                        },
-                        "image_pos_embed": {
-                            "type": "learned_abs_2d",
-                            "max_pos_embeddings": 50
-                        },
-                        "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
-                        }
-                if "text_config" not in config_dict or config_dict["text_config"] is None:
-                    # Provide default text config
-                    config_dict["text_config"] = {
-                        "vocab_size": 51289,
-                        "activation_dropout": 0.1,
-                        "activation_function": "gelu",
-                        "attention_dropout": 0.1,
-                        "d_model": 1024,
-                        "decoder_attention_heads": 16,
-                        "decoder_layers": 12,
-                        "encoder_attention_heads": 16,
-                        "encoder_layers": 12,
-                        "dropout": 0.1,
-                        "max_position_embeddings": 4096,
-                        "num_hidden_layers": 12,
-                        "num_beams": 3
-                        }
-                self._florence_config_obj = Florence2Config(**config_dict)
+            # TODO: jadechoghari: provide default way, and do not hardcode
+            # Ensure vision_config and text_config are provided with defaults if not specified
+            config_dict = dict(self.florence_config)
+            if "vision_config" not in config_dict or config_dict["vision_config"] is None:
+                raise ValueError("vision_config is required")
+                #     # Provide default vision config
+                #     config_dict["vision_config"] = {
+                #         "model_type": "davit",
+                #         "drop_path_rate": 0.1,
+                #         "patch_size": [7, 3, 3, 3],
+                #         "patch_stride": [4, 2, 2, 2],
+                #         "patch_padding": [3, 1, 1, 1],
+                #         "patch_prenorm": [False, True, True, True],
+                #         "enable_checkpoint": False,
+                #         "dim_embed": [256, 512, 1024, 2048],
+                #         "num_heads": [8, 16, 32, 64],
+                #         "num_groups": [8, 16, 32, 64],
+                #         "depths": [1, 1, 9, 1],
+                #         "window_size": 12,
+                #         "projection_dim": 1024,
+                #         "visual_temporal_embedding": {
+                #             "type": "COSINE",
+                #             "max_temporal_embeddings": 100
+                #         },
+                #         "image_pos_embed": {
+                #             "type": "learned_abs_2d",
+                #             "max_pos_embeddings": 50
+                #         },
+                #         "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
+                #         }
+            if "text_config" not in config_dict or config_dict["text_config"] is None:
+                raise ValueError("text_config is required")
+            #     # Provide default text config
+            #     config_dict["text_config"] = {
+            #         "vocab_size": 51289,
+            #         "activation_dropout": 0.1,
+            #         "activation_function": "gelu",
+            #         "attention_dropout": 0.1,
+            #         "d_model": 1024,
+            #         "decoder_attention_heads": 16,
+            #         "decoder_layers": 12,
+            #         "encoder_attention_heads": 16,
+            #         "encoder_layers": 12,
+            #         "dropout": 0.1,
+            #         "max_position_embeddings": 4096,
+            #         "num_hidden_layers": 12,
+            #         "num_beams": 3
+            #         }
+            self._florence_config_obj = Florence2Config(**config_dict)
         return self._florence_config_obj
 
     def validate_features(self) -> None:
diff --git a/src/lerobot/policies/xvla/inference.py b/src/lerobot/policies/xvla/inference.py
new file mode 100644
index 000000000..00511d72c
--- /dev/null
+++ b/src/lerobot/policies/xvla/inference.py
@@ -0,0 +1,54 @@
+from transformers import AutoModel, AutoProcessor
+import json_numpy
+import numpy as np
+import torch
+from PIL import Image
+
+model = AutoModel.from_pretrained(
+    "2toINF/X-VLA-WidowX",
+    trust_remote_code=True
+)
+
+processor = AutoProcessor.from_pretrained("2toINF/X-VLA-WidowX", trust_remote_code=True)
+# append 3 random image to a list
+def make_random_pil_images(num_images=3, H=480, W=640):
+    images = []
+    for _ in range(num_images):
+        # Random RGB image
+        arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
+        img = Image.fromarray(arr)
+        images.append(img)
+    return images
+
+# Example:
+images = make_random_pil_images()
+language_instruction = "This is a random image"
+# Multimodal preprocessing by processor
+inputs = processor(images, language_instruction)
+if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
+    raise ValueError("Processor did not return the expected keys.")
+
+proprio = torch.randn(1, 20)
+domain_id = torch.tensor([int(0)], dtype=torch.long)
+
+# Align to model's device/dtype
+device = model.device
+dtype = next(model.parameters()).dtype
+
+def to_model(t: torch.Tensor) -> torch.Tensor:
+    if not isinstance(t, torch.Tensor):
+        t = torch.as_tensor(t)
+    # cast floats to model dtype, keep integral/bool as-is
+    return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device)
+
+inputs = {k: to_model(v) for k, v in inputs.items()}
+inputs.update({
+    "proprio": to_model(proprio),
+    "domain_id": domain_id.to(device),
+})
+
+# Inference
+
+action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
+
+breakpoint()
diff --git a/src/lerobot/policies/xvla/inference_pipe.py b/src/lerobot/policies/xvla/inference_pipe.py
new file mode 100644
index 000000000..da42965c1
--- /dev/null
+++ b/src/lerobot/policies/xvla/inference_pipe.py
@@ -0,0 +1,59 @@
+from lerobot.policies.factory import make_policy, make_pre_post_processors
+# from lerobot.policies.xvla.configuration_xvla import XVLAConfig
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.envs.factory import make_env_config
+from lerobot.policies.xvla.utils import Rotate6D_to_AxisAngle
+from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
+import torch
+import numpy as np
+
+observation_height: int = 360
+observation_width: int = 360
+# create an observation dict
+OBS = {
+    f"{OBS_IMAGES}.image1": torch.randn(1, 3, observation_height, observation_width),
+    f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width),
+    OBS_STATE: torch.randn(1, 9),                  # ONLY if OBS_STATE is already a string
+    "task": "put the object in the box",
+}
+def fake_rgb(H, W):
+    img = torch.randint(0, 255, (H, W, 3), dtype=torch.uint8).numpy()
+    return img
+
+OBS[f"{OBS_IMAGES}.image1"] = fake_rgb(observation_height, observation_width)
+OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width)
+
+# observation = preprocessor(OBS)
+from transformers import AutoProcessor
+processor = AutoProcessor.from_pretrained("2toINF/X-VLA-WidowX", num_views=2, trust_remote_code=True)
+inputs = processor([OBS[f"{OBS_IMAGES}.image1"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"])
+breakpoint()
+
+cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
+cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
+env_cfg = make_env_config("libero", task="libero_spatial")
+policy = make_policy(
+    cfg=cfg,
+    env_cfg=env_cfg,
+)
+
+policy.eval()
+
+preprocessor_overrides = {
+        "device_processor": {"device": str(cfg.device)},
+}
+
+preprocessor, postprocessor = make_pre_post_processors(
+        policy_cfg=cfg,
+        pretrained_path=cfg.pretrained_path,
+        preprocessor_overrides=preprocessor_overrides,
+)
+
+observation = preprocessor(OBS)
+action = policy.select_action(observation)
+
+target_eef = action[:, :3].to("cpu").numpy()
+target_axis = Rotate6D_to_AxisAngle(action[:, 3:9].to("cpu").numpy())
+target_act = action[:, 9:10].to("cpu").numpy()
+final_action = np.concatenate([target_eef, target_axis, target_act], axis=-1)
+breakpoint()
\ No newline at end of file
diff --git a/src/lerobot/policies/xvla/modeling_xvla.py b/src/lerobot/policies/xvla/modeling_xvla.py
index 1e3db3517..56a111da3 100644
--- a/src/lerobot/policies/xvla/modeling_xvla.py
+++ b/src/lerobot/policies/xvla/modeling_xvla.py
@@ -94,10 +94,9 @@ class XVLAModel(nn.Module):
         batch_size, num_views = pixel_values.shape[:2]
         flat_mask = image_mask.view(-1).to(dtype=torch.bool)
         flat_images = pixel_values.flatten(0, 1)
-
         #TODO: jadechoghari: remove this resizing logic, and provide a way in training to do this
-        target_size = (224, 224)
-        flat_images = F.interpolate(flat_images, size=target_size, mode="bilinear", align_corners=False)
+        # target_size = (224, 224)
+        # flat_images = F.interpolate(flat_images, size=target_size, mode="bilinear", align_corners=False)
 
 
         num_valid = int(flat_mask.sum().item())
@@ -197,7 +196,6 @@ class XVLAPolicy(PreTrainedPolicy):
     def __init__(self, config: XVLAConfig):
         super().__init__(config)
         config.validate_features()
-
         florence_config = config.get_florence_config()
         proprio_dim = config.max_state_dim if config.use_proprio else 0
         self.model = XVLAModel(config=config, florence_config=florence_config, proprio_dim=proprio_dim)
@@ -381,7 +379,6 @@ class XVLAPolicy(PreTrainedPolicy):
 
         model_id = str(pretrained_name_or_path)
         instance = cls(config, **kwargs)
-
         # --- Step 2: Locate model.safetensors ---
         if os.path.isdir(model_id):
             print("Loading weights from local directory")
@@ -422,9 +419,13 @@ class XVLAPolicy(PreTrainedPolicy):
             k: v for k, v in new_state_dict.items()
             if k not in keys_to_skip
         }
-
+        # ---- ADD THIS: Fix shared embeddings ----
+        encoder_key = "model.vlm.language_model.model.encoder.embed_tokens.weight"
+        shared_key  = "model.vlm.language_model.model.shared.weight"
+        if encoder_key in state_dict:
+            state_dict[shared_key] = state_dict[encoder_key]
         # --- Step 5: Load into instance ---
-        missing, unexpected = instance.load_state_dict(new_state_dict, strict=False)
+        missing, unexpected = instance.load_state_dict(state_dict, strict=True)
         print("✅ Loaded XVLA checkpoint with modified keys.")
         if missing:
             print(f"Missing keys: {missing}")
diff --git a/src/lerobot/policies/xvla/utils.py b/src/lerobot/policies/xvla/utils.py
new file mode 100644
index 000000000..0603777ac
--- /dev/null
+++ b/src/lerobot/policies/xvla/utils.py
@@ -0,0 +1,48 @@
+import numpy as np
+import robosuite.utils.transform_utils as T
+def Rotate6D_to_AxisAngle(r6d):
+        """
+        r6d: np.ndarray, shape (N, 6)
+        return: np.ndarray, shape (N, 3), axis-angle vectors
+        """
+        flag = 0
+        if len(r6d.shape) == 1:
+            r6d = r6d[None, ...]
+            flag = 1
+            
+        a1 = r6d[:, 0:3]
+        a2 = r6d[:, 3:6]
+        
+        # b1
+        b1 = a1 / (np.linalg.norm(a1, axis=-1, keepdims=True) + 1e-6)
+        
+        # b2
+        dot_prod = np.sum(b1 * a2, axis=-1, keepdims=True)
+        b2_orth = a2 - dot_prod * b1
+        b2 = b2_orth / (np.linalg.norm(b2_orth, axis=-1, keepdims=True) + 1e-6)
+        
+        # b3
+        b3 = np.cross(b1, b2, axis=-1)
+        
+        R = np.stack([b1, b2, b3], axis=-1)  # shape: (N, 3, 3)
+        
+        axis_angle_list = []
+        for i in range(R.shape[0]):
+            quat = T.mat2quat(R[i])
+            axis_angle = T.quat2axisangle(quat)
+            axis_angle_list.append(axis_angle)
+        
+        axis_angle_array = np.stack(axis_angle_list, axis=0)  # shape: (N, 3)
+        
+        if flag == 1:   
+            axis_angle_array = axis_angle_array[0]
+                    
+        return axis_angle_array
+        
+def Mat_to_Rotate6D(abs_action):
+        if len(abs_action.shape) == 2:
+            return np.concatenate([abs_action[:3, 0], abs_action[:3, 1]], axis=-1)
+        elif len(abs_action.shape) == 3:
+            return np.concatenate([abs_action[:, :3, 0], abs_action[:, :3, 1]], axis=-1)
+        else:
+            raise NotImplementedError
\ No newline at end of file
diff --git a/src/lerobot/processor/migrate_policy_normalization.py b/src/lerobot/processor/migrate_policy_normalization.py
index 525b7431c..0b614c7c0 100644
--- a/src/lerobot/processor/migrate_policy_normalization.py
+++ b/src/lerobot/processor/migrate_policy_normalization.py
@@ -325,6 +325,7 @@ def load_state_dict_with_missing_key_handling(
     Returns:
         List of problematic missing keys that weren't in the whitelist.
     """
+    state_dict['model.vlm.language_model.model.encoder.embed_tokens.weight'] = state_dict['model.vlm.language_model.model.shared.weight'].clone()
     # Load the cleaned state dict with strict=False to capture missing/unexpected keys
     load_result = policy.load_state_dict(state_dict, strict=False)
 
diff --git a/src/lerobot/processor/normalize_processor.py b/src/lerobot/processor/normalize_processor.py
index 368c9b270..1257315c4 100644
--- a/src/lerobot/processor/normalize_processor.py
+++ b/src/lerobot/processor/normalize_processor.py
@@ -321,7 +321,6 @@ class _NormalizationMixin:
                 self.to(device=tensor.device, dtype=tensor.dtype)
 
         stats = self._tensor_stats[key]
-
         if norm_mode == NormalizationMode.MEAN_STD:
             mean = stats.get("mean", None)
             std = stats.get("std", None)
diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py
index 0d66fa1aa..3ef5ffad0 100644
--- a/src/lerobot/scripts/lerobot_eval.py
+++ b/src/lerobot/scripts/lerobot_eval.py
@@ -89,7 +89,7 @@ from lerobot.utils.utils import (
     init_logging,
     inside_slurm,
 )
-
+from lerobot.policies.xvla.utils import Rotate6D_to_AxisAngle
 
 def rollout(
     env: gym.vector.VectorEnv,
@@ -155,6 +155,17 @@ def rollout(
         disable=inside_slurm(),  # we dont want progress bar when we use slurm, since it clutters the logs
         leave=False,
     )
+    from transformers import AutoProcessor, AutoModel
+    model = AutoModel.from_pretrained(
+        "2toINF/X-VLA-WidowX",
+        trust_remote_code=True,
+        device="cuda"
+    )
+    model.to("cuda")
+    processor = AutoProcessor.from_pretrained("2toINF/X-VLA-WidowX", num_views=2, trust_remote_code=True)
+
+    from collections import deque
+    action_queue = deque(maxlen=30)
     check_env_attributes_and_types(env)
     while not np.all(done) and step < max_steps:
         # Numpy array to tensor and changing dictionary keys to LeRobot policy format.
@@ -165,13 +176,54 @@ def rollout(
         # Infer "task" from attributes of environments.
         # TODO: works with SyncVectorEnv but not AsyncVectorEnv
         observation = add_envs_task(env, observation)
+        inputs = processor([observation[f"observation.images.image"], observation[f"observation.images.image2"]], observation["task"], do_rescale=False)
         observation = preprocessor(observation)
+        observation["observation.images.image"] = inputs["image_input"][:, 0, ...].to("cuda")
+        observation["observation.images.image2"] = inputs["image_input"][:, 1, ...].to("cuda")
+        observation["observation.language.tokens"] = inputs["input_ids"].to("cuda")
+
+        # (Pdb) inputs.keys()
+        # dict_keys(['input_ids', 'image_input', 'image_mask', 'proprio', 'domain_id'])
+        # image_input should be torch.Size([1, 2, 3, 224, 224])
+        img0 = observation["observation.images.image"]      # [1, 3, 224, 224]
+        img1 = observation["observation.images.image2"]     # [1, 3, 224, 224]
+        img0 = img0.unsqueeze(1)   # [1, 1, 3, 224, 224]
+        img1 = img1.unsqueeze(1)   # [1, 1, 3, 224, 224]
+        obs = {}
+        obs['input_ids'] = observation["observation.language.tokens"].to("cuda")
+        obs['image_input'] = torch.cat([img0, img1], dim=1).to("cuda")
+        obs['domain_id'] = torch.tensor([int(3)], dtype=torch.long).to("cuda")
+        obs['proprio'] = observation["observation.state"].to("cuda")
+        obs['image_mask'] = inputs["image_mask"].to("cuda")
+
         with torch.inference_mode():
-            action = policy.select_action(observation)
-        action = postprocessor(action)
+            action_1 = policy.select_action(observation).to("cpu").numpy()
+            if len(action_queue) == 0:
+                action = model.generate_actions(**obs, steps=10)     # shape (1, 30, 20)
+                actions_np = action.detach().cpu().numpy()
+                # add each timestep as (1, 20)
+                for t in range(actions_np.shape[1]):
+                    act_t = actions_np[:, t, :] 
+                    action_queue.append(act_t)
+                action = action_queue.popleft()
+            else:
+                action = action_queue.popleft()
+        # action = postprocessor(action)
+        # breakpoint()
+        # .to("cpu").numpy()
+        target_eef = action[:, :3]
+        target_axis = Rotate6D_to_AxisAngle(action[:, 3:9])
+        target_act = action[:, 9:10]
+        action_numpy = np.concatenate([target_eef, target_axis, target_act], axis=-1)
+
+        target_eef_1 = action_1[:, :3]
+        target_axis_1 = Rotate6D_to_AxisAngle(action_1[:, 3:9])
+        target_act_1 = action_1[:, 9:10]
+        action_numpy_1 = np.concatenate([target_eef_1, target_axis_1, target_act_1], axis=-1)
+        breakpoint()
 
         # Convert to CPU / numpy.
-        action_numpy: np.ndarray = action.to("cpu").numpy()
+        # action_numpy: np.ndarray = action.to("cpu").numpy()
         assert action_numpy.ndim == 2, "Action dimensions should be (batch, action_dim)"
 
         # Apply the next action.
@@ -497,7 +549,6 @@ def eval_main(cfg: EvalPipelineConfig):
     envs = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
 
     logging.info("Making policy.")
-
     policy = make_policy(
         cfg=cfg.policy,
         env_cfg=cfg.env,