From 0e21f3fdf7d9418470eb7d0f357d2399a4de2161 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Tue, 25 Nov 2025 14:18:26 +0100 Subject: [PATCH] upgrade transformers version --- src/lerobot/envs/factory.py | 10 +- .../policies/xvla/configuration_xvla.py | 2 +- .../policies/xvla/modeling_florence2.py | 18 +- src/lerobot/policies/xvla/modeling_xvla.py | 8 +- .../xvla/test_xvla_original_vs_lerobot.py | 206 +++++++++--------- tests/policies/xvla/tester_xvla.py | 190 ---------------- 6 files changed, 116 insertions(+), 318 deletions(-) delete mode 100644 tests/policies/xvla/tester_xvla.py diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py index b39cfee71..3290a0f43 100644 --- a/src/lerobot/envs/factory.py +++ b/src/lerobot/envs/factory.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import importlib -from typing import Any +from typing import TYPE_CHECKING, Any import gymnasium as gym from gymnasium.envs.registration import registry as gym_registry @@ -22,10 +22,16 @@ from gymnasium.envs.registration import registry as gym_registry from lerobot.configs.policies import PreTrainedConfig from lerobot.envs.configs import AlohaEnv, EnvConfig, LiberoEnv, PushtEnv from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result -from lerobot.policies.xvla.configuration_xvla import XVLAConfig from lerobot.processor import ProcessorStep from lerobot.processor.env_processor import LiberoProcessorStep from lerobot.processor.pipeline import PolicyProcessorPipeline +from lerobot.utils.import_utils import _transformers_available + +# Conditional import for type checking and lazy loading +if TYPE_CHECKING or _transformers_available: + from lerobot.policies.xvla.configuration_xvla import XVLAConfig +else: + XVLAConfig = None def make_env_config(env_type: str, **kwargs) -> EnvConfig: diff --git a/src/lerobot/policies/xvla/configuration_xvla.py b/src/lerobot/policies/xvla/configuration_xvla.py index 2ecba245f..64e2c20f5 100644 --- a/src/lerobot/policies/xvla/configuration_xvla.py +++ b/src/lerobot/policies/xvla/configuration_xvla.py @@ -49,7 +49,7 @@ class XVLAConfig(PreTrainedConfig): normalization_mapping: dict[str, NormalizationMode] = field( default_factory=lambda: { "VISUAL": NormalizationMode.IDENTITY, - "STATE": NormalizationMode.MEAN_STD, + "STATE": NormalizationMode.IDENTITY, "ACTION": NormalizationMode.MEAN_STD, } ) diff --git a/src/lerobot/policies/xvla/modeling_florence2.py b/src/lerobot/policies/xvla/modeling_florence2.py index f783d0818..e65e15967 100644 --- a/src/lerobot/policies/xvla/modeling_florence2.py +++ b/src/lerobot/policies/xvla/modeling_florence2.py @@ -2350,22 +2350,8 @@ class Florence2PreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" - - @property - def _supports_flash_attn_2(self): - """ - Retrieve language_model's attribute to check whether the model supports - Flash Attention 2 or not. - """ - return self.language_model._supports_flash_attn_2 - - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. - """ - return self.language_model._supports_sdpa + _supports_flash_attn_2 = True + _supports_sdpa = True FLORENCE2_INPUTS_DOCSTRING = r""" diff --git a/src/lerobot/policies/xvla/modeling_xvla.py b/src/lerobot/policies/xvla/modeling_xvla.py index fd5dc6401..cb1784e8f 100644 --- a/src/lerobot/policies/xvla/modeling_xvla.py +++ b/src/lerobot/policies/xvla/modeling_xvla.py @@ -18,6 +18,7 @@ from __future__ import annotations +import builtins import os from collections import deque from pathlib import Path @@ -26,13 +27,14 @@ import torch import torch.nn.functional as F # noqa: N812 from torch import Tensor, nn -from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.configs.policies import PreTrainedConfig +from lerobot.policies.pretrained import PreTrainedPolicy, T from lerobot.policies.utils import populate_queues from lerobot.utils.constants import ACTION, OBS_LANGUAGE_TOKENS, OBS_STATE from .action_hub import build_action_space from .configuration_florence2 import Florence2Config -from .configuration_xvla import XVLAConfig, XVLAConfig as PreTrainedConfig +from .configuration_xvla import XVLAConfig from .modeling_florence2 import Florence2ForConditionalGeneration from .soft_transformer import SoftPromptedTransformer @@ -400,7 +402,7 @@ class XVLAPolicy(PreTrainedPolicy): @classmethod def from_pretrained( - cls, + cls: builtins.type[T], pretrained_name_or_path: str | Path, *, config: PreTrainedConfig | None = None, diff --git a/tests/policies/xvla/test_xvla_original_vs_lerobot.py b/tests/policies/xvla/test_xvla_original_vs_lerobot.py index 51ecb6357..579c0a1cc 100644 --- a/tests/policies/xvla/test_xvla_original_vs_lerobot.py +++ b/tests/policies/xvla/test_xvla_original_vs_lerobot.py @@ -26,6 +26,10 @@ import numpy as np import pytest import torch +from lerobot.policies.xvla.configuration_xvla import XVLAConfig +from lerobot.policies.xvla.modeling_xvla import XVLAPolicy +from lerobot.policies.xvla.processor_xvla import make_xvla_pre_post_processors + # Skip if transformers is not available pytest.importorskip("transformers") @@ -37,9 +41,6 @@ pytestmark = pytest.mark.skipif( from transformers import AutoModel, AutoProcessor # noqa: E402 -from lerobot.configs.policies import PreTrainedConfig # noqa: E402 -from lerobot.envs.factory import make_env_config # noqa: E402 -from lerobot.policies.factory import make_policy, make_pre_post_processors # noqa: E402 from lerobot.processor import PolicyAction, PolicyProcessorPipeline # noqa: E402 from lerobot.utils.constants import OBS_IMAGES, OBS_STATE # noqa: E402 @@ -50,8 +51,8 @@ IMAGE_HEIGHT = 224 IMAGE_WIDTH = 224 NUM_VIEWS = 2 # Number of camera views DEVICE = "cuda" if torch.cuda.is_available() else "cpu" -MODEL_PATH_LEROBOT = "lerobot/xvla-base" -MODEL_PATH_ORIGINAL = "2toINF/X-VLA-Pt" +MODEL_PATH_LEROBOT = "lerobot/xvla-widowx" +MODEL_PATH_ORIGINAL = "2toINF/X-VLA-WidowX" LIBERO_DOMAIN_ID = 0 # Domain ID for examples purposes @@ -93,33 +94,27 @@ def instantiate_lerobot_xvla( ]: """Instantiate LeRobot XVLA policy with preprocessor and postprocessor.""" if from_pretrained: - cfg = PreTrainedConfig.from_pretrained(model_path) - cfg.pretrained_path = model_path + policy = XVLAPolicy.from_pretrained( + pretrained_name_or_path=model_path, + strict=False, + ) else: - # For non-pretrained, we'd need to create a config from scratch - raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet") + config = XVLAConfig( + base_model_path=model_path, + n_action_steps=DUMMY_ACTION_DIM, + chunk_size=DUMMY_ACTION_DIM, + device=DEVICE, + num_image_views=NUM_VIEWS, + ) # add resize_imgs_with_padding=IMAGE_SIZE, IMAGE_SIZE? + policy = XVLAPolicy(config) - cfg.device = DEVICE - env_cfg = make_env_config("libero", task="libero_spatial") - - policy = make_policy( - cfg=cfg, - env_cfg=env_cfg, - ) - policy.to(DEVICE) - policy.eval() - - preprocessor_overrides = { - "device_processor": {"device": str(cfg.device)}, - } - - preprocessor, postprocessor = make_pre_post_processors( - policy_cfg=cfg, - pretrained_path=cfg.pretrained_path, - preprocessor_overrides=preprocessor_overrides, + policy.config.device = DEVICE + preprocessor, postprocessor = make_xvla_pre_post_processors( + config=policy.config, + dataset_stats=None, # Pass None for dataset_stats to disable normalization (original XVLA doesn't normalize) ) - + return policy, preprocessor, postprocessor @@ -129,76 +124,74 @@ def instantiate_original_xvla( ): """Instantiate original XVLA policy from the original implementation.""" if from_pretrained: - processor = AutoProcessor.from_pretrained( - model_path, - num_views=NUM_VIEWS, - trust_remote_code=True - ) - model = AutoModel.from_pretrained( - model_path, - trust_remote_code=True - ) + processor = AutoProcessor.from_pretrained(model_path, num_views=NUM_VIEWS, trust_remote_code=True) + model = AutoModel.from_pretrained(model_path, trust_remote_code=True) else: raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet") - + model.to(DEVICE) model.eval() - + return model, processor def create_dummy_data(device=DEVICE): """Create dummy data for testing both implementations.""" - batch_size = 2 + batch_size = 1 prompt = "Pick up the red block and place it in the bin" - + # Create random RGB images in [0, 255] uint8 range (as PIL images would be) # Then convert to [0, 1] float32 range for LeRobot - def fake_rgb(H, W): - arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8) + def fake_rgb(h, w): + arr = np.random.randint(0, 255, (h, w, 3), dtype=np.uint8) t = torch.from_numpy(arr).permute(2, 0, 1) # CHW - t = t.float() / 255.0 # Normalize to [0, 1] return t - + batch = { - f"{OBS_IMAGES}.image": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device), - f"{OBS_IMAGES}.image2": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device), + f"{OBS_IMAGES}.image": torch.stack( + [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)] + ).to(device), + f"{OBS_IMAGES}.image2": torch.stack( + [fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)] + ).to(device), OBS_STATE: torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device), "task": [prompt for _ in range(batch_size)], } - + return batch def prepare_original_inputs(batch, processor, device=DEVICE): """Prepare inputs for the original XVLA model.""" - batch_size = batch[OBS_STATE].shape[0] - # Convert images from [0, 1] to [0, 255] uint8 for processor - image1 = (batch[f"{OBS_IMAGES}.image"] * 255).byte() - image2 = (batch[f"{OBS_IMAGES}.image2"] * 255).byte() - + image1 = (batch[f"{OBS_IMAGES}.image"]).byte() + image2 = (batch[f"{OBS_IMAGES}.image2"]).byte() + # Get task instruction (use first one if batch) task_instruction = batch["task"][0] if isinstance(batch["task"], list) else batch["task"] - + # Process images and text through original processor # The processor expects a list of images per sample processed_inputs = processor( [image1[0], image2[0]], # Process first sample only for now - task_instruction + task_instruction, ) - + # Move to correct device and dtype dtype = torch.float32 - inputs = {k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device) - for k, v in processed_inputs.items()} - + inputs = { + k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device) + for k, v in processed_inputs.items() + } + # Add proprio and domain_id - inputs.update({ - "proprio": batch[OBS_STATE][:1].to(device), # First sample only - "domain_id": torch.tensor([LIBERO_DOMAIN_ID], dtype=torch.long, device=device), - }) - + inputs.update( + { + "proprio": batch[OBS_STATE][:1].to(device), # First sample only + "domain_id": torch.tensor([LIBERO_DOMAIN_ID], dtype=torch.long, device=device), + } + ) + return inputs @@ -207,46 +200,46 @@ def test_xvla_preprocessor_alignment(): print("\n" + "=" * 80) print("Test: XVLA Preprocessor Alignment") print("=" * 80) - + set_seed_all(42) - + print("\n[LeRobot] Instantiating policy and preprocessor...") lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla( from_pretrained=True ) - + print("\n[Original] Instantiating model and processor...") original_model, original_processor = instantiate_original_xvla(from_pretrained=True) - + print("\nCreating dummy data...") batch = create_dummy_data() - + print("\n[LeRobot] Preprocessing...") lerobot_observation = lerobot_preprocessor(deepcopy(batch)) lerobot_inputs = lerobot_policy._build_model_inputs(lerobot_observation) - + print("\n[Original] Preprocessing...") original_inputs = prepare_original_inputs(batch, original_processor) - + print("\nComparing preprocessor outputs:") print("-" * 80) - + # Compare common keys common_keys = set(lerobot_inputs.keys()) & set(original_inputs.keys()) print(f"Common keys: {common_keys}") - + for key in common_keys: lerobot_tensor = lerobot_inputs[key] original_tensor = original_inputs[key] - + print(f"\nšŸ”Ž Key: {key}") print(f" LeRobot shape: {lerobot_tensor.shape}") print(f" Original shape: {original_tensor.shape}") - + # Handle batch size difference (we only process first sample for original) if lerobot_tensor.shape[0] > original_tensor.shape[0]: lerobot_tensor = lerobot_tensor[:1] - + if lerobot_tensor.shape == original_tensor.shape: if torch.allclose(lerobot_tensor, original_tensor, atol=1e-5, rtol=1e-5): print(" āœ”ļø Tensors are equal (allclose with atol=1e-5)") @@ -258,7 +251,7 @@ def test_xvla_preprocessor_alignment(): print(f" Std diff: {diff.std().item():.6e}") else: print(" āš ļø Shapes don't match after alignment") - + cleanup_memory() @@ -267,67 +260,69 @@ def test_xvla_original_vs_lerobot_pretrained(): print("\n" + "=" * 80) print("Test: XVLA Original vs LeRobot with Pretrained Weights (Inference)") print("=" * 80) - + set_seed_all(42) - + print("\n[LeRobot] Instantiating policy...") lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla( from_pretrained=True ) - + print("\n[Original] Instantiating model...") original_model, original_processor = instantiate_original_xvla(from_pretrained=True) - + print("\nCreating dummy data...") batch = create_dummy_data() - + print("\n[LeRobot] Running inference...") lerobot_observation = lerobot_preprocessor(deepcopy(batch)) lerobot_inputs = lerobot_policy._build_model_inputs(lerobot_observation) - + # Reset seed for inference torch.manual_seed(42) with torch.no_grad(): lerobot_actions = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10) lerobot_actions = lerobot_actions.squeeze(0).float().cpu() - + print(f"LeRobot actions shape: {lerobot_actions.shape}") print(f"LeRobot actions mean: {lerobot_actions.mean().item():.6f}") print(f"LeRobot actions std: {lerobot_actions.std().item():.6f}") - + print("\n[Original] Running inference...") original_inputs = prepare_original_inputs(batch, original_processor) - + # Reset seed for inference torch.manual_seed(42) with torch.no_grad(): original_actions = original_model.generate_actions(**original_inputs, steps=10) original_actions = original_actions.squeeze(0).float().cpu() - + print(f"Original actions shape: {original_actions.shape}") print(f"Original actions mean: {original_actions.mean().item():.6f}") print(f"Original actions std: {original_actions.std().item():.6f}") - + print("\nAction Comparison:") print("-" * 80) - + # Compare actions if lerobot_actions.shape == original_actions.shape: diff = torch.abs(lerobot_actions - original_actions) max_diff = diff.max().item() mean_diff = diff.mean().item() - + print(f"Max absolute difference: {max_diff:.6e}") print(f"Mean absolute difference: {mean_diff:.6e}") - print(f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%") - + print( + f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%" + ) + # Check with different tolerances tolerances = [1e-5, 1e-4, 1e-3, 1e-2] for tol in tolerances: is_close = torch.allclose(lerobot_actions, original_actions, atol=tol) status = "āœ”ļø" if is_close else "āŒ" print(f"{status} Actions close (atol={tol}): {is_close}") - + # Assert with reasonable tolerance tolerance = 1e-3 assert torch.allclose(lerobot_actions, original_actions, atol=tolerance), ( @@ -336,7 +331,7 @@ def test_xvla_original_vs_lerobot_pretrained(): print(f"\nāœ… Success: Actions match within tolerance ({tolerance})!") else: print(f"āš ļø Shape mismatch: LeRobot {lerobot_actions.shape} vs Original {original_actions.shape}") - + cleanup_memory() @@ -345,15 +340,15 @@ def test_xvla_inference_reproducibility(): print("\n" + "=" * 80) print("Test: XVLA Inference Reproducibility") print("=" * 80) - + print("\n[LeRobot] Instantiating policy...") lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla( from_pretrained=True ) - + print("\nCreating dummy data...") batch = create_dummy_data() - + # First inference print("\n[Run 1] Running inference...") set_seed_all(42) @@ -362,7 +357,7 @@ def test_xvla_inference_reproducibility(): with torch.no_grad(): actions_1 = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10) actions_1 = actions_1.squeeze(0).float().cpu() - + # Second inference with same seed print("\n[Run 2] Running inference with same seed...") set_seed_all(42) @@ -371,20 +366,20 @@ def test_xvla_inference_reproducibility(): with torch.no_grad(): actions_2 = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10) actions_2 = actions_2.squeeze(0).float().cpu() - + print("\nComparing two runs:") print("-" * 80) - + if torch.allclose(actions_1, actions_2, atol=1e-8): print("āœ”ļø Inference is perfectly reproducible!") else: diff = torch.abs(actions_1 - actions_2) - print(f"āš ļø Small differences detected:") + print("āš ļø Small differences detected:") print(f" Max diff: {diff.max().item():.6e}") print(f" Mean diff: {diff.mean().item():.6e}") - + assert torch.allclose(actions_1, actions_2, atol=1e-6), "Inference should be reproducible!" - + cleanup_memory() @@ -392,12 +387,12 @@ if __name__ == "__main__": print("\n" + "=" * 80) print("XVLA Original vs LeRobot Comparison Test Suite") print("=" * 80) - + try: test_xvla_preprocessor_alignment() test_xvla_original_vs_lerobot_pretrained() test_xvla_inference_reproducibility() - + print("\n" + "=" * 80) print("āœ… All tests passed!") print("=" * 80) @@ -406,4 +401,3 @@ if __name__ == "__main__": print(f"āŒ Test failed with error: {e}") print("=" * 80) raise - diff --git a/tests/policies/xvla/tester_xvla.py b/tests/policies/xvla/tester_xvla.py deleted file mode 100644 index 53e65cb40..000000000 --- a/tests/policies/xvla/tester_xvla.py +++ /dev/null @@ -1,190 +0,0 @@ -import random - -import numpy as np -import torch -from xvla.models.modeling_xvla import XVLA - -# from lerobot.policies.xvla.configuration_xvla import XVLAConfig -from lerobot.configs.policies import PreTrainedConfig -from lerobot.envs.factory import make_env_config -from lerobot.policies.factory import make_policy, make_pre_post_processors -from lerobot.utils.constants import OBS_IMAGES, OBS_STATE - -torch.manual_seed(42) -random.seed(42) -np.random.seed(42) -observation_height: int = 224 -observation_width: int = 224 # todo: jadechoghari, image size is different for the two models -# create an observation dict -OBS = { - f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width), - f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width), - OBS_STATE: torch.randn(1, 20), # ONLY if OBS_STATE is already a string - "task": "put the object in the box", -} - -IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1) -IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1) - - -def fake_rgb(H, W): - arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8) - t = torch.from_numpy(arr).permute(2, 0, 1) # CHW - t = t.unsqueeze(0).float() - # normalize pixel to imagenet - return t - - -OBS[f"{OBS_IMAGES}.image"] = fake_rgb(observation_height, observation_width) -OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width) - -cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated") -cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated" -env_cfg = make_env_config("libero", task="libero_spatial") -policy = make_policy( - cfg=cfg, - env_cfg=env_cfg, -) - -policy.eval() - -preprocessor_overrides = { - "device_processor": {"device": str(cfg.device)}, -} - -preprocessor, postprocessor = make_pre_post_processors( - policy_cfg=cfg, - pretrained_path=cfg.pretrained_path, - preprocessor_overrides=preprocessor_overrides, -) - -observation = preprocessor(OBS) -inputs = policy._build_model_inputs(observation) - - -#### now the og model ########################################################### -from xvla.models.processing_xvla import XVLAProcessor - -processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2) -inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"]) -domain_id = torch.tensor([3], dtype=torch.long) -inputs.update( - { - "proprio": OBS[OBS_STATE].to("cuda"), - "domain_id": domain_id.to("cuda"), - } -) - -# check the preprocessor -for k in inputs.keys() & inputs_1.keys(): # intersection of keys - a = inputs[k] - b = inputs_1[k].to("cuda") - - print(f"\nšŸ”Ž Key: {k}") - - # Check shape - print(" shape:", a.shape, b.shape) - - # Check if close - if torch.allclose(a, b, atol=1e-5, rtol=1e-5): - print(" āœ”ļø tensors are equal (allclose)") - else: - diff = torch.abs(a - b) - print(" āŒ tensors differ") - print(" max diff:", diff.max().item()) - print(" mean diff:", diff.mean().item()) - - -model = XVLA.from_pretrained("/raid/jade/models/xvla-libero") -model.eval() -model.to("cuda") - -action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() -action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() - -# np all close -print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2)) -print("max diff:", np.max(np.abs(action - action_1))) -print("mean diff:", np.mean(np.abs(action - action_1))) - - -import random - -import numpy as np -import torch -from PIL import Image -from xvla.models.configuration_xvla import XVLAConfig -from xvla.models.modeling_xvla import XVLA -from xvla.models.processor_xvla import XVLAProcessor - -from lerobot.configs.policies import PreTrainedConfig -from lerobot.envs.factory import make_env_config -from lerobot.policies.factory import make_policy - -cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero") -model = XVLA.from_pretrained("/raid/jade/models/xvla-libero") -model.eval() -model.to("cuda") -processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero") -# /raid/jade/models/xvla-libero -# seet seed -torch.manual_seed(42) -random.seed(42) -np.random.seed(42) - - -def make_random_pil_images(num_images=3, H=480, W=640): - images = [] - for _ in range(num_images): - # Random RGB image - arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8) - img = Image.fromarray(arr) - images.append(img) - return images - - -# Example: -images = make_random_pil_images() -language_instruction = "This is a random image" -# Multimodal preprocessing by processor -inputs = processor(images, language_instruction) -if not {"input_ids", "image_input", "image_mask"}.issubset(inputs): - raise ValueError("Processor did not return the expected keys.") - -proprio = torch.randn(1, 20) -domain_id = torch.tensor([0], dtype=torch.long) - -# Align to model's device/dtype -device = model.device -dtype = next(model.parameters()).dtype - - -def to_model(t: torch.Tensor) -> torch.Tensor: - if not isinstance(t, torch.Tensor): - t = torch.as_tensor(t) - # cast floats to model dtype, keep integral/bool as-is - return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device) - - -inputs = {k: to_model(v) for k, v in inputs.items()} -inputs.update( - { - "proprio": to_model(proprio), - "domain_id": domain_id.to(device), - } -) - -# Inference -action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() - - -#### now for lerobot model ##################################################### - -cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated") -env_cfg = make_env_config("libero", task="libero_spatial") -cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated" -policy = make_policy(cfg=cfg, env_cfg=env_cfg) -policy.eval() -policy.to("cuda") - -action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() \ No newline at end of file