add testing

2026-05-18 10:10:08 +00:00 · 2025-11-25 09:31:27 +01:00
parent 722766b825
commit 936a6728f0
2 changed files with 599 additions and 0 deletions
@@ -0,0 +1,409 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test script to verify XVLA policy integration with LeRobot vs the original implementation, only meant to be run locally!"""
+
+import gc
+import os
+import random
+from copy import deepcopy
+from typing import Any
+
+import numpy as np
+import pytest
+import torch
+
+# Skip the whole module at collection time when transformers is not installed.
+pytest.importorskip("transformers")
+
+# Skip every test in this module when running in CI: the skip triggers only when
+# the CI or GITHUB_ACTIONS environment variable is exactly the string "true".
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
+    reason="This test requires XVLA model access and is not meant for CI",
+)
+
+from transformers import AutoModel, AutoProcessor  # noqa: E402
+
+from lerobot.configs.policies import PreTrainedConfig  # noqa: E402
+from lerobot.envs.factory import make_env_config  # noqa: E402
+from lerobot.policies.factory import make_policy, make_pre_post_processors  # noqa: E402
+from lerobot.processor import PolicyAction, PolicyProcessorPipeline  # noqa: E402
+from lerobot.utils.constants import OBS_IMAGES, OBS_STATE  # noqa: E402
+
+# Constants shared by the helpers and tests below.
+DUMMY_ACTION_DIM = 7  # Standard robot arm action dimension
+DUMMY_STATE_DIM = 20  # Proprioceptive state dimension
+IMAGE_HEIGHT = 224
+IMAGE_WIDTH = 224
+NUM_VIEWS = 2  # Number of camera views
+# Prefer CUDA when it is available, otherwise run everything on the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Checkpoint identifiers passed to the respective `from_pretrained` calls.
+MODEL_PATH_LEROBOT = "lerobot/xvla-base"
+MODEL_PATH_ORIGINAL = "2toINF/X-VLA-Pt"
+# NOTE(review): the standalone comparison script uses domain id 3 with its
+# libero checkpoint - confirm which id is appropriate for these checkpoints.
+LIBERO_DOMAIN_ID = 0  # Domain ID for examples purposes
+
+
+def cleanup_memory():
+    """Release cached accelerator memory so consecutive tests do not run out of memory.
+
+    Runs the Python garbage collector first, then empties the CUDA and MPS
+    caches for whichever of the two backends reports itself as available.
+    """
+    print("\nCleaning up memory...")
+    gc.collect()
+
+    cuda_ready = torch.cuda.is_available()
+    mps_ready = torch.backends.mps.is_available()
+
+    if cuda_ready:
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+    if mps_ready:
+        torch.mps.empty_cache()
+
+    print("Memory cleanup complete.")
+
+
+def set_seed_all(seed: int):
+    """Seed the Python, NumPy and PyTorch RNGs and request deterministic kernels.
+
+    Args:
+        seed: Value handed to every seeding function.
+    """
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(seed)
+
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+    # Ask cuDNN and PyTorch for deterministic algorithms; with warn_only=True an
+    # op without a deterministic implementation warns instead of raising.
+    cudnn = torch.backends.cudnn
+    cudnn.deterministic = True
+    cudnn.benchmark = False
+    torch.use_deterministic_algorithms(True, warn_only=True)
+
+
+def instantiate_lerobot_xvla(
+    from_pretrained: bool = False,
+    model_path: str = MODEL_PATH_LEROBOT,
+) -> tuple[
+    Any,  # Policy
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """Build the LeRobot XVLA policy together with its pre- and postprocessor.
+
+    Args:
+        from_pretrained: Must be True; building a policy from scratch is not supported.
+        model_path: Checkpoint passed to `PreTrainedConfig.from_pretrained` and reused
+            as `cfg.pretrained_path`.
+
+    Returns:
+        `(policy, preprocessor, postprocessor)` with the policy moved to `DEVICE`
+        and switched to eval mode.
+
+    Raises:
+        NotImplementedError: If `from_pretrained` is False.
+    """
+    if not from_pretrained:
+        # For non-pretrained, we'd need to create a config from scratch
+        raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
+
+    cfg = PreTrainedConfig.from_pretrained(model_path)
+    cfg.pretrained_path = model_path
+    cfg.device = DEVICE
+
+    policy = make_policy(
+        cfg=cfg,
+        env_cfg=make_env_config("libero", task="libero_spatial"),
+    )
+    policy.to(DEVICE)
+    policy.eval()
+
+    # Make the preprocessor's device step target the same device as the policy.
+    preprocessor, postprocessor = make_pre_post_processors(
+        policy_cfg=cfg,
+        pretrained_path=cfg.pretrained_path,
+        preprocessor_overrides={"device_processor": {"device": str(cfg.device)}},
+    )
+
+    return policy, preprocessor, postprocessor
+
+
+def instantiate_original_xvla(
+    from_pretrained: bool = False,
+    model_path: str = MODEL_PATH_ORIGINAL,
+):
+    """Load the original XVLA model and its processor through transformers.
+
+    Args:
+        from_pretrained: Must be True; building a model from scratch is not supported.
+        model_path: Checkpoint passed to `AutoProcessor` and `AutoModel`.
+
+    Returns:
+        `(model, processor)` with the model moved to `DEVICE` and in eval mode.
+
+    Raises:
+        NotImplementedError: If `from_pretrained` is False.
+    """
+    if not from_pretrained:
+        raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
+
+    # NOTE: trust_remote_code=True executes Python code shipped with the checkpoint.
+    processor = AutoProcessor.from_pretrained(model_path, num_views=NUM_VIEWS, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+
+    model.to(DEVICE)
+    model.eval()
+
+    return model, processor
+
+
+def create_dummy_data(device=DEVICE):
+    """Create a dummy two-camera observation batch shared by both implementations.
+
+    Args:
+        device: Device on which the image and state tensors are placed.
+
+    Returns:
+        A dict holding two float32 image tensors of shape
+        `(batch, 3, IMAGE_HEIGHT, IMAGE_WIDTH)` with values in [0, 1], a float32
+        state tensor of shape `(batch, DUMMY_STATE_DIM)` and a list with one task
+        prompt per sample.
+    """
+    batch_size = 2
+    prompt = "Pick up the red block and place it in the bin"
+
+    def fake_rgb(H, W):
+        """Return one random CHW image scaled from uint8 [0, 255] to float32 [0, 1]."""
+        # randint's upper bound is exclusive, so 256 is needed to reach 255.
+        arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
+        t = torch.from_numpy(arr).permute(2, 0, 1)  # HWC -> CHW
+        return t.float() / 255.0
+
+    def fake_image_batch():
+        """Stack `batch_size` independent random images and move them to `device`."""
+        return torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device)
+
+    batch = {
+        f"{OBS_IMAGES}.image": fake_image_batch(),
+        f"{OBS_IMAGES}.image2": fake_image_batch(),
+        OBS_STATE: torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device),
+        "task": [prompt for _ in range(batch_size)],
+    }
+
+    return batch
+
+
+def prepare_original_inputs(batch, processor, device=DEVICE):
+    """Build the keyword inputs for the original XVLA model from a LeRobot-style batch.
+
+    Only the first sample of the batch is used.
+
+    Args:
+        batch: Dict with the two `OBS_IMAGES` image tensors in [0, 1], `OBS_STATE`
+            and `"task"` (a string or a list of strings).
+        processor: Original XVLA processor, called as `processor(images, text)`.
+        device: Device every returned tensor is moved to.
+
+    Returns:
+        The processor outputs (floating-point tensors cast to float32) plus
+        `"proprio"` and `"domain_id"` entries, all on `device`.
+    """
+    # Convert images from [0, 1] back to [0, 255] uint8 for the processor.
+    # `.byte()` truncates, and the float round trip x / 255 * 255 can land just
+    # below the original integer, so round (and clamp) before casting.
+    image1 = (batch[f"{OBS_IMAGES}.image"] * 255).round().clamp(0, 255).byte()
+    image2 = (batch[f"{OBS_IMAGES}.image2"] * 255).round().clamp(0, 255).byte()
+
+    # Get task instruction (use first one if batch)
+    task_instruction = batch["task"][0] if isinstance(batch["task"], list) else batch["task"]
+
+    # Process images and text through original processor
+    # The processor expects a list of images per sample
+    processed_inputs = processor(
+        [image1[0], image2[0]],  # Process first sample only for now
+        task_instruction,
+    )
+
+    # Move to the target device; only floating-point tensors are cast to float32
+    # so integer tensors keep their dtype.
+    dtype = torch.float32
+    inputs = {
+        k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device)
+        for k, v in processed_inputs.items()
+    }
+
+    # Add proprio and domain_id
+    inputs.update(
+        {
+            "proprio": batch[OBS_STATE][:1].to(device),  # First sample only
+            "domain_id": torch.tensor([LIBERO_DOMAIN_ID], dtype=torch.long, device=device),
+        }
+    )
+
+    return inputs
+
+
+def test_xvla_preprocessor_alignment():
+    """Compare the model inputs built by the LeRobot and original XVLA preprocessors.
+
+    This is a diagnostic: per-key differences are printed, not asserted, so the
+    test only fails when one of the two pipelines raises.
+    """
+    print("\n" + "=" * 80)
+    print("Test: XVLA Preprocessor Alignment")
+    print("=" * 80)
+
+    set_seed_all(42)
+
+    lerobot_policy = original_model = None
+    try:
+        print("\n[LeRobot] Instantiating policy and preprocessor...")
+        lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla(
+            from_pretrained=True
+        )
+
+        print("\n[Original] Instantiating model and processor...")
+        original_model, original_processor = instantiate_original_xvla(from_pretrained=True)
+
+        print("\nCreating dummy data...")
+        batch = create_dummy_data()
+
+        print("\n[LeRobot] Preprocessing...")
+        # deepcopy so the preprocessor cannot modify the batch reused below.
+        lerobot_observation = lerobot_preprocessor(deepcopy(batch))
+        lerobot_inputs = lerobot_policy._build_model_inputs(lerobot_observation)
+
+        print("\n[Original] Preprocessing...")
+        original_inputs = prepare_original_inputs(batch, original_processor)
+
+        print("\nComparing preprocessor outputs:")
+        print("-" * 80)
+
+        # Compare common keys
+        common_keys = set(lerobot_inputs.keys()) & set(original_inputs.keys())
+        print(f"Common keys: {common_keys}")
+
+        for key in common_keys:
+            lerobot_tensor = lerobot_inputs[key]
+            original_tensor = original_inputs[key]
+
+            print(f"\n🔎 Key: {key}")
+            print(f"  LeRobot shape: {lerobot_tensor.shape}")
+            print(f"  Original shape: {original_tensor.shape}")
+
+            # Handle batch size difference (we only process first sample for original)
+            if lerobot_tensor.shape[0] > original_tensor.shape[0]:
+                lerobot_tensor = lerobot_tensor[:1]
+
+            if lerobot_tensor.shape != original_tensor.shape:
+                print("  ⚠️ Shapes don't match after alignment")
+                continue
+
+            if torch.allclose(lerobot_tensor, original_tensor, atol=1e-5, rtol=1e-5):
+                print("  ✔️ Tensors are equal (allclose with atol=1e-5)")
+            else:
+                diff = torch.abs(lerobot_tensor - original_tensor)
+                print("  ⚠️ Tensors differ")
+                print(f"  Max diff: {diff.max().item():.6e}")
+                print(f"  Mean diff: {diff.mean().item():.6e}")
+                print(f"  Std diff: {diff.std().item():.6e}")
+    finally:
+        # Drop this function's references to the models before emptying the
+        # caches, and do so even when a step above raised.
+        del lerobot_policy, original_model
+        cleanup_memory()
+
+
+def test_xvla_original_vs_lerobot_pretrained():
+    """Check that LeRobot XVLA reproduces the original model's actions with pretrained weights.
+
+    Both models run `generate_actions(..., steps=10)` after the torch seed is
+    reset to 42. The first sample's actions must have the same shape and agree
+    within an absolute tolerance of 1e-3.
+    """
+    print("\n" + "=" * 80)
+    print("Test: XVLA Original vs LeRobot with Pretrained Weights (Inference)")
+    print("=" * 80)
+
+    set_seed_all(42)
+
+    lerobot_policy = original_model = None
+    try:
+        print("\n[LeRobot] Instantiating policy...")
+        lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla(
+            from_pretrained=True
+        )
+
+        print("\n[Original] Instantiating model...")
+        original_model, original_processor = instantiate_original_xvla(from_pretrained=True)
+
+        print("\nCreating dummy data...")
+        batch = create_dummy_data()
+
+        print("\n[LeRobot] Running inference...")
+        lerobot_observation = lerobot_preprocessor(deepcopy(batch))
+        lerobot_inputs = lerobot_policy._build_model_inputs(lerobot_observation)
+
+        # Reset seed for inference
+        torch.manual_seed(42)
+        with torch.no_grad():
+            lerobot_actions = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10)
+        # The original model only sees the first sample, so keep just that one
+        # here too (assumes the leading dimension is the batch - TODO confirm).
+        lerobot_actions = lerobot_actions[:1].squeeze(0).float().cpu()
+
+        print(f"LeRobot actions shape: {lerobot_actions.shape}")
+        print(f"LeRobot actions mean: {lerobot_actions.mean().item():.6f}")
+        print(f"LeRobot actions std: {lerobot_actions.std().item():.6f}")
+
+        print("\n[Original] Running inference...")
+        original_inputs = prepare_original_inputs(batch, original_processor)
+
+        # Reset seed for inference
+        torch.manual_seed(42)
+        with torch.no_grad():
+            original_actions = original_model.generate_actions(**original_inputs, steps=10)
+        original_actions = original_actions.squeeze(0).float().cpu()
+
+        print(f"Original actions shape: {original_actions.shape}")
+        print(f"Original actions mean: {original_actions.mean().item():.6f}")
+        print(f"Original actions std: {original_actions.std().item():.6f}")
+
+        print("\nAction Comparison:")
+        print("-" * 80)
+
+        # A shape mismatch means nothing was compared, so it must fail the test
+        # instead of being reported as a warning.
+        assert lerobot_actions.shape == original_actions.shape, (
+            f"Shape mismatch: LeRobot {lerobot_actions.shape} vs Original {original_actions.shape}"
+        )
+
+        diff = torch.abs(lerobot_actions - original_actions)
+        max_diff = diff.max().item()
+        mean_diff = diff.mean().item()
+
+        print(f"Max absolute difference: {max_diff:.6e}")
+        print(f"Mean absolute difference: {mean_diff:.6e}")
+        # The 1e-8 term guards against division by zero.
+        print(f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%")
+
+        # Report closeness at several tolerances for easier debugging.
+        for tol in (1e-5, 1e-4, 1e-3, 1e-2):
+            is_close = torch.allclose(lerobot_actions, original_actions, atol=tol)
+            status = "✔️" if is_close else "❌"
+            print(f"{status} Actions close (atol={tol}): {is_close}")
+
+        # Assert with reasonable tolerance
+        tolerance = 1e-3
+        assert torch.allclose(lerobot_actions, original_actions, atol=tolerance), (
+            f"Actions differ by more than tolerance ({tolerance}): max diff = {max_diff:.6e}"
+        )
+        print(f"\n✅ Success: Actions match within tolerance ({tolerance})!")
+    finally:
+        # Drop this function's references to the models before emptying the
+        # caches, and do so even when an assertion above failed.
+        del lerobot_policy, original_model
+        cleanup_memory()
+
+
+def test_xvla_inference_reproducibility():
+    """Check that two LeRobot XVLA inference runs with the same seed give the same actions.
+
+    Each run re-seeds every RNG with 42, preprocesses a fresh copy of the same
+    batch and calls `generate_actions(..., steps=10)`. The runs must agree within
+    an absolute tolerance of 1e-6.
+    """
+    print("\n" + "=" * 80)
+    print("Test: XVLA Inference Reproducibility")
+    print("=" * 80)
+
+    lerobot_policy = None
+    try:
+        print("\n[LeRobot] Instantiating policy...")
+        lerobot_policy, lerobot_preprocessor, lerobot_postprocessor = instantiate_lerobot_xvla(
+            from_pretrained=True
+        )
+
+        print("\nCreating dummy data...")
+        batch = create_dummy_data()
+
+        run_labels = (
+            "\n[Run 1] Running inference...",
+            "\n[Run 2] Running inference with same seed...",
+        )
+        outputs = []
+        for label in run_labels:
+            print(label)
+            set_seed_all(42)
+            lerobot_observation = lerobot_preprocessor(deepcopy(batch))
+            lerobot_inputs = lerobot_policy._build_model_inputs(lerobot_observation)
+            with torch.no_grad():
+                actions = lerobot_policy.model.generate_actions(**lerobot_inputs, steps=10)
+            outputs.append(actions.squeeze(0).float().cpu())
+        actions_1, actions_2 = outputs
+
+        print("\nComparing two runs:")
+        print("-" * 80)
+
+        # torch.equal is exact; allclose would also accept small relative
+        # differences through its default rtol.
+        if torch.equal(actions_1, actions_2):
+            print("✔️ Inference is perfectly reproducible!")
+        else:
+            diff = torch.abs(actions_1 - actions_2)
+            print("⚠️ Small differences detected:")
+            print(f"  Max diff: {diff.max().item():.6e}")
+            print(f"  Mean diff: {diff.mean().item():.6e}")
+
+        assert torch.allclose(actions_1, actions_2, atol=1e-6), "Inference should be reproducible!"
+    finally:
+        # Drop this function's reference to the policy before emptying the
+        # caches, and do so even when the assertion above failed.
+        del lerobot_policy
+        cleanup_memory()
+
+
+if __name__ == "__main__":
+    # Allow running the three comparisons as a plain script, without pytest.
+    banner = "=" * 80
+    print("\n" + banner)
+    print("XVLA Original vs LeRobot Comparison Test Suite")
+    print(banner)
+
+    try:
+        for run_test in (
+            test_xvla_preprocessor_alignment,
+            test_xvla_original_vs_lerobot_pretrained,
+            test_xvla_inference_reproducibility,
+        ):
+            run_test()
+
+        print("\n" + banner)
+        print("✅ All tests passed!")
+        print(banner)
+    except Exception as e:
+        # Report the failure, then re-raise so the traceback and the non-zero
+        # exit status are preserved.
+        print("\n" + banner)
+        print(f"❌ Test failed with error: {e}")
+        print(banner)
+        raise
+
@@ -0,0 +1,190 @@
+import random
+
+import numpy as np
+import torch
+from xvla.models.modeling_xvla import XVLA
+
+# from lerobot.policies.xvla.configuration_xvla import XVLAConfig
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.envs.factory import make_env_config
+from lerobot.policies.factory import make_policy, make_pre_post_processors
+from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
+
+# Seed the torch, Python and NumPy RNGs so the random inputs below are repeatable.
+torch.manual_seed(42)
+random.seed(42)
+np.random.seed(42)
+observation_height: int = 224
+observation_width: int = 224  # TODO(jadechoghari): image size is different for the two models
+# Observation dict fed to both preprocessors. The two image entries are
+# placeholders: they are overwritten with fake_rgb() images further down.
+OBS = {
+    f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width),
+    f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width),
+    OBS_STATE: torch.randn(1, 20),  # valid as a key only if OBS_STATE is a plain string
+    "task": "put the object in the box",
+}
+
+# ImageNet per-channel mean/std shaped to broadcast over (N, 3, H, W) tensors.
+# NOTE(review): neither constant is referenced in the visible part of this script.
+IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
+IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
+
+
+def fake_rgb(H, W):
+    """Return a random RGB image as a float32 tensor of shape (1, 3, H, W).
+
+    Values are whole numbers in [0, 255]; no scaling or ImageNet normalization
+    is applied here.
+    """
+    # randint's upper bound is exclusive, so 256 is needed to reach 255.
+    arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
+    t = torch.from_numpy(arr).permute(2, 0, 1)  # HWC -> CHW
+    return t.unsqueeze(0).float()
+
+
+# Replace the placeholder images with random images produced by fake_rgb().
+OBS[f"{OBS_IMAGES}.image"] = fake_rgb(observation_height, observation_width)
+OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width)
+
+# Load the LeRobot policy from a local checkpoint directory.
+# NOTE(review): the path is machine-specific; the script cannot run elsewhere as is.
+cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
+cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
+env_cfg = make_env_config("libero", task="libero_spatial")
+policy = make_policy(
+    cfg=cfg,
+    env_cfg=env_cfg,
+)
+
+policy.eval()
+
+# Point the preprocessor's device step at the device named in the config.
+preprocessor_overrides = {
+    "device_processor": {"device": str(cfg.device)},
+}
+
+preprocessor, postprocessor = make_pre_post_processors(
+    policy_cfg=cfg,
+    pretrained_path=cfg.pretrained_path,
+    preprocessor_overrides=preprocessor_overrides,
+)
+
+# Run the LeRobot preprocessing and build the keyword inputs for the model.
+observation = preprocessor(OBS)
+inputs = policy._build_model_inputs(observation)
+
+
+#### now the og model ###########################################################
+from xvla.models.processing_xvla import XVLAProcessor
+
+processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2)
+inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"])
+domain_id = torch.tensor([3], dtype=torch.long)
+# The proprio and domain id go into the LeRobot-built `inputs`, which is what
+# both models receive below; `inputs_1` is only used for the key-by-key check.
+inputs.update(
+    {
+        "proprio": OBS[OBS_STATE].to("cuda"),
+        "domain_id": domain_id.to("cuda"),
+    }
+)
+
+# check the preprocessor
+for k in inputs.keys() & inputs_1.keys():  # intersection of keys
+    a = inputs[k]
+    b = inputs_1[k].to("cuda")
+
+    print(f"\n🔎 Key: {k}")
+
+    # Check shape
+    print("  shape:", a.shape, b.shape)
+
+    # Check if close
+    if torch.allclose(a, b, atol=1e-5, rtol=1e-5):
+        print("  ✔️ tensors are equal (allclose)")
+    else:
+        diff = torch.abs(a - b)
+        print("  ❌ tensors differ")
+        print("  max diff:", diff.max().item())
+        print("  mean diff:", diff.mean().item())
+
+
+model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
+model.eval()
+model.to("cuda")
+
+# Re-seed before each call so both models start from the same RNG state (this
+# matters if generate_actions samples random noise), and skip autograd
+# bookkeeping since only the outputs are compared.
+with torch.no_grad():
+    torch.manual_seed(42)
+    action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
+    torch.manual_seed(42)
+    action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
+
+# np all close
+print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2))
+print("max diff:", np.max(np.abs(action - action_1)))
+print("mean diff:", np.mean(np.abs(action - action_1)))
+
+
+import random
+
+import numpy as np
+import torch
+from PIL import Image
+from xvla.models.configuration_xvla import XVLAConfig
+from xvla.models.modeling_xvla import XVLA
+from xvla.models.processor_xvla import XVLAProcessor
+
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.envs.factory import make_env_config
+from lerobot.policies.factory import make_policy
+
+# Load the original XVLA config, model and processor from a local checkpoint.
+# NOTE(review): XVLAProcessor is imported from `processor_xvla` in this part but
+# from `processing_xvla` earlier in the file - confirm the correct module name.
+cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero")
+model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
+model.eval()
+model.to("cuda")
+processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero")
+# /raid/jade/models/xvla-libero
+# Set the seed of the torch, Python and NumPy RNGs.
+torch.manual_seed(42)
+random.seed(42)
+np.random.seed(42)
+
+
+def make_random_pil_images(num_images=3, H=480, W=640):
+    """Return `num_images` PIL images of size W x H filled with random RGB pixels."""
+    return [
+        Image.fromarray(np.random.randint(0, 256, (H, W, 3), dtype=np.uint8))
+        for _ in range(num_images)
+    ]
+
+
+# Example inputs: three random images (the function defaults) and a dummy instruction.
+images = make_random_pil_images()
+language_instruction = "This is a random image"
+# Multimodal preprocessing by processor
+inputs = processor(images, language_instruction)
+# Fail early when the processor output lacks any key checked for here.
+if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
+    raise ValueError("Processor did not return the expected keys.")
+
+# Random proprio vector and a fixed domain id for a single sample.
+proprio = torch.randn(1, 20)
+domain_id = torch.tensor([0], dtype=torch.long)
+
+# Align to model's device/dtype (dtype is taken from the first model parameter)
+device = model.device
+dtype = next(model.parameters()).dtype
+
+
+def to_model(t: torch.Tensor) -> torch.Tensor:
+    """Move `t` to the model's device, casting floating-point data to the model's dtype.
+
+    Non-tensor input is converted with `torch.as_tensor` first; integral and
+    boolean tensors keep their dtype.
+    """
+    tensor = t if isinstance(t, torch.Tensor) else torch.as_tensor(t)
+    if tensor.is_floating_point():
+        return tensor.to(device=device, dtype=dtype)
+    return tensor.to(device=device)
+
+
+# Move every processor output to the model's device/dtype, then add the proprio
+# vector and the domain id.
+inputs = {k: to_model(v) for k, v in inputs.items()}
+inputs.update(
+    {
+        "proprio": to_model(proprio),
+        "domain_id": domain_id.to(device),
+    }
+)
+
+# Inference with the original model. Re-seed first so the LeRobot run below
+# starts from the same RNG state (this matters if generate_actions samples
+# random noise); no_grad skips autograd bookkeeping.
+torch.manual_seed(42)
+with torch.no_grad():
+    action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
+
+
+#### now for lerobot model #####################################################
+
+cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
+env_cfg = make_env_config("libero", task="libero_spatial")
+cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
+policy = make_policy(cfg=cfg, env_cfg=env_cfg)
+policy.eval()
+policy.to("cuda")
+
+# The LeRobot model receives exactly the inputs built by the original processor.
+torch.manual_seed(42)
+with torch.no_grad():
+    action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
+
+# Report how closely the two implementations agree.
+print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2))
+print("max diff:", np.max(np.abs(action - action_1)))
+print("mean diff:", np.mean(np.abs(action - action_1)))