more fixes

2026-05-15 08:39:49 +00:00 · 2025-11-17 14:03:15 +01:00
parent fb6f59e074
commit 5277a9909d
16 changed files with 215 additions and 176 deletions
@@ -1,15 +1,13 @@
-from transformers import AutoModel, AutoProcessor
-import json_numpy
 import numpy as np
 import torch
 from PIL import Image
+from transformers import AutoModel, AutoProcessor

-model = AutoModel.from_pretrained(
-    "2toINF/X-VLA-WidowX",
-    trust_remote_code=True
-)
+model = AutoModel.from_pretrained("2toINF/X-VLA-WidowX", trust_remote_code=True)

 processor = AutoProcessor.from_pretrained("2toINF/X-VLA-WidowX", trust_remote_code=True)
+
+
 # append 3 random images to a list
 def make_random_pil_images(num_images=3, H=480, W=640):
    images = []
@@ -20,6 +18,7 @@ def make_random_pil_images(num_images=3, H=480, W=640):
        images.append(img)
    return images

+
 # Example:
 images = make_random_pil_images()
 language_instruction = "This is a random image"
@@ -29,23 +28,27 @@ if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
    raise ValueError("Processor did not return the expected keys.")

 proprio = torch.randn(1, 20)
-domain_id = torch.tensor([int(0)], dtype=torch.long)
+domain_id = torch.tensor([0], dtype=torch.long)

 # Align to model's device/dtype
 device = model.device
 dtype = next(model.parameters()).dtype

+
 def to_model(t: torch.Tensor) -> torch.Tensor:
    """Place ``t`` on the model's device, matching the model dtype for floats.

    Values that are not already tensors are converted with ``torch.as_tensor``.
    Floating-point tensors are cast to the module-level ``dtype``; integral and
    boolean tensors keep their dtype and are only moved to ``device``.
    """
    tensor = t if isinstance(t, torch.Tensor) else torch.as_tensor(t)
    if tensor.is_floating_point():
        return tensor.to(device=device, dtype=dtype)
    # non-float tensors (ids, masks) must not be cast to the model dtype
    return tensor.to(device=device)

+
 inputs = {k: to_model(v) for k, v in inputs.items()}
-inputs.update({
-    "proprio": to_model(proprio),
-    "domain_id": domain_id.to(device),
-})
+inputs.update(
+    {
+        "proprio": to_model(proprio),
+        "domain_id": domain_id.to(device),
+    }
+)

 # Inference

@@ -1,11 +1,12 @@
-from lerobot.policies.factory import make_policy, make_pre_post_processors
+import numpy as np
+import torch
+
 # from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.envs.factory import make_env_config
-from lerobot.policies.xvla.utils import Rotate6D_to_AxisAngle
+from lerobot.policies.factory import make_policy, make_pre_post_processors
+from lerobot.policies.xvla.utils import rotate6d_to_axis_angle
 from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
-import torch
-import numpy as np

 observation_height: int = 360
 observation_width: int = 360
@@ -13,18 +14,22 @@ observation_width: int = 360
 OBS = {
    f"{OBS_IMAGES}.image1": torch.randn(1, 3, observation_height, observation_width),
    f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width),
-    OBS_STATE: torch.randn(1, 9),                  # ONLY if OBS_STATE is already a string
+    OBS_STATE: torch.randn(1, 9),  # ONLY if OBS_STATE is already a string
    "task": "put the object in the box",
 }
+
+
 def fake_rgb(H, W):
    """Return a random uint8 RGB image as a NumPy array of shape (H, W, 3).

    Args:
        H: image height in pixels.
        W: image width in pixels.

    Returns:
        A ``numpy.ndarray`` with dtype ``uint8`` and shape ``(H, W, 3)``.
    """
    # torch.randint excludes its upper bound, so 256 is needed for values to
    # cover the full 0-255 pixel range (255 was previously unreachable).
    return torch.randint(0, 256, (H, W, 3), dtype=torch.uint8).numpy()

+
 OBS[f"{OBS_IMAGES}.image1"] = fake_rgb(observation_height, observation_width)
 OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width)

 # observation = preprocessor(OBS)
 from transformers import AutoProcessor
+
 processor = AutoProcessor.from_pretrained("2toINF/X-VLA-WidowX", num_views=2, trust_remote_code=True)
 inputs = processor([OBS[f"{OBS_IMAGES}.image1"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"])
 breakpoint()
@@ -40,19 +45,19 @@ policy = make_policy(
 policy.eval()

 preprocessor_overrides = {
-        "device_processor": {"device": str(cfg.device)},
+    "device_processor": {"device": str(cfg.device)},
 }

 preprocessor, postprocessor = make_pre_post_processors(
-        policy_cfg=cfg,
-        pretrained_path=cfg.pretrained_path,
-        preprocessor_overrides=preprocessor_overrides,
+    policy_cfg=cfg,
+    pretrained_path=cfg.pretrained_path,
+    preprocessor_overrides=preprocessor_overrides,
 )

 observation = preprocessor(OBS)
 action = policy.select_action(observation)

 target_eef = action[:, :3].to("cpu").numpy()
-target_axis = Rotate6D_to_AxisAngle(action[:, 3:9].to("cpu").numpy())
+target_axis = rotate6d_to_axis_angle(action[:, 3:9].to("cpu").numpy())
 target_act = action[:, 9:10].to("cpu").numpy()
 final_action = np.concatenate([target_eef, target_axis, target_act], axis=-1)
@@ -1,6 +1,8 @@
+import os
+
 from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from lerobot.policies.factory import make_policy, make_policy_config
-import os
+
 cfg = make_policy_config("xvla")

 dataset_id = "lerobot/svla_so101_pickplace"
@@ -16,7 +18,9 @@ for name, param in policy.state_dict().items():
 import safetensors.torch
 from huggingface_hub import snapshot_download

-cache_dir = snapshot_download(repo_id="2toINF/X-VLA-Libero", repo_type="model", cache_dir="/fsx/jade_choghari/.cache/huggingface/model")
+cache_dir = snapshot_download(
+    repo_id="2toINF/X-VLA-Libero", repo_type="model", cache_dir="/fsx/jade_choghari/.cache/huggingface/model"
+)
 state_dict = safetensors.torch.load_file(os.path.join(cache_dir, "model.safetensors"))
 # policy.load_state_dict(state_dict)
 # 3. Add "model." prefix to every key
@@ -36,34 +40,38 @@ print()
 print("unexpected keys:", unexpected)


+import random
+
+import numpy as np
+import torch
+from xvla.models.modeling_xvla import XVLA

-from lerobot.policies.factory import make_policy, make_pre_post_processors
 # from lerobot.policies.xvla.configuration_xvla import XVLAConfig
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.envs.factory import make_env_config
+from lerobot.policies.factory import make_policy, make_pre_post_processors
 from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
-from xvla.models.modeling_xvla import XVLA
-import torch
-import numpy as np
-import random
+
 torch.manual_seed(42)
 random.seed(42)
 np.random.seed(42)
 observation_height: int = 224
-observation_width: int = 224 # todo: jadechoghari, image size is different for the two models
+observation_width: int = 224  # todo: jadechoghari, image size is different for the two models
 # create an observation dict
 OBS = {
    f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width),
    f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width),
-    OBS_STATE: torch.randn(1, 20),                  # ONLY if OBS_STATE is already a string
+    OBS_STATE: torch.randn(1, 20),  # ONLY if OBS_STATE is already a string
    "task": "put the object in the box",
 }

-IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1)
-IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1)
+IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
+IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
+
+
 def fake_rgb(H, W):
    arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8)
-    t = torch.from_numpy(arr).permute(2, 0, 1)      # CHW
+    t = torch.from_numpy(arr).permute(2, 0, 1)  # CHW
    t = t.unsqueeze(0).float()
    # NOTE(review): ImageNet normalization is not applied here; raw float pixel values are returned
    return t
@@ -83,13 +91,13 @@ policy = make_policy(
 policy.eval()

 preprocessor_overrides = {
-        "device_processor": {"device": str(cfg.device)},
+    "device_processor": {"device": str(cfg.device)},
 }

 preprocessor, postprocessor = make_pre_post_processors(
-        policy_cfg=cfg,
-        pretrained_path=cfg.pretrained_path,
-        preprocessor_overrides=preprocessor_overrides,
+    policy_cfg=cfg,
+    pretrained_path=cfg.pretrained_path,
+    preprocessor_overrides=preprocessor_overrides,
 )

 observation = preprocessor(OBS)
@@ -101,14 +109,16 @@ from xvla.models.processing_xvla import XVLAProcessor

 processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2)
 inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"])
-domain_id = torch.tensor([int(3)], dtype=torch.long)
-inputs.update({
-    "proprio": OBS[OBS_STATE].to("cuda"),
-    "domain_id": domain_id.to("cuda"),
-})
+domain_id = torch.tensor([3], dtype=torch.long)
+inputs.update(
+    {
+        "proprio": OBS[OBS_STATE].to("cuda"),
+        "domain_id": domain_id.to("cuda"),
+    }
+)


-for k in inputs.keys() & inputs_1.keys():   # intersection of keys
+for k in inputs.keys() & inputs_1.keys():  # intersection of keys
    a = inputs[k]
    b = inputs_1[k].to("cuda")

@@ -139,22 +149,25 @@ action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().num
 # (Pdb) [0, 0, :, :4, 0]
 action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()

-#np all close
+# np all close
 print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2))
 print("max diff:", np.max(np.abs(action - action_1)))
 print("mean diff:", np.mean(np.abs(action - action_1)))


-from xvla.models.processor_xvla import XVLAProcessor
-from xvla.models.modeling_xvla import XVLA
-from xvla.models.configuration_xvla import XVLAConfig
-import torch
 import random
+
 import numpy as np
+import torch
 from PIL import Image
-from lerobot.policies.factory import make_policy
+from xvla.models.configuration_xvla import XVLAConfig
+from xvla.models.modeling_xvla import XVLA
+from xvla.models.processor_xvla import XVLAProcessor
+
 from lerobot.configs.policies import PreTrainedConfig
 from lerobot.envs.factory import make_env_config
+from lerobot.policies.factory import make_policy
+
 cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero")
 model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
 model.eval()
@@ -166,6 +179,7 @@ torch.manual_seed(42)
 random.seed(42)
 np.random.seed(42)

+
 def make_random_pil_images(num_images=3, H=480, W=640):
    images = []
    for _ in range(num_images):
@@ -175,6 +189,7 @@ def make_random_pil_images(num_images=3, H=480, W=640):
        images.append(img)
    return images

+
 # Example:
 images = make_random_pil_images()
 language_instruction = "This is a random image"
@@ -184,23 +199,27 @@ if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
    raise ValueError("Processor did not return the expected keys.")

 proprio = torch.randn(1, 20)
-domain_id = torch.tensor([int(0)], dtype=torch.long)
+domain_id = torch.tensor([0], dtype=torch.long)

 # Align to model's device/dtype
 device = model.device
 dtype = next(model.parameters()).dtype

+
 def to_model(t: torch.Tensor) -> torch.Tensor:
    """Move ``t`` to the model's device, casting only floating-point data.

    Non-tensor inputs are first wrapped with ``torch.as_tensor``. Floating-point
    tensors additionally take the module-level ``dtype``; integral and boolean
    tensors are moved to ``device`` with their dtype unchanged.
    """
    if not isinstance(t, torch.Tensor):
        t = torch.as_tensor(t)
    # always move to the model device; add the dtype cast for floats only
    target = {"device": device}
    if t.is_floating_point():
        target["dtype"] = dtype
    return t.to(**target)

+
 inputs = {k: to_model(v) for k, v in inputs.items()}
-inputs.update({
-    "proprio": to_model(proprio),
-    "domain_id": domain_id.to(device),
-})
+inputs.update(
+    {
+        "proprio": to_model(proprio),
+        "domain_id": domain_id.to(device),
+    }
+)

 # Inference
 action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()