diff --git a/convert.sh b/convert.sh deleted file mode 100644 index c7d4998dc..000000000 --- a/convert.sh +++ /dev/null @@ -1,4 +0,0 @@ -python src/lerobot/processor/migrate_policy_normalization.py \ - --pretrained-path /raid/jade/models/xvla-libero-og \ - --output-dir /raid/jade/models/xvla-libero-og-migrated \ - --branch main \ No newline at end of file diff --git a/eval.sh b/eval.sh index 22d45db08..45acb100c 100644 --- a/eval.sh +++ b/eval.sh @@ -2,6 +2,7 @@ lerobot-eval \ --policy.path="/raid/jade/models/xvla-libero-og_migrated" \ --env.type=libero \ --env.task=libero_spatial \ + --env.action_type=abs \ --eval.batch_size=1 \ --eval.n_episodes=1 \ --seed=142 diff --git a/src/lerobot/policies/xvla/inference.py b/examples/tutorial/xvla/inference.py similarity index 100% rename from src/lerobot/policies/xvla/inference.py rename to examples/tutorial/xvla/inference.py diff --git a/src/lerobot/policies/xvla/inference_pipe.py b/examples/tutorial/xvla/inference_pipe.py similarity index 99% rename from src/lerobot/policies/xvla/inference_pipe.py rename to examples/tutorial/xvla/inference_pipe.py index da42965c1..c60a4e305 100644 --- a/src/lerobot/policies/xvla/inference_pipe.py +++ b/examples/tutorial/xvla/inference_pipe.py @@ -56,4 +56,3 @@ target_eef = action[:, :3].to("cpu").numpy() target_axis = Rotate6D_to_AxisAngle(action[:, 3:9].to("cpu").numpy()) target_act = action[:, 9:10].to("cpu").numpy() final_action = np.concatenate([target_eef, target_axis, target_act], axis=-1) -breakpoint() \ No newline at end of file diff --git a/examples/tutorial/xvla/test_xvla.py b/examples/tutorial/xvla/test_xvla.py new file mode 100644 index 000000000..cc6e03df7 --- /dev/null +++ b/examples/tutorial/xvla/test_xvla.py @@ -0,0 +1,218 @@ +from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata +from lerobot.policies.factory import make_policy, make_policy_config +import os +cfg = make_policy_config("xvla") + +dataset_id = "lerobot/svla_so101_pickplace" +# This only downloads the metadata for the dataset, ~10s of MB even for large-scale datasets +dataset_metadata = LeRobotDatasetMetadata(dataset_id) +policy = make_policy(cfg=cfg, ds_meta=dataset_metadata) + +for name, param in policy.state_dict().items(): + print(name, param.shape) + + +# now let's load in safetensors +import safetensors.torch +from huggingface_hub import snapshot_download + +cache_dir = snapshot_download(repo_id="2toINF/X-VLA-Libero", repo_type="model", cache_dir="/fsx/jade_choghari/.cache/huggingface/model") +state_dict = safetensors.torch.load_file(os.path.join(cache_dir, "model.safetensors")) +# policy.load_state_dict(state_dict) +# 3. Add "model." prefix to every key +new_state_dict = {f"model.{k}": v for k, v in state_dict.items()} +keys_to_skip = [ + "model.transformer.action_encoder.fc.weight", + "model.transformer.action_encoder.fc.bias", +] + +new_state_dict = {k: v for k, v in new_state_dict.items() if k not in keys_to_skip} +# 4. Load into your model +missing, unexpected = policy.load_state_dict(new_state_dict, strict=False) + +print("missing keys:", missing) + +print() +print("unexpected keys:", unexpected) + + + +from lerobot.policies.factory import make_policy, make_pre_post_processors +# from lerobot.policies.xvla.configuration_xvla import XVLAConfig +from lerobot.configs.policies import PreTrainedConfig +from lerobot.envs.factory import make_env_config +from lerobot.utils.constants import OBS_IMAGES, OBS_STATE +from xvla.models.modeling_xvla import XVLA +import torch +import numpy as np +import random +torch.manual_seed(42) +random.seed(42) +np.random.seed(42) +observation_height: int = 224 +observation_width: int = 224 # todo: jadechoghari, image size is different for the two models +# create an observation dict +OBS = { + f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width), + f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width), + OBS_STATE: torch.randn(1, 20), # ONLY if OBS_STATE is already a string + "task": "put the object in the box", +} + +IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1) +IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1) +def fake_rgb(H, W): + arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8) + t = torch.from_numpy(arr).permute(2, 0, 1) # CHW + t = t.unsqueeze(0).float() + # normalize pixel to imagenet + return t + + +OBS[f"{OBS_IMAGES}.image"] = fake_rgb(observation_height, observation_width) +OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width) + +cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated") +cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated" +env_cfg = make_env_config("libero", task="libero_spatial") +policy = make_policy( + cfg=cfg, + env_cfg=env_cfg, +) + +policy.eval() + +preprocessor_overrides = { + "device_processor": {"device": str(cfg.device)}, +} + +preprocessor, postprocessor = make_pre_post_processors( + policy_cfg=cfg, + pretrained_path=cfg.pretrained_path, + preprocessor_overrides=preprocessor_overrides, +) + +observation = preprocessor(OBS) +inputs = policy._build_model_inputs(observation) + + +#### now the og model ########################################################### +from xvla.models.processing_xvla import XVLAProcessor + +processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2) +inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"]) +domain_id = torch.tensor([int(3)], dtype=torch.long) +inputs.update({ + "proprio": OBS[OBS_STATE].to("cuda"), + "domain_id": domain_id.to("cuda"), +}) + + +for k in inputs.keys() & inputs_1.keys(): # intersection of keys + a = inputs[k] + b = inputs_1[k].to("cuda") + + print(f"\nšŸ”Ž Key: {k}") + + # Check shape + print(" shape:", a.shape, b.shape) + + # Check if close + if torch.allclose(a, b, atol=1e-5, rtol=1e-5): + print(" āœ”ļø tensors are equal (allclose)") + else: + diff = torch.abs(a - b) + print(" āŒ tensors differ") + print(" max diff:", diff.max().item()) + print(" mean diff:", diff.mean().item()) + + +model = XVLA.from_pretrained("/raid/jade/models/xvla-libero") +model.eval() +model.to("cuda") + +action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() +# (Pdb) inputs['input_ids'].shape +# torch.Size([1, 64]) +# (Pdb) inputs_1['input_ids'].shape +# torch.Size([1, 50]) +# (Pdb) [0, 0, :, :4, 0] +action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() + +#np all close +print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2)) +print("max diff:", np.max(np.abs(action - action_1))) +print("mean diff:", np.mean(np.abs(action - action_1))) + + +from xvla.models.processor_xvla import XVLAProcessor +from xvla.models.modeling_xvla import XVLA +from xvla.models.configuration_xvla import XVLAConfig +import torch +import random +import numpy as np +from PIL import Image +from lerobot.policies.factory import make_policy +from lerobot.configs.policies import PreTrainedConfig +from lerobot.envs.factory import make_env_config +cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero") +model = XVLA.from_pretrained("/raid/jade/models/xvla-libero") +model.eval() +model.to("cuda") +processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero") +# /raid/jade/models/xvla-libero +# seet seed +torch.manual_seed(42) +random.seed(42) +np.random.seed(42) + +def make_random_pil_images(num_images=3, H=480, W=640): + images = [] + for _ in range(num_images): + # Random RGB image + arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8) + img = Image.fromarray(arr) + images.append(img) + return images + +# Example: +images = make_random_pil_images() +language_instruction = "This is a random image" +# Multimodal preprocessing by processor +inputs = processor(images, language_instruction) +if not {"input_ids", "image_input", "image_mask"}.issubset(inputs): + raise ValueError("Processor did not return the expected keys.") + +proprio = torch.randn(1, 20) +domain_id = torch.tensor([int(0)], dtype=torch.long) + +# Align to model's device/dtype +device = model.device +dtype = next(model.parameters()).dtype + +def to_model(t: torch.Tensor) -> torch.Tensor: + if not isinstance(t, torch.Tensor): + t = torch.as_tensor(t) + # cast floats to model dtype, keep integral/bool as-is + return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device) + +inputs = {k: to_model(v) for k, v in inputs.items()} +inputs.update({ + "proprio": to_model(proprio), + "domain_id": domain_id.to(device), +}) + +# Inference +action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() + + +#### now for lerobot model ##################################################### + +cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated") +env_cfg = make_env_config("libero", task="libero_spatial") +cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated" +policy = make_policy(cfg=cfg, env_cfg=env_cfg) +policy.eval() +policy.to("cuda") + +action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() diff --git a/image0.png b/image0.png deleted file mode 100644 index 4bd158cf0..000000000 Binary files a/image0.png and /dev/null differ diff --git a/image1.png b/image1.png deleted file mode 100644 index d05f4d6a9..000000000 Binary files a/image1.png and /dev/null differ diff --git a/output.txt b/output.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/prepend.py b/prepend.py deleted file mode 100644 index 8e587c431..000000000 --- a/prepend.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python3 -import safetensors.torch as st -import torch -import argparse -import os - -def prefix_state_dict(input_path, output_path, prefix="model."): - # Load original checkpoint - state_dict = st.load_file(input_path) - - print(f"Loaded {len(state_dict)} tensors from {input_path}") - - # Add prefix to every key - new_state_dict = {f"{prefix}{k}": v for k, v in state_dict.items()} - - print(f"Writing prefixed checkpoint with {len(new_state_dict)} keys...") - st.save_file(new_state_dict, output_path) - - print(f"Saved to {output_path}") - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input", type=str, required=True, help="Path to model.safetensors") - parser.add_argument("--output", type=str, required=True, help="Output prefixed model.safetensors") - parser.add_argument("--prefix", type=str, default="model.", help="Prefix to add to each key") - args = parser.parse_args() - - prefix_state_dict(args.input, args.output, args.prefix) diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index dc526114d..639c98f1f 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -251,6 +251,7 @@ class LiberoEnv(EnvConfig): "pixels/robot0_eye_in_hand_image": f"{OBS_IMAGES}.image2", } ) + action_type: str = "rel" def __post_init__(self): if self.obs_type == "pixels": diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py index caa6f91ed..57fca8af6 100644 --- a/src/lerobot/envs/factory.py +++ b/src/lerobot/envs/factory.py @@ -97,6 +97,7 @@ def make_env( init_states=cfg.init_states, gym_kwargs=cfg.gym_kwargs, env_cls=env_cls, + action_type=cfg.action_type, ) elif "metaworld" in cfg.type: from lerobot.envs.metaworld import create_metaworld_envs diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index e2623c4f5..f0ee22503 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -115,6 +115,7 @@ class LiberoEnv(gym.Env): episode_index: int = 0, camera_name_mapping: dict[str, str] | None = None, num_steps_wait: int = 10, + action_type: str = "rel", ): super().__init__() self.task_id = task_id @@ -185,6 +186,7 @@ class LiberoEnv(gym.Env): self.action_space = spaces.Box( low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32 ) + self.action_type = action_type def render(self): raw_obs = self._env.env._get_observations() @@ -213,18 +215,25 @@ class LiberoEnv(gym.Env): if camera_name == "agentview_image": image = image[::-1, ::-1] # rotate 180 degrees images[self.camera_name_mapping[camera_name]] = image - state = np.concatenate( - ( - raw_obs["robot0_eef_pos"], - quat2axisangle(raw_obs["robot0_eef_quat"]), - raw_obs["robot0_gripper_qpos"], + + if self.action_type == "rel": + state = np.concatenate( + ( + raw_obs["robot0_eef_pos"], + quat2axisangle(raw_obs["robot0_eef_quat"]), + raw_obs["robot0_gripper_qpos"], + ) ) - ) - # add new obs for XVLA: jadechoghari - robo_ori = Mat_to_Rotate6D(self._env.robots[0].controller.ee_ori_mat) - robo_pos = self._env.robots[0].controller.ee_pos - proprio = np.concatenate([robo_pos, robo_ori, np.array([0.0])], axis=-1) - state = np.concatenate([proprio, np.zeros_like(proprio)], axis=-1) + # TODO: jadechoghari, this is an ugly quick workaround for XVLA states. + # we will open a new PR to handle this in a preprocessor. + elif self.action_type == "abs": + robo_ori = Mat_to_Rotate6D(self._env.robots[0].controller.ee_ori_mat) + robo_pos = self._env.robots[0].controller.ee_pos + proprio = np.concatenate([robo_pos, robo_ori, np.array([0.0])], axis=-1) + state = np.concatenate([proprio, np.zeros_like(proprio)], axis=-1) + else: + raise NotImplementedError(f"The action type '{self.action_type}' is not supported in LiberoEnv. " + "Please switch to an action type (e.g. 'rel', 'abs').") agent_pos = state if self.obs_type == "pixels": return {"pixels": images.copy()} @@ -250,8 +259,9 @@ class LiberoEnv(gym.Env): # Step the simulator with a no-op action for a few frames so everything settles. # Increasing this value can improve determinism and reproducibility across resets. for _ in range(self.num_steps_wait): - action = np.array([0., 0., 0., 0., 0., 0., -1.0]) + action = np.array(get_libero_dummy_action()) raw_obs, _, _, _ = self._env.step(action) + observation = self._format_raw_obs(raw_obs) for robot in self._env.robots: robot.controller.use_delta = False @@ -264,7 +274,6 @@ class LiberoEnv(gym.Env): f"Expected action to be 1-D (shape (action_dim,)), " f"but got shape {action.shape} with ndim={action.ndim}" ) - action[-1] = 1 if action[-1] > 0.5 else -1 raw_obs, reward, done, info = self._env.step(action) is_success = self._env.check_success() @@ -302,6 +311,7 @@ def _make_env_fns( camera_names: list[str], init_states: bool, gym_kwargs: Mapping[str, Any], + action_type: str, ) -> list[Callable[[], LiberoEnv]]: """Build n_envs factory callables for a single (suite, task_id).""" @@ -314,6 +324,7 @@ def _make_env_fns( camera_name=camera_names, init_states=init_states, episode_index=episode_index, + action_type=action_type, **local_kwargs, ) @@ -333,6 +344,7 @@ def create_libero_envs( camera_name: str | Sequence[str] = "agentview_image,robot0_eye_in_hand_image", init_states: bool = True, env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, + action_type: str = "rel", ) -> dict[str, dict[int, Any]]: """ Create vectorized LIBERO environments with a consistent return shape. @@ -382,6 +394,7 @@ def create_libero_envs( camera_names=camera_names, init_states=init_states, gym_kwargs=gym_kwargs, + action_type=action_type, ) out[suite_name][tid] = env_cls(fns) print(f"Built vec env | suite={suite_name} | task_id={tid} | n_envs={n_envs}") diff --git a/src/lerobot/policies/xvla/README_PROCESSORS.md b/src/lerobot/policies/xvla/README_PROCESSORS.md deleted file mode 100644 index 1660f22a3..000000000 --- a/src/lerobot/policies/xvla/README_PROCESSORS.md +++ /dev/null @@ -1,132 +0,0 @@ -# XVLA Custom Processor Steps - -Three custom processor steps have been implemented for XVLA that encapsulate the preprocessing and postprocessing logic from `lerobot_eval.py`. - -## Processor Steps - -### 1. XVLAImageScaleProcessorStep -**Registry Name:** `xvla_image_scale` - -Scales image observations by 255 (from [0,1] to [0,255] range). - -```python -XVLAImageScaleProcessorStep( - image_keys=None # Auto-detects "observation.images.*" or specify list -) -``` - -### 2. XVLAAddDomainIdProcessorStep -**Registry Name:** `xvla_add_domain_id` - -Adds `domain_id` tensor to complementary data for multi-domain support. - -```python -XVLAAddDomainIdProcessorStep( - domain_id=3, # Domain identifier - device="cuda" # Tensor device -) -``` - -### 3. XVLARotation6DToAxisAngleProcessorStep -**Registry Name:** `xvla_rotation_6d_to_axis_angle` - -Converts 6D rotation to axis-angle representation: -- **Input:** [eef(3), rotation_6d(6), gripper(1)] = 10D -- **Output:** [eef(3), axis_angle(3), gripper(1)] = 7D - -```python -XVLARotation6DToAxisAngleProcessorStep( - expected_action_dim=10 -) -``` - -## Integration with Config - -These steps can be added to your XVLA policy configuration: - -### In Preprocessing Pipeline: -```python -from lerobot.policies.xvla.processor_xvla import ( - XVLAImageScaleProcessorStep, - XVLAAddDomainIdProcessorStep, -) - -preprocessor_steps = [ - RenameObservationsProcessorStep(rename_map={}), - AddBatchDimensionProcessorStep(), - XVLAImageScaleProcessorStep(), # Add this - TokenizerProcessorStep(...), - DeviceProcessorStep(device="cuda"), - XVLAAddDomainIdProcessorStep(domain_id=3), # Add this - NormalizerProcessorStep(...), -] -``` - -### In Postprocessing Pipeline: -```python -from lerobot.policies.xvla.processor_xvla import XVLARotation6DToAxisAngleProcessorStep - -postprocessor_steps = [ - UnnormalizerProcessorStep(...), - XVLARotation6DToAxisAngleProcessorStep(), # Add this - DeviceProcessorStep(device="cpu"), -] -``` - -## Usage in Evaluation - -Now your evaluation loop simplifies to: - -```python -# Before (from lerobot_eval.py lines 165-184) -observation[f"observation.images.image"] = observation[f"observation.images.image"] * 255 -observation[f"observation.images.image2"] = observation[f"observation.images.image2"] * 255 -observation = add_envs_task(env, observation) -observation = preprocessor(observation) -observation["domain_id"] = torch.tensor([int(3)], dtype=torch.long).to("cuda") - -with torch.inference_mode(): - action = policy.select_action(observation).to("cpu").numpy() -target_eef = action[:, :3] -target_axis = Rotate6D_to_AxisAngle(action[:, 3:9]) -target_act = action[:, 9:10] -action_numpy = np.concatenate([target_eef, target_axis, target_act], axis=-1) - -# After (clean and simple) -observation = add_envs_task(env, observation) # Add task -observation = preprocessor(observation) # Scales images + adds domain_id - -with torch.inference_mode(): - action = policy.select_action(observation) -action = postprocessor(action) # Converts rotation + moves to CPU -action_numpy = action.numpy() -``` - -## Configuration via Registry - -All steps are registered and can be loaded from JSON/YAML config: - -```json -{ - "preprocessor": { - "steps": [ - {"name": "xvla_image_scale"}, - {"name": "xvla_add_domain_id", "domain_id": 3, "device": "cuda"} - ] - }, - "postprocessor": { - "steps": [ - {"name": "xvla_rotation_6d_to_axis_angle", "expected_action_dim": 10} - ] - } -} -``` - -## Implementation Reference - -See `processor_groot.py` for similar patterns - these XVLA processors follow the same design: -- Registered with `@ProcessorStepRegistry.register()` -- Implement `__call__`, `transform_features`, and `get_config` -- Operate on `EnvTransition` objects -- Properly handle `transition.copy()` to avoid side effects - diff --git a/src/lerobot/policies/xvla/modeling_xvla.py b/src/lerobot/policies/xvla/modeling_xvla.py index 1e9437cd9..8eadffffa 100644 --- a/src/lerobot/policies/xvla/modeling_xvla.py +++ b/src/lerobot/policies/xvla/modeling_xvla.py @@ -402,38 +402,37 @@ class XVLAPolicy(PreTrainedPolicy): f"model.safetensors not found on the Hub at {model_id}" ) from e - # --- Step 3: Load safetensor weights --- print(f"Loading checkpoint from {model_file}") state_dict = safetensors.torch.load_file(model_file) - # --- Step 4: Modify keys --- - new_state_dict = {f"model.{k}": v for k, v in state_dict.items()} + # # --- Step 4: Modify keys --- + # new_state_dict = {f"model.{k}": v for k, v in state_dict.items()} - # Layers to skip (reinitialize) - keys_to_skip = [ - "model.transformer.action_encoder.fc.weight", - "model.transformer.action_encoder.fc.bias", - "model.transformer.action_decoder.fc.weight", - "model.transformer.action_decoder.bias.weight" - ] - new_state_dict = { - k: v for k, v in new_state_dict.items() - if k not in keys_to_skip - } - # ---- ADD THIS: Fix shared embeddings ---- + # # Layers to skip (reinitialize) + # keys_to_skip = [ + # "model.transformer.action_encoder.fc.weight", + # "model.transformer.action_encoder.fc.bias", + # "model.transformer.action_decoder.fc.weight", + # "model.transformer.action_decoder.bias.weight" + # ] + # new_state_dict = { + # k: v for k, v in new_state_dict.items() + # if k not in keys_to_skip + # } + # # ---- ADD THIS: Fix shared embeddings ---- encoder_key = "model.vlm.language_model.model.encoder.embed_tokens.weight" shared_key = "model.vlm.language_model.model.shared.weight" if encoder_key in state_dict: state_dict[shared_key] = state_dict[encoder_key] - # --- Step 5: Load into instance --- + # step 5: load into instance missing, unexpected = instance.load_state_dict(state_dict, strict=True) - print("āœ… Loaded XVLA checkpoint with modified keys.") + print("Loaded XVLA checkpoint") if missing: print(f"Missing keys: {missing}") if unexpected: print(f"Unexpected keys: {unexpected}") - # --- Step 6: Finalize --- + # step 6: finalize instance.to(config.device) instance.eval() return instance diff --git a/src/lerobot/policies/xvla/policy_postprocessor.json b/src/lerobot/policies/xvla/policy_postprocessor.json deleted file mode 100644 index e4c499d7c..000000000 --- a/src/lerobot/policies/xvla/policy_postprocessor.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "name": "policy_postprocessor", - "steps": [ - { - "registry_name": "unnormalizer_processor", - "config": { - "eps": 1e-08, - "features": { - "action": { - "type": "ACTION", - "shape": [ - 20 - ] - } - }, - "norm_map": { - "VISUAL": "MEAN_STD", - "STATE": "IDENTITY", - "ACTION": "IDENTITY" - } - } - }, - { - "registry_name": "xvla_rotation_6d_to_axis_angle", - "config": { - "expected_action_dim": 10 - } - }, - { - "registry_name": "device_processor", - "config": { - "device": "cpu", - "float_dtype": null - } - } - ] -} \ No newline at end of file diff --git a/src/lerobot/policies/xvla/policy_preprocessor.json b/src/lerobot/policies/xvla/policy_preprocessor.json deleted file mode 100644 index ce9da5010..000000000 --- a/src/lerobot/policies/xvla/policy_preprocessor.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "name": "policy_preprocessor", - "steps": [ - { - "registry_name": "rename_observations_processor", - "config": { - "rename_map": {} - } - }, - { - "registry_name": "to_batch_processor", - "config": {} - }, - { - "registry_name": "xvla_image_scale", - "config": { - "image_keys": null - } - }, - { - "registry_name": "tokenizer_processor", - "config": { - "max_length": 50, - "task_key": "task", - "padding_side": "right", - "padding": "max_length", - "truncation": true, - "tokenizer_name": "facebook/bart-large" - } - }, - { - "registry_name": "device_processor", - "config": { - "device": "cuda", - "float_dtype": null - } - }, - { - "registry_name": "xvla_add_domain_id", - "config": { - "domain_id": 3, - "device": "cuda" - } - }, - { - "registry_name": "normalizer_processor", - "config": { - "eps": 1e-08, - "features": { - "observation.images.image": { - "type": "VISUAL", - "shape": [ - 3, - 224, - 224 - ] - }, - "observation.images.image2": { - "type": "VISUAL", - "shape": [ - 3, - 224, - 224 - ] - }, - "observation.state": { - "type": "STATE", - "shape": [ - 8 - ] - }, - "action": { - "type": "ACTION", - "shape": [ - 20 - ] - } - }, - "norm_map": { - "VISUAL": "IMAGENET", - "STATE": "IDENTITY", - "ACTION": "IDENTITY" - } - } - } - ] -} \ No newline at end of file diff --git a/src/lerobot/policies/xvla/processor_xvla.py b/src/lerobot/policies/xvla/processor_xvla.py index 1b7fec659..e5fc96704 100644 --- a/src/lerobot/policies/xvla/processor_xvla.py +++ b/src/lerobot/policies/xvla/processor_xvla.py @@ -235,6 +235,9 @@ class XVLARotation6DToAxisAngleProcessorStep(ProcessorStep): # Concatenate: [eef (3), axis_angle (3), gripper (1)] = 7D action_np = np.concatenate([target_eef, target_axis, target_act], axis=-1) + + # Convert gripper action to -1 or 1 + action_np[:, -1] = np.where(action_np[:, -1] > 0.5, 1.0, -1.0) # Convert back to tensor action = torch.from_numpy(action_np).to(device=device, dtype=dtype) diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index e19e25ae9..15952e16e 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -167,14 +167,12 @@ def rollout( # Preprocess observation (includes image scaling and domain_id addition) observation = preprocessor(observation) - breakpoint() # Policy inference with torch.inference_mode(): action = policy.select_action(observation) # Postprocess action (includes rotation conversion and device transfer to CPU) action = postprocessor(action) - # Convert to numpy action_numpy: np.ndarray = action.numpy() assert action_numpy.ndim == 2, "Action dimensions should be (batch, action_dim)" diff --git a/test_2.py b/test_2.py deleted file mode 100644 index 2adac2715..000000000 --- a/test_2.py +++ /dev/null @@ -1,72 +0,0 @@ -from xvla.models.processor_xvla import XVLAProcessor -from xvla.models.modeling_xvla import XVLA -from xvla.models.configuration_xvla import XVLAConfig -import torch -import random -import numpy as np -from PIL import Image -from lerobot.policies.factory import make_policy -from lerobot.configs.policies import PreTrainedConfig -from lerobot.envs.factory import make_env_config -cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero") -model = XVLA.from_pretrained("/raid/jade/models/xvla-libero") -model.eval() -model.to("cuda") -processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero") -# /raid/jade/models/xvla-libero -# seet seed -torch.manual_seed(42) -random.seed(42) -np.random.seed(42) - -def make_random_pil_images(num_images=3, H=480, W=640): - images = [] - for _ in range(num_images): - # Random RGB image - arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8) - img = Image.fromarray(arr) - images.append(img) - return images - -# Example: -images = make_random_pil_images() -language_instruction = "This is a random image" -# Multimodal preprocessing by processor -inputs = processor(images, language_instruction) -if not {"input_ids", "image_input", "image_mask"}.issubset(inputs): - raise ValueError("Processor did not return the expected keys.") - -proprio = torch.randn(1, 20) -domain_id = torch.tensor([int(0)], dtype=torch.long) - -# Align to model's device/dtype -device = model.device -dtype = next(model.parameters()).dtype - -def to_model(t: torch.Tensor) -> torch.Tensor: - if not isinstance(t, torch.Tensor): - t = torch.as_tensor(t) - # cast floats to model dtype, keep integral/bool as-is - return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device) - -inputs = {k: to_model(v) for k, v in inputs.items()} -inputs.update({ - "proprio": to_model(proprio), - "domain_id": domain_id.to(device), -}) - -# Inference -action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() - - -#### now for lerobot model ##################################################### - -cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated") -env_cfg = make_env_config("libero", task="libero_spatial") -cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated" -policy = make_policy(cfg=cfg, env_cfg=env_cfg) -policy.eval() -policy.to("cuda") - -action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() -breakpoint() \ No newline at end of file diff --git a/test_3.py b/test_3.py deleted file mode 100644 index 26a2f6c5d..000000000 --- a/test_3.py +++ /dev/null @@ -1,107 +0,0 @@ -from lerobot.policies.factory import make_policy, make_pre_post_processors -# from lerobot.policies.xvla.configuration_xvla import XVLAConfig -from lerobot.configs.policies import PreTrainedConfig -from lerobot.envs.factory import make_env_config -from lerobot.utils.constants import OBS_IMAGES, OBS_STATE -from xvla.models.modeling_xvla import XVLA -import torch -import numpy as np -import random -torch.manual_seed(42) -random.seed(42) -np.random.seed(42) -observation_height: int = 224 -observation_width: int = 224 # todo: jadechoghari, image size is different for the two models -# create an observation dict -OBS = { - f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width), - f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width), - OBS_STATE: torch.randn(1, 20), # ONLY if OBS_STATE is already a string - "task": "put the object in the box", -} - -IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1) -IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1) -def fake_rgb(H, W): - arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8) - t = torch.from_numpy(arr).permute(2, 0, 1) # CHW - t = t.unsqueeze(0).float() - # normalize pixel to imagenet - return t - - -OBS[f"{OBS_IMAGES}.image"] = fake_rgb(observation_height, observation_width) -OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width) - -cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated") -cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated" -env_cfg = make_env_config("libero", task="libero_spatial") -policy = make_policy( - cfg=cfg, - env_cfg=env_cfg, -) - -policy.eval() - -preprocessor_overrides = { - "device_processor": {"device": str(cfg.device)}, -} - -preprocessor, postprocessor = make_pre_post_processors( - policy_cfg=cfg, - pretrained_path=cfg.pretrained_path, - preprocessor_overrides=preprocessor_overrides, -) - -observation = preprocessor(OBS) -inputs = policy._build_model_inputs(observation) - - -#### now the og model ########################################################### -from xvla.models.processing_xvla import XVLAProcessor - -processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2) -inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"]) -domain_id = torch.tensor([int(3)], dtype=torch.long) -inputs.update({ - "proprio": OBS[OBS_STATE].to("cuda"), - "domain_id": domain_id.to("cuda"), -}) - -breakpoint() -for k in inputs.keys() & inputs_1.keys(): # intersection of keys - a = inputs[k] - b = inputs_1[k].to("cuda") - - print(f"\nšŸ”Ž Key: {k}") - - # Check shape - print(" shape:", a.shape, b.shape) - - # Check if close - if torch.allclose(a, b, atol=1e-5, rtol=1e-5): - print(" āœ”ļø tensors are equal (allclose)") - else: - diff = torch.abs(a - b) - print(" āŒ tensors differ") - print(" max diff:", diff.max().item()) - print(" mean diff:", diff.mean().item()) - - -model = XVLA.from_pretrained("/raid/jade/models/xvla-libero") -model.eval() -model.to("cuda") - -action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() -# (Pdb) inputs['input_ids'].shape -# torch.Size([1, 64]) -# (Pdb) inputs_1['input_ids'].shape -# torch.Size([1, 50]) -# (Pdb) [0, 0, :, :4, 0] -action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() - -#np all close -print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2)) -print("max diff:", np.max(np.abs(action - action_1))) -print("mean diff:", np.mean(np.abs(action - action_1))) -breakpoint() \ No newline at end of file diff --git a/test_xvla.py b/test_xvla.py deleted file mode 100644 index 6b8ab182d..000000000 --- a/test_xvla.py +++ /dev/null @@ -1,37 +0,0 @@ -from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata -from lerobot.policies.factory import make_policy, make_policy_config -import os -cfg = make_policy_config("xvla") - -dataset_id = "lerobot/svla_so101_pickplace" -# This only downloads the metadata for the dataset, ~10s of MB even for large-scale datasets -dataset_metadata = LeRobotDatasetMetadata(dataset_id) -policy = make_policy(cfg=cfg, ds_meta=dataset_metadata) - -for name, param in policy.state_dict().items(): - print(name, param.shape) - - -# now let's load in safetensors -import safetensors.torch -from huggingface_hub import snapshot_download - -cache_dir = snapshot_download(repo_id="2toINF/X-VLA-Libero", repo_type="model", cache_dir="/fsx/jade_choghari/.cache/huggingface/model") -state_dict = safetensors.torch.load_file(os.path.join(cache_dir, "model.safetensors")) -# policy.load_state_dict(state_dict) -# 3. Add "model." prefix to every key -new_state_dict = {f"model.{k}": v for k, v in state_dict.items()} -keys_to_skip = [ - "model.transformer.action_encoder.fc.weight", - "model.transformer.action_encoder.fc.bias", -] - -new_state_dict = {k: v for k, v in new_state_dict.items() if k not in keys_to_skip} -# 4. Load into your model -missing, unexpected = policy.load_state_dict(new_state_dict, strict=False) - -print("missing keys:", missing) - -print() -print("unexpected keys:", unexpected) - diff --git a/train.sh b/train.sh deleted file mode 100644 index 5b73de847..000000000 --- a/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -lerobot-train \ - --dataset.repo_id=libero_dataset \ - --dataset.root=/fsx/jade_choghari/datasets/libero/ \ - --policy.type=xvla \ - --output_dir=/fsx/jade_choghari/outputs/train/xvla_libero \ - --job_name=xvla_libero \ - --policy.device=cuda \ - --policy.action_mode=franka_joint7 \ - --wandb.enable=true \ - --policy.repo_id=jadechoghari/X-VLA-Libero \ - --steps=10000 - -# # --policy.pretrained_path=/fsx/jade_choghari/.cache/huggingface/model/xvla-libero \ \ No newline at end of file diff --git a/train_multi.sh b/train_multi.sh deleted file mode 100644 index fa3668d75..000000000 --- a/train_multi.sh +++ /dev/null @@ -1,18 +0,0 @@ -accelerate launch \ - --multi_gpu \ - --num_processes=4 \ - --mixed_precision=fp16 \ - $(which lerobot-train) \ - --batch_size=32 \ - --save_freq=5000 \ - --num_workers=32 \ - --dataset.repo_id=libero_dataset \ - --dataset.root=/fsx/jade_choghari/datasets/libero/ \ - --policy.type=xvla \ - --output_dir=/fsx/jade_choghari/outputs/train/xvla_libero_multi \ - --job_name=xvla_libero \ - --policy.device=cuda \ - --policy.action_mode=franka_joint7 \ - --wandb.enable=true \ - --policy.repo_id=jadechoghari/X-VLA-Libero \ - --steps=10000 diff --git a/xvla b/xvla deleted file mode 160000 index e2f0554f8..000000000 --- a/xvla +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e2f0554f8ce8ab19c678652f9c30c431a37b7bbd