diff --git a/eval.sh b/eval.sh index 45acb100c..37b67ad3b 100644 --- a/eval.sh +++ b/eval.sh @@ -6,4 +6,3 @@ lerobot-eval \ --eval.batch_size=1 \ --eval.n_episodes=1 \ --seed=142 - diff --git a/examples/tutorial/xvla/inference.py b/examples/tutorial/xvla/inference.py index 00511d72c..8f59adaeb 100644 --- a/examples/tutorial/xvla/inference.py +++ b/examples/tutorial/xvla/inference.py @@ -1,15 +1,13 @@ -from transformers import AutoModel, AutoProcessor -import json_numpy import numpy as np import torch from PIL import Image +from transformers import AutoModel, AutoProcessor -model = AutoModel.from_pretrained( - "2toINF/X-VLA-WidowX", - trust_remote_code=True -) +model = AutoModel.from_pretrained("2toINF/X-VLA-WidowX", trust_remote_code=True) processor = AutoProcessor.from_pretrained("2toINF/X-VLA-WidowX", trust_remote_code=True) + + # append 3 random image to a list def make_random_pil_images(num_images=3, H=480, W=640): images = [] @@ -20,6 +18,7 @@ def make_random_pil_images(num_images=3, H=480, W=640): images.append(img) return images + # Example: images = make_random_pil_images() language_instruction = "This is a random image" @@ -29,23 +28,27 @@ if not {"input_ids", "image_input", "image_mask"}.issubset(inputs): raise ValueError("Processor did not return the expected keys.") proprio = torch.randn(1, 20) -domain_id = torch.tensor([int(0)], dtype=torch.long) +domain_id = torch.tensor([0], dtype=torch.long) # Align to model's device/dtype device = model.device dtype = next(model.parameters()).dtype + def to_model(t: torch.Tensor) -> torch.Tensor: if not isinstance(t, torch.Tensor): t = torch.as_tensor(t) # cast floats to model dtype, keep integral/bool as-is return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device) + inputs = {k: to_model(v) for k, v in inputs.items()} -inputs.update({ - "proprio": to_model(proprio), - "domain_id": domain_id.to(device), -}) +inputs.update( + { + "proprio": to_model(proprio), + "domain_id": domain_id.to(device), + } +) # Inference diff --git a/examples/tutorial/xvla/inference_pipe.py b/examples/tutorial/xvla/inference_pipe.py index c60a4e305..93bff30ca 100644 --- a/examples/tutorial/xvla/inference_pipe.py +++ b/examples/tutorial/xvla/inference_pipe.py @@ -1,11 +1,12 @@ -from lerobot.policies.factory import make_policy, make_pre_post_processors +import numpy as np +import torch + # from lerobot.policies.xvla.configuration_xvla import XVLAConfig from lerobot.configs.policies import PreTrainedConfig from lerobot.envs.factory import make_env_config -from lerobot.policies.xvla.utils import Rotate6D_to_AxisAngle +from lerobot.policies.factory import make_policy, make_pre_post_processors +from lerobot.policies.xvla.utils import rotate6d_to_axis_angle from lerobot.utils.constants import OBS_IMAGES, OBS_STATE -import torch -import numpy as np observation_height: int = 360 observation_width: int = 360 @@ -13,18 +14,22 @@ observation_width: int = 360 OBS = { f"{OBS_IMAGES}.image1": torch.randn(1, 3, observation_height, observation_width), f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width), - OBS_STATE: torch.randn(1, 9), # ONLY if OBS_STATE is already a string + OBS_STATE: torch.randn(1, 9), # ONLY if OBS_STATE is already a string "task": "put the object in the box", } + + def fake_rgb(H, W): img = torch.randint(0, 255, (H, W, 3), dtype=torch.uint8).numpy() return img + OBS[f"{OBS_IMAGES}.image1"] = fake_rgb(observation_height, observation_width) OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width) # observation = preprocessor(OBS) from transformers import AutoProcessor + processor = AutoProcessor.from_pretrained("2toINF/X-VLA-WidowX", num_views=2, trust_remote_code=True) inputs = processor([OBS[f"{OBS_IMAGES}.image1"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"]) breakpoint() @@ -40,19 +45,19 @@ policy = make_policy( policy.eval() preprocessor_overrides = { - "device_processor": {"device": str(cfg.device)}, + "device_processor": {"device": str(cfg.device)}, } preprocessor, postprocessor = make_pre_post_processors( - policy_cfg=cfg, - pretrained_path=cfg.pretrained_path, - preprocessor_overrides=preprocessor_overrides, + policy_cfg=cfg, + pretrained_path=cfg.pretrained_path, + preprocessor_overrides=preprocessor_overrides, ) observation = preprocessor(OBS) action = policy.select_action(observation) target_eef = action[:, :3].to("cpu").numpy() -target_axis = Rotate6D_to_AxisAngle(action[:, 3:9].to("cpu").numpy()) +target_axis = rotate6d_to_axis_angle(action[:, 3:9].to("cpu").numpy()) target_act = action[:, 9:10].to("cpu").numpy() final_action = np.concatenate([target_eef, target_axis, target_act], axis=-1) diff --git a/examples/tutorial/xvla/test_xvla.py b/examples/tutorial/xvla/test_xvla.py index cc6e03df7..d001b9877 100644 --- a/examples/tutorial/xvla/test_xvla.py +++ b/examples/tutorial/xvla/test_xvla.py @@ -1,6 +1,8 @@ +import os + from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata from lerobot.policies.factory import make_policy, make_policy_config -import os + cfg = make_policy_config("xvla") dataset_id = "lerobot/svla_so101_pickplace" @@ -16,7 +18,9 @@ for name, param in policy.state_dict().items(): import safetensors.torch from huggingface_hub import snapshot_download -cache_dir = snapshot_download(repo_id="2toINF/X-VLA-Libero", repo_type="model", cache_dir="/fsx/jade_choghari/.cache/huggingface/model") +cache_dir = snapshot_download( + repo_id="2toINF/X-VLA-Libero", repo_type="model", cache_dir="/fsx/jade_choghari/.cache/huggingface/model" +) state_dict = safetensors.torch.load_file(os.path.join(cache_dir, "model.safetensors")) # policy.load_state_dict(state_dict) # 3. Add "model." prefix to every key @@ -36,34 +40,38 @@ print() print("unexpected keys:", unexpected) +import random + +import numpy as np +import torch +from xvla.models.modeling_xvla import XVLA -from lerobot.policies.factory import make_policy, make_pre_post_processors # from lerobot.policies.xvla.configuration_xvla import XVLAConfig from lerobot.configs.policies import PreTrainedConfig from lerobot.envs.factory import make_env_config +from lerobot.policies.factory import make_policy, make_pre_post_processors from lerobot.utils.constants import OBS_IMAGES, OBS_STATE -from xvla.models.modeling_xvla import XVLA -import torch -import numpy as np -import random + torch.manual_seed(42) random.seed(42) np.random.seed(42) observation_height: int = 224 -observation_width: int = 224 # todo: jadechoghari, image size is different for the two models +observation_width: int = 224 # todo: jadechoghari, image size is different for the two models # create an observation dict OBS = { f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width), f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width), - OBS_STATE: torch.randn(1, 20), # ONLY if OBS_STATE is already a string + OBS_STATE: torch.randn(1, 20), # ONLY if OBS_STATE is already a string "task": "put the object in the box", } -IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1) -IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1) +IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1) +IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1) + + def fake_rgb(H, W): arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8) - t = torch.from_numpy(arr).permute(2, 0, 1) # CHW + t = torch.from_numpy(arr).permute(2, 0, 1) # CHW t = t.unsqueeze(0).float() # normalize pixel to imagenet return t @@ -83,13 +91,13 @@ policy = make_policy( policy.eval() preprocessor_overrides = { - "device_processor": {"device": str(cfg.device)}, + "device_processor": {"device": str(cfg.device)}, } preprocessor, postprocessor = make_pre_post_processors( - policy_cfg=cfg, - pretrained_path=cfg.pretrained_path, - preprocessor_overrides=preprocessor_overrides, + policy_cfg=cfg, + pretrained_path=cfg.pretrained_path, + preprocessor_overrides=preprocessor_overrides, ) observation = preprocessor(OBS) @@ -101,14 +109,16 @@ from xvla.models.processing_xvla import XVLAProcessor processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2) inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"]) -domain_id = torch.tensor([int(3)], dtype=torch.long) -inputs.update({ - "proprio": OBS[OBS_STATE].to("cuda"), - "domain_id": domain_id.to("cuda"), -}) +domain_id = torch.tensor([3], dtype=torch.long) +inputs.update( + { + "proprio": OBS[OBS_STATE].to("cuda"), + "domain_id": domain_id.to("cuda"), + } +) -for k in inputs.keys() & inputs_1.keys(): # intersection of keys +for k in inputs.keys() & inputs_1.keys(): # intersection of keys a = inputs[k] b = inputs_1[k].to("cuda") @@ -139,22 +149,25 @@ action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().num # (Pdb) [0, 0, :, :4, 0] action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() -#np all close +# np all close print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2)) print("max diff:", np.max(np.abs(action - action_1))) print("mean diff:", np.mean(np.abs(action - action_1))) -from xvla.models.processor_xvla import XVLAProcessor -from xvla.models.modeling_xvla import XVLA -from xvla.models.configuration_xvla import XVLAConfig -import torch import random + import numpy as np +import torch from PIL import Image -from lerobot.policies.factory import make_policy +from xvla.models.configuration_xvla import XVLAConfig +from xvla.models.modeling_xvla import XVLA +from xvla.models.processor_xvla import XVLAProcessor + from lerobot.configs.policies import PreTrainedConfig from lerobot.envs.factory import make_env_config +from lerobot.policies.factory import make_policy + cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero") model = XVLA.from_pretrained("/raid/jade/models/xvla-libero") model.eval() @@ -166,6 +179,7 @@ torch.manual_seed(42) random.seed(42) np.random.seed(42) + def make_random_pil_images(num_images=3, H=480, W=640): images = [] for _ in range(num_images): @@ -175,6 +189,7 @@ def make_random_pil_images(num_images=3, H=480, W=640): images.append(img) return images + # Example: images = make_random_pil_images() language_instruction = "This is a random image" @@ -184,23 +199,27 @@ if not {"input_ids", "image_input", "image_mask"}.issubset(inputs): raise ValueError("Processor did not return the expected keys.") proprio = torch.randn(1, 20) -domain_id = torch.tensor([int(0)], dtype=torch.long) +domain_id = torch.tensor([0], dtype=torch.long) # Align to model's device/dtype device = model.device dtype = next(model.parameters()).dtype + def to_model(t: torch.Tensor) -> torch.Tensor: if not isinstance(t, torch.Tensor): t = torch.as_tensor(t) # cast floats to model dtype, keep integral/bool as-is return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device) + inputs = {k: to_model(v) for k, v in inputs.items()} -inputs.update({ - "proprio": to_model(proprio), - "domain_id": domain_id.to(device), -}) +inputs.update( + { + "proprio": to_model(proprio), + "domain_id": domain_id.to(device), + } +) # Inference action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy() diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index f0ee22503..58f4e051a 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -29,7 +29,9 @@ from gymnasium import spaces from libero.libero import benchmark, get_libero_path from libero.libero.envs import OffScreenRenderEnv from robosuite.utils.transform_utils import quat2axisangle -from lerobot.policies.xvla.utils import Mat_to_Rotate6D + +from lerobot.policies.xvla.utils import mat_to_rotate6d + def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: """Normalize camera_name into a non-empty list of strings.""" @@ -215,7 +217,7 @@ class LiberoEnv(gym.Env): if camera_name == "agentview_image": image = image[::-1, ::-1] # rotate 180 degrees images[self.camera_name_mapping[camera_name]] = image - + if self.action_type == "rel": state = np.concatenate( ( @@ -227,13 +229,15 @@ class LiberoEnv(gym.Env): # TODO: jadechoghari, this is an ugly quick workaround for XVLA states. # we will open a new PR to handle this in a preprocessor. elif self.action_type == "abs": - robo_ori = Mat_to_Rotate6D(self._env.robots[0].controller.ee_ori_mat) + robo_ori = mat_to_rotate6d(self._env.robots[0].controller.ee_ori_mat) robo_pos = self._env.robots[0].controller.ee_pos proprio = np.concatenate([robo_pos, robo_ori, np.array([0.0])], axis=-1) state = np.concatenate([proprio, np.zeros_like(proprio)], axis=-1) else: - raise NotImplementedError(f"The action type '{self.action_type}' is not supported in LiberoEnv. " - "Please switch to an action type (e.g. 'rel', 'abs').") + raise NotImplementedError( + f"The action type '{self.action_type}' is not supported in LiberoEnv. " + "Please switch to an action type (e.g. 'rel', 'abs')." + ) agent_pos = state if self.obs_type == "pixels": return {"pixels": images.copy()} @@ -252,7 +256,7 @@ class LiberoEnv(gym.Env): self._env.seed(seed) if self.init_states and self._init_states is not None: self._env.set_init_state(self._init_states[self._init_state_id]) - + raw_obs = self._env.reset() # After reset, objects may be unstable (slightly floating, intersecting, etc.). @@ -261,7 +265,7 @@ class LiberoEnv(gym.Env): for _ in range(self.num_steps_wait): action = np.array(get_libero_dummy_action()) raw_obs, _, _, _ = self._env.step(action) - + observation = self._format_raw_obs(raw_obs) for robot in self._env.robots: robot.controller.use_delta = False diff --git a/src/lerobot/policies/xvla/__init__.py b/src/lerobot/policies/xvla/__init__.py index 84fecda43..822a4797e 100644 --- a/src/lerobot/policies/xvla/__init__.py +++ b/src/lerobot/policies/xvla/__init__.py @@ -1,6 +1,6 @@ from lerobot.policies.xvla.processor_xvla import ( - make_xvla_pre_post_processors, - XVLAImageScaleProcessorStep, - XVLAAddDomainIdProcessorStep, - XVLARotation6DToAxisAngleProcessorStep, - ) \ No newline at end of file + XVLAAddDomainIdProcessorStep, + XVLAImageScaleProcessorStep, + XVLARotation6DToAxisAngleProcessorStep, + make_xvla_pre_post_processors, +) diff --git a/src/lerobot/policies/xvla/configuration_florence2.py b/src/lerobot/policies/xvla/configuration_florence2.py index 7dec9d68e..1a66cf3e2 100644 --- a/src/lerobot/policies/xvla/configuration_florence2.py +++ b/src/lerobot/policies/xvla/configuration_florence2.py @@ -187,7 +187,7 @@ class Florence2LanguageConfig(PretrainedConfig): >>> configuration = Florence2LanguageConfig() >>> # Initializing a model (with random weights) - >>> model = Florence2LangaugeModel(configuration) + >>> model = Florence2LanguageModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/lerobot/policies/xvla/configuration_xvla.py b/src/lerobot/policies/xvla/configuration_xvla.py index 865896277..c6c3f1f25 100644 --- a/src/lerobot/policies/xvla/configuration_xvla.py +++ b/src/lerobot/policies/xvla/configuration_xvla.py @@ -28,8 +28,6 @@ from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig from lerobot.utils.constants import OBS_IMAGES from .configuration_florence2 import Florence2Config -from .configuration_florence2 import Florence2VisionConfig -from .configuration_florence2 import Florence2LanguageConfig @PreTrainedConfig.register_subclass("xvla") @@ -72,7 +70,7 @@ class XVLAConfig(PreTrainedConfig): num_domains: int = 30 len_soft_prompts: int = 32 dim_time: int = 32 - max_len_seq: int = 512 #TODO: jadechoghari: change to 512 1024 + max_len_seq: int = 512 # TODO: jadechoghari: change to 512 1024 use_hetero_proj: bool = False # Action & proprioception diff --git a/src/lerobot/policies/xvla/modeling_florence2.py b/src/lerobot/policies/xvla/modeling_florence2.py index 84f075807..e61689568 100644 --- a/src/lerobot/policies/xvla/modeling_florence2.py +++ b/src/lerobot/policies/xvla/modeling_florence2.py @@ -496,8 +496,8 @@ class DaViT(nn.Module): drop_path_rate (float): Stochastic depth rate. Default: 0.1. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. enable_checkpoint (bool): If True, enable checkpointing. Default: False. - conv_at_attn (bool): If True, performe depthwise convolution before attention layer. Default: True. - conv_at_ffn (bool): If True, performe depthwise convolution before ffn layer. Default: True. + conv_at_attn (bool): If True, perform depthwise convolution before attention layer. Default: True. + conv_at_ffn (bool): If True, perform depthwise convolution before ffn layer. Default: True. """ def __init__( @@ -892,7 +892,7 @@ class Florence2FlashAttention2(Florence2Attention): super().__init__(*args, **kwargs) # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() diff --git a/src/lerobot/policies/xvla/modeling_xvla.py b/src/lerobot/policies/xvla/modeling_xvla.py index 8eadffffa..a8ee2eae0 100644 --- a/src/lerobot/policies/xvla/modeling_xvla.py +++ b/src/lerobot/policies/xvla/modeling_xvla.py @@ -18,6 +18,7 @@ from __future__ import annotations +import os from collections import deque import torch @@ -34,7 +35,6 @@ from .configuration_xvla import XVLAConfig from .modeling_florence2 import Florence2ForConditionalGeneration from .transformer import SoftPromptedTransformer -import os class XVLAModel(nn.Module): """ @@ -94,11 +94,10 @@ class XVLAModel(nn.Module): batch_size, num_views = pixel_values.shape[:2] flat_mask = image_mask.view(-1).to(dtype=torch.bool) flat_images = pixel_values.flatten(0, 1) - #TODO: jadechoghari: remove this resizing logic, and provide a way in training to do this + # TODO: jadechoghari: remove this resizing logic, and provide a way in training to do this # target_size = (224, 224) # flat_images = F.interpolate(flat_images, size=target_size, mode="bilinear", align_corners=False) - num_valid = int(flat_mask.sum().item()) if num_valid == 0: raise ValueError("At least one image view must be valid per batch.") @@ -341,13 +340,13 @@ class XVLAPolicy(PreTrainedPolicy): self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) return self._queues[ACTION].popleft() - + @classmethod def from_pretrained( cls, pretrained_name_or_path: str | Path, *, - config: "PreTrainedConfig" | None = None, + config: PreTrainedConfig | None = None, force_download: bool = False, resume_download: bool | None = None, proxies: dict | None = None, @@ -364,6 +363,7 @@ class XVLAPolicy(PreTrainedPolicy): - skip list for layers that should remain randomly initialized """ import safetensors.torch + # --- Step 1: Load config --- if config is None: config = PreTrainedConfig.from_pretrained( @@ -386,6 +386,9 @@ class XVLAPolicy(PreTrainedPolicy): model_file = os.path.join(model_id, "model.safetensors") else: try: + from huggingface_hub import hf_hub_download + from huggingface_hub.utils import HfHubHTTPError + model_file = hf_hub_download( repo_id=model_id, filename="model.safetensors", @@ -398,9 +401,7 @@ class XVLAPolicy(PreTrainedPolicy): local_files_only=local_files_only, ) except HfHubHTTPError as e: - raise FileNotFoundError( - f"model.safetensors not found on the Hub at {model_id}" - ) from e + raise FileNotFoundError(f"model.safetensors not found on the Hub at {model_id}") from e print(f"Loading checkpoint from {model_file}") state_dict = safetensors.torch.load_file(model_file) @@ -421,7 +422,7 @@ class XVLAPolicy(PreTrainedPolicy): # } # # ---- ADD THIS: Fix shared embeddings ---- encoder_key = "model.vlm.language_model.model.encoder.embed_tokens.weight" - shared_key = "model.vlm.language_model.model.shared.weight" + shared_key = "model.vlm.language_model.model.shared.weight" if encoder_key in state_dict: state_dict[shared_key] = state_dict[encoder_key] # step 5: load into instance @@ -431,7 +432,7 @@ class XVLAPolicy(PreTrainedPolicy): print(f"Missing keys: {missing}") if unexpected: print(f"Unexpected keys: {unexpected}") - + # step 6: finalize instance.to(config.device) instance.eval() diff --git a/src/lerobot/policies/xvla/processor_xvla.py b/src/lerobot/policies/xvla/processor_xvla.py index e5fc96704..5296f2131 100644 --- a/src/lerobot/policies/xvla/processor_xvla.py +++ b/src/lerobot/policies/xvla/processor_xvla.py @@ -21,7 +21,7 @@ import numpy as np import torch from lerobot.policies.xvla.configuration_xvla import XVLAConfig -from lerobot.policies.xvla.utils import Rotate6D_to_AxisAngle +from lerobot.policies.xvla.utils import rotate6d_to_axis_angle from lerobot.processor import ( AddBatchDimensionProcessorStep, DeviceProcessorStep, @@ -38,6 +38,7 @@ from lerobot.processor.converters import policy_action_to_transition, transition from lerobot.processor.core import EnvTransition, TransitionKey from lerobot.utils.constants import POLICY_POSTPROCESSOR_DEFAULT_NAME, POLICY_PREPROCESSOR_DEFAULT_NAME + def make_xvla_pre_post_processors( config: XVLAConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None, @@ -94,45 +95,45 @@ def make_xvla_pre_post_processors( @ProcessorStepRegistry.register(name="xvla_image_scale") class XVLAImageScaleProcessorStep(ProcessorStep): """Scale image observations by 255 to convert from [0, 1] to [0, 255] range. - + This processor step multiplies all image observations by 255, which is required for XVLA models that expect images in uint8-like range. - + Args: image_keys: List of observation keys that contain images to scale. If None, will automatically detect keys starting with "observation.images." """ - + image_keys: list[str] | None = None - + def __call__(self, transition: EnvTransition) -> EnvTransition: """Scale image observations by 255.""" new_transition = transition.copy() obs = new_transition.get(TransitionKey.OBSERVATION, {}) if obs is None: return new_transition - + # Make a copy of observations to avoid modifying the original obs = obs.copy() - + # Determine which keys to scale keys_to_scale = self.image_keys if keys_to_scale is None: # Auto-detect image keys keys_to_scale = [k for k in obs.keys() if k.startswith("observation.images.")] - + # Scale each image for key in keys_to_scale: if key in obs and isinstance(obs[key], torch.Tensor): obs[key] = obs[key] * 255 - + new_transition[TransitionKey.OBSERVATION] = obs return new_transition - + def transform_features(self, features): """Image scaling doesn't change feature structure.""" return features - + def get_config(self) -> dict[str, Any]: """Return serializable configuration.""" return { @@ -144,18 +145,18 @@ class XVLAImageScaleProcessorStep(ProcessorStep): @ProcessorStepRegistry.register(name="xvla_add_domain_id") class XVLAAddDomainIdProcessorStep(ProcessorStep): """Add domain_id to complementary data. - + This processor step adds a domain_id tensor to the complementary data, which is used by XVLA to identify different robot embodiments or task domains. - + Args: domain_id: The domain ID to add (default: 3) device: Device to place the domain_id tensor on (default: "cuda") """ - + domain_id: int = 3 device: str = "cuda" - + def __call__(self, transition: EnvTransition) -> EnvTransition: """Add domain_id to complementary data.""" new_transition = transition.copy() @@ -164,7 +165,7 @@ class XVLAAddDomainIdProcessorStep(ProcessorStep): comp = {} else: comp = comp.copy() - + # Infer batch size from observation tensors obs = new_transition.get(TransitionKey.OBSERVATION, {}) batch_size = 1 @@ -173,17 +174,17 @@ class XVLAAddDomainIdProcessorStep(ProcessorStep): if isinstance(v, torch.Tensor): batch_size = v.shape[0] break - + # Add domain_id tensor comp["domain_id"] = torch.tensor([int(self.domain_id)] * batch_size, dtype=torch.long).to(self.device) - + new_transition[TransitionKey.COMPLEMENTARY_DATA] = comp return new_transition - + def transform_features(self, features): """Domain ID addition doesn't change feature structure.""" return features - + def get_config(self) -> dict[str, Any]: """Return serializable configuration.""" return { @@ -196,61 +197,61 @@ class XVLAAddDomainIdProcessorStep(ProcessorStep): @ProcessorStepRegistry.register(name="xvla_rotation_6d_to_axis_angle") class XVLARotation6DToAxisAngleProcessorStep(ProcessorStep): """Convert 6D rotation representation to axis-angle and reorganize action dimensions. - + This processor step takes actions with 6D rotation representation and converts them to axis-angle representation, reorganizing the action dimensions as: - action[:, :3] -> target_eef (end-effector position) - action[:, 3:9] -> 6D rotation (converted to axis-angle, 3D) - action[:, 9:10] -> gripper action - + Final output: [target_eef (3), axis_angle (3), gripper (1)] = 7D action - + Args: expected_action_dim: Expected input action dimension (default: 10, supports 6D rotation + extras) """ - + expected_action_dim: int = 10 - + def __call__(self, transition: EnvTransition) -> EnvTransition: """Convert 6D rotation to axis-angle in action.""" new_transition = transition.copy() action = new_transition.get(TransitionKey.ACTION) - + if action is None or not isinstance(action, torch.Tensor): return new_transition - + # Convert to numpy for processing device = action.device dtype = action.dtype action_np = action.cpu().numpy() - + # Extract components # action shape: (B, D) where D >= 10 target_eef = action_np[:, :3] # (B, 3) rotation_6d = action_np[:, 3:9] # (B, 6) target_act = action_np[:, 9:10] # (B, 1) - + # Convert 6D rotation to axis-angle - target_axis = Rotate6D_to_AxisAngle(rotation_6d) # (B, 3) - + target_axis = rotate6d_to_axis_angle(rotation_6d) # (B, 3) + # Concatenate: [eef (3), axis_angle (3), gripper (1)] = 7D action_np = np.concatenate([target_eef, target_axis, target_act], axis=-1) # Convert gripper action to -1 or 1 action_np[:, -1] = np.where(action_np[:, -1] > 0.5, 1.0, -1.0) - + # Convert back to tensor action = torch.from_numpy(action_np).to(device=device, dtype=dtype) - + new_transition[TransitionKey.ACTION] = action return new_transition - + def transform_features(self, features): """Rotation conversion changes action dimension from 10 to 7.""" # Note: This is a simplified version. In practice, you might want to # update the action feature shape in the features dict. return features - + def get_config(self) -> dict[str, Any]: """Return serializable configuration.""" return { diff --git a/src/lerobot/policies/xvla/utils.py b/src/lerobot/policies/xvla/utils.py index 0603777ac..abd02edcc 100644 --- a/src/lerobot/policies/xvla/utils.py +++ b/src/lerobot/policies/xvla/utils.py @@ -1,48 +1,51 @@ import numpy as np import robosuite.utils.transform_utils as T -def Rotate6D_to_AxisAngle(r6d): - """ - r6d: np.ndarray, shape (N, 6) - return: np.ndarray, shape (N, 3), axis-angle vectors - """ - flag = 0 - if len(r6d.shape) == 1: - r6d = r6d[None, ...] - flag = 1 - - a1 = r6d[:, 0:3] - a2 = r6d[:, 3:6] - - # b1 - b1 = a1 / (np.linalg.norm(a1, axis=-1, keepdims=True) + 1e-6) - - # b2 - dot_prod = np.sum(b1 * a2, axis=-1, keepdims=True) - b2_orth = a2 - dot_prod * b1 - b2 = b2_orth / (np.linalg.norm(b2_orth, axis=-1, keepdims=True) + 1e-6) - - # b3 - b3 = np.cross(b1, b2, axis=-1) - - R = np.stack([b1, b2, b3], axis=-1) # shape: (N, 3, 3) - - axis_angle_list = [] - for i in range(R.shape[0]): - quat = T.mat2quat(R[i]) - axis_angle = T.quat2axisangle(quat) - axis_angle_list.append(axis_angle) - - axis_angle_array = np.stack(axis_angle_list, axis=0) # shape: (N, 3) - - if flag == 1: - axis_angle_array = axis_angle_array[0] - - return axis_angle_array - -def Mat_to_Rotate6D(abs_action): - if len(abs_action.shape) == 2: - return np.concatenate([abs_action[:3, 0], abs_action[:3, 1]], axis=-1) - elif len(abs_action.shape) == 3: - return np.concatenate([abs_action[:, :3, 0], abs_action[:, :3, 1]], axis=-1) - else: - raise NotImplementedError \ No newline at end of file + + +def rotate6d_to_axis_angle(r6d): + """ + r6d: np.ndarray, shape (N, 6) + return: np.ndarray, shape (N, 3), axis-angle vectors + """ + flag = 0 + if len(r6d.shape) == 1: + r6d = r6d[None, ...] + flag = 1 + + a1 = r6d[:, 0:3] + a2 = r6d[:, 3:6] + + # b1 + b1 = a1 / (np.linalg.norm(a1, axis=-1, keepdims=True) + 1e-6) + + # b2 + dot_prod = np.sum(b1 * a2, axis=-1, keepdims=True) + b2_orth = a2 - dot_prod * b1 + b2 = b2_orth / (np.linalg.norm(b2_orth, axis=-1, keepdims=True) + 1e-6) + + # b3 + b3 = np.cross(b1, b2, axis=-1) + + R = np.stack([b1, b2, b3], axis=-1) # shape: (N, 3, 3) + + axis_angle_list = [] + for i in range(R.shape[0]): + quat = T.mat2quat(R[i]) + axis_angle = T.quat2axisangle(quat) + axis_angle_list.append(axis_angle) + + axis_angle_array = np.stack(axis_angle_list, axis=0) # shape: (N, 3) + + if flag == 1: + axis_angle_array = axis_angle_array[0] + + return axis_angle_array + + +def mat_to_rotate6d(abs_action): + if len(abs_action.shape) == 2: + return np.concatenate([abs_action[:3, 0], abs_action[:3, 1]], axis=-1) + elif len(abs_action.shape) == 3: + return np.concatenate([abs_action[:, :3, 0], abs_action[:, :3, 1]], axis=-1) + else: + raise NotImplementedError diff --git a/src/lerobot/processor/migrate_policy_normalization.py b/src/lerobot/processor/migrate_policy_normalization.py index 0b614c7c0..e602bb5d0 100644 --- a/src/lerobot/processor/migrate_policy_normalization.py +++ b/src/lerobot/processor/migrate_policy_normalization.py @@ -325,7 +325,9 @@ def load_state_dict_with_missing_key_handling( Returns: List of problematic missing keys that weren't in the whitelist. """ - state_dict['model.vlm.language_model.model.encoder.embed_tokens.weight'] = state_dict['model.vlm.language_model.model.shared.weight'].clone() + state_dict["model.vlm.language_model.model.encoder.embed_tokens.weight"] = state_dict[ + "model.vlm.language_model.model.shared.weight" + ].clone() # Load the cleaned state dict with strict=False to capture missing/unexpected keys load_result = policy.load_state_dict(state_dict, strict=False) diff --git a/src/lerobot/processor/normalize_processor.py b/src/lerobot/processor/normalize_processor.py index 4ab09beaf..87d8ae267 100644 --- a/src/lerobot/processor/normalize_processor.py +++ b/src/lerobot/processor/normalize_processor.py @@ -25,9 +25,10 @@ import torch from torch import Tensor from lerobot.configs.types import FeatureType, NormalizationMode, PipelineFeatureType, PolicyFeature +from lerobot.datasets.factory import IMAGENET_STATS from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.utils.constants import ACTION -from lerobot.datasets.factory import IMAGENET_STATS + from .converters import from_tensor_to_numpy, to_tensor from .core import EnvTransition, PolicyAction, TransitionKey from .pipeline import PolicyProcessorPipeline, ProcessorStep, ProcessorStepRegistry @@ -303,7 +304,11 @@ class _NormalizationMixin: ValueError: If an unsupported normalization mode is encountered. """ norm_mode = self.norm_map.get(feature_type, NormalizationMode.IDENTITY) - if norm_mode == NormalizationMode.IDENTITY or key not in self._tensor_stats and norm_mode != NormalizationMode.IMAGENET: + if ( + norm_mode == NormalizationMode.IDENTITY + or key not in self._tensor_stats + and norm_mode != NormalizationMode.IMAGENET + ): return tensor if norm_mode not in ( NormalizationMode.MEAN_STD, diff --git a/src/lerobot/processor/pipeline.py b/src/lerobot/processor/pipeline.py index 9dcfbe806..e14d8b0b9 100644 --- a/src/lerobot/processor/pipeline.py +++ b/src/lerobot/processor/pipeline.py @@ -55,6 +55,7 @@ from .core import EnvAction, EnvTransition, PolicyAction, RobotAction, Transitio TInput = TypeVar("TInput") TOutput = TypeVar("TOutput") + class ProcessorStepRegistry: """A registry for ProcessorStep classes to allow instantiation from a string name. diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py index 15952e16e..0d66fa1aa 100644 --- a/src/lerobot/scripts/lerobot_eval.py +++ b/src/lerobot/scripts/lerobot_eval.py @@ -45,6 +45,7 @@ Note that in both examples, the repo/folder should contain at least `config.json You can learn about the CLI options for this script in the `EvalPipelineConfig` in lerobot/configs/eval.py """ + import concurrent.futures as cf import json import logging @@ -89,6 +90,7 @@ from lerobot.utils.utils import ( inside_slurm, ) + def rollout( env: gym.vector.VectorEnv, policy: PreTrainedPolicy, @@ -153,7 +155,6 @@ def rollout( disable=inside_slurm(), # we dont want progress bar when we use slurm, since it clutters the logs leave=False, ) - check_env_attributes_and_types(env) while not np.all(done) and step < max_steps: # Numpy array to tensor and changing dictionary keys to LeRobot policy format. @@ -164,17 +165,13 @@ def rollout( # Infer "task" from attributes of environments. # TODO: works with SyncVectorEnv but not AsyncVectorEnv observation = add_envs_task(env, observation) - - # Preprocess observation (includes image scaling and domain_id addition) observation = preprocessor(observation) - # Policy inference with torch.inference_mode(): action = policy.select_action(observation) - - # Postprocess action (includes rotation conversion and device transfer to CPU) action = postprocessor(action) - # Convert to numpy - action_numpy: np.ndarray = action.numpy() + + # Convert to CPU / numpy. + action_numpy: np.ndarray = action.to("cpu").numpy() assert action_numpy.ndim == 2, "Action dimensions should be (batch, action_dim)" # Apply the next action. @@ -500,6 +497,7 @@ def eval_main(cfg: EvalPipelineConfig): envs = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) logging.info("Making policy.") + policy = make_policy( cfg=cfg.policy, env_cfg=cfg.env,