mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-26 14:09:47 +00:00
upgrade transformers version
This commit is contained in:
@@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import importlib
|
||||
from typing import Any
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import gymnasium as gym
|
||||
from gymnasium.envs.registration import registry as gym_registry
|
||||
@@ -22,10 +22,16 @@ from gymnasium.envs.registration import registry as gym_registry
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.envs.configs import AlohaEnv, EnvConfig, LiberoEnv, PushtEnv
|
||||
from lerobot.envs.utils import _call_make_env, _download_hub_file, _import_hub_module, _normalize_hub_result
|
||||
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
|
||||
from lerobot.processor import ProcessorStep
|
||||
from lerobot.processor.env_processor import LiberoProcessorStep
|
||||
from lerobot.processor.pipeline import PolicyProcessorPipeline
|
||||
from lerobot.utils.import_utils import _transformers_available
|
||||
|
||||
# Conditional import for type checking and lazy loading
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
|
||||
else:
|
||||
XVLAConfig = None
|
||||
|
||||
|
||||
def make_env_config(env_type: str, **kwargs) -> EnvConfig:
|
||||
|
||||
@@ -49,7 +49,7 @@ class XVLAConfig(PreTrainedConfig):
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"STATE": NormalizationMode.MEAN_STD,
|
||||
"STATE": NormalizationMode.IDENTITY,
|
||||
"ACTION": NormalizationMode.MEAN_STD,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -2350,22 +2350,8 @@ class Florence2PreTrainedModel(PreTrainedModel):
|
||||
base_model_prefix = "model"
|
||||
supports_gradient_checkpointing = True
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
|
||||
@property
|
||||
def _supports_flash_attn_2(self):
|
||||
"""
|
||||
Retrieve language_model's attribute to check whether the model supports
|
||||
Flash Attention 2 or not.
|
||||
"""
|
||||
return self.language_model._supports_flash_attn_2
|
||||
|
||||
@property
|
||||
def _supports_sdpa(self):
|
||||
"""
|
||||
Retrieve language_model's attribute to check whether the model supports
|
||||
SDPA or not.
|
||||
"""
|
||||
return self.language_model._supports_sdpa
|
||||
_supports_flash_attn_2 = True
|
||||
_supports_sdpa = True
|
||||
|
||||
|
||||
FLORENCE2_INPUTS_DOCSTRING = r"""
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import builtins
|
||||
import os
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
@@ -26,13 +27,14 @@ import torch
|
||||
import torch.nn.functional as F # noqa: N812
|
||||
from torch import Tensor, nn
|
||||
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.policies.pretrained import PreTrainedPolicy, T
|
||||
from lerobot.policies.utils import populate_queues
|
||||
from lerobot.utils.constants import ACTION, OBS_LANGUAGE_TOKENS, OBS_STATE
|
||||
|
||||
from .action_hub import build_action_space
|
||||
from .configuration_florence2 import Florence2Config
|
||||
from .configuration_xvla import XVLAConfig, XVLAConfig as PreTrainedConfig
|
||||
from .configuration_xvla import XVLAConfig
|
||||
from .modeling_florence2 import Florence2ForConditionalGeneration
|
||||
from .soft_transformer import SoftPromptedTransformer
|
||||
|
||||
@@ -400,7 +402,7 @@ class XVLAPolicy(PreTrainedPolicy):
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
cls: builtins.type[T],
|
||||
pretrained_name_or_path: str | Path,
|
||||
*,
|
||||
config: PreTrainedConfig | None = None,
|
||||
|
||||
@@ -26,6 +26,10 @@ import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from lerobot.policies.xvla.configuration_xvla import XVLAConfig
|
||||
from lerobot.policies.xvla.modeling_xvla import XVLAPolicy
|
||||
from lerobot.policies.xvla.processor_xvla import make_xvla_pre_post_processors
|
||||
|
||||
# Skip if transformers is not available
|
||||
pytest.importorskip("transformers")
|
||||
|
||||
@@ -37,9 +41,6 @@ pytestmark = pytest.mark.skipif(
|
||||
|
||||
from transformers import AutoModel, AutoProcessor # noqa: E402
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig # noqa: E402
|
||||
from lerobot.envs.factory import make_env_config # noqa: E402
|
||||
from lerobot.policies.factory import make_policy, make_pre_post_processors # noqa: E402
|
||||
from lerobot.processor import PolicyAction, PolicyProcessorPipeline # noqa: E402
|
||||
from lerobot.utils.constants import OBS_IMAGES, OBS_STATE # noqa: E402
|
||||
|
||||
@@ -50,8 +51,8 @@ IMAGE_HEIGHT = 224
|
||||
IMAGE_WIDTH = 224
|
||||
NUM_VIEWS = 2 # Number of camera views
|
||||
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
MODEL_PATH_LEROBOT = "lerobot/xvla-base"
|
||||
MODEL_PATH_ORIGINAL = "2toINF/X-VLA-Pt"
|
||||
MODEL_PATH_LEROBOT = "lerobot/xvla-widowx"
|
||||
MODEL_PATH_ORIGINAL = "2toINF/X-VLA-WidowX"
|
||||
LIBERO_DOMAIN_ID = 0 # Domain ID for examples purposes
|
||||
|
||||
|
||||
@@ -93,31 +94,25 @@ def instantiate_lerobot_xvla(
|
||||
]:
|
||||
"""Instantiate LeRobot XVLA policy with preprocessor and postprocessor."""
|
||||
if from_pretrained:
|
||||
cfg = PreTrainedConfig.from_pretrained(model_path)
|
||||
cfg.pretrained_path = model_path
|
||||
else:
|
||||
# For non-pretrained, we'd need to create a config from scratch
|
||||
raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
|
||||
|
||||
cfg.device = DEVICE
|
||||
env_cfg = make_env_config("libero", task="libero_spatial")
|
||||
|
||||
policy = make_policy(
|
||||
cfg=cfg,
|
||||
env_cfg=env_cfg,
|
||||
policy = XVLAPolicy.from_pretrained(
|
||||
pretrained_name_or_path=model_path,
|
||||
strict=False,
|
||||
)
|
||||
else:
|
||||
config = XVLAConfig(
|
||||
base_model_path=model_path,
|
||||
n_action_steps=DUMMY_ACTION_DIM,
|
||||
chunk_size=DUMMY_ACTION_DIM,
|
||||
device=DEVICE,
|
||||
num_image_views=NUM_VIEWS,
|
||||
) # add resize_imgs_with_padding=IMAGE_SIZE, IMAGE_SIZE?
|
||||
policy = XVLAPolicy(config)
|
||||
|
||||
policy.to(DEVICE)
|
||||
policy.eval()
|
||||
|
||||
preprocessor_overrides = {
|
||||
"device_processor": {"device": str(cfg.device)},
|
||||
}
|
||||
|
||||
preprocessor, postprocessor = make_pre_post_processors(
|
||||
policy_cfg=cfg,
|
||||
pretrained_path=cfg.pretrained_path,
|
||||
preprocessor_overrides=preprocessor_overrides,
|
||||
policy.config.device = DEVICE
|
||||
preprocessor, postprocessor = make_xvla_pre_post_processors(
|
||||
config=policy.config,
|
||||
dataset_stats=None, # Pass None for dataset_stats to disable normalization (original XVLA doesn't normalize)
|
||||
)
|
||||
|
||||
return policy, preprocessor, postprocessor
|
||||
@@ -129,15 +124,8 @@ def instantiate_original_xvla(
|
||||
):
|
||||
"""Instantiate original XVLA policy from the original implementation."""
|
||||
if from_pretrained:
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_path,
|
||||
num_views=NUM_VIEWS,
|
||||
trust_remote_code=True
|
||||
)
|
||||
model = AutoModel.from_pretrained(
|
||||
model_path,
|
||||
trust_remote_code=True
|
||||
)
|
||||
processor = AutoProcessor.from_pretrained(model_path, num_views=NUM_VIEWS, trust_remote_code=True)
|
||||
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
|
||||
else:
|
||||
raise NotImplementedError("Non-pretrained XVLA instantiation not implemented yet")
|
||||
|
||||
@@ -149,20 +137,23 @@ def instantiate_original_xvla(
|
||||
|
||||
def create_dummy_data(device=DEVICE):
|
||||
"""Create dummy data for testing both implementations."""
|
||||
batch_size = 2
|
||||
batch_size = 1
|
||||
prompt = "Pick up the red block and place it in the bin"
|
||||
|
||||
# Create random RGB images in [0, 255] uint8 range (as PIL images would be)
|
||||
# Then convert to [0, 1] float32 range for LeRobot
|
||||
def fake_rgb(H, W):
|
||||
arr = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8)
|
||||
def fake_rgb(h, w):
|
||||
arr = np.random.randint(0, 255, (h, w, 3), dtype=np.uint8)
|
||||
t = torch.from_numpy(arr).permute(2, 0, 1) # CHW
|
||||
t = t.float() / 255.0 # Normalize to [0, 1]
|
||||
return t
|
||||
|
||||
batch = {
|
||||
f"{OBS_IMAGES}.image": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device),
|
||||
f"{OBS_IMAGES}.image2": torch.stack([fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]).to(device),
|
||||
f"{OBS_IMAGES}.image": torch.stack(
|
||||
[fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
|
||||
).to(device),
|
||||
f"{OBS_IMAGES}.image2": torch.stack(
|
||||
[fake_rgb(IMAGE_HEIGHT, IMAGE_WIDTH) for _ in range(batch_size)]
|
||||
).to(device),
|
||||
OBS_STATE: torch.randn(batch_size, DUMMY_STATE_DIM, dtype=torch.float32, device=device),
|
||||
"task": [prompt for _ in range(batch_size)],
|
||||
}
|
||||
@@ -172,11 +163,9 @@ def create_dummy_data(device=DEVICE):
|
||||
|
||||
def prepare_original_inputs(batch, processor, device=DEVICE):
|
||||
"""Prepare inputs for the original XVLA model."""
|
||||
batch_size = batch[OBS_STATE].shape[0]
|
||||
|
||||
# Convert images from [0, 1] to [0, 255] uint8 for processor
|
||||
image1 = (batch[f"{OBS_IMAGES}.image"] * 255).byte()
|
||||
image2 = (batch[f"{OBS_IMAGES}.image2"] * 255).byte()
|
||||
image1 = (batch[f"{OBS_IMAGES}.image"]).byte()
|
||||
image2 = (batch[f"{OBS_IMAGES}.image2"]).byte()
|
||||
|
||||
# Get task instruction (use first one if batch)
|
||||
task_instruction = batch["task"][0] if isinstance(batch["task"], list) else batch["task"]
|
||||
@@ -185,19 +174,23 @@ def prepare_original_inputs(batch, processor, device=DEVICE):
|
||||
# The processor expects a list of images per sample
|
||||
processed_inputs = processor(
|
||||
[image1[0], image2[0]], # Process first sample only for now
|
||||
task_instruction
|
||||
task_instruction,
|
||||
)
|
||||
|
||||
# Move to correct device and dtype
|
||||
dtype = torch.float32
|
||||
inputs = {k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device)
|
||||
for k, v in processed_inputs.items()}
|
||||
inputs = {
|
||||
k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device=device)
|
||||
for k, v in processed_inputs.items()
|
||||
}
|
||||
|
||||
# Add proprio and domain_id
|
||||
inputs.update({
|
||||
inputs.update(
|
||||
{
|
||||
"proprio": batch[OBS_STATE][:1].to(device), # First sample only
|
||||
"domain_id": torch.tensor([LIBERO_DOMAIN_ID], dtype=torch.long, device=device),
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
return inputs
|
||||
|
||||
@@ -319,7 +312,9 @@ def test_xvla_original_vs_lerobot_pretrained():
|
||||
|
||||
print(f"Max absolute difference: {max_diff:.6e}")
|
||||
print(f"Mean absolute difference: {mean_diff:.6e}")
|
||||
print(f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%")
|
||||
print(
|
||||
f"Relative difference: {(mean_diff / (torch.abs(original_actions).mean().item() + 1e-8) * 100):.2f}%"
|
||||
)
|
||||
|
||||
# Check with different tolerances
|
||||
tolerances = [1e-5, 1e-4, 1e-3, 1e-2]
|
||||
@@ -379,7 +374,7 @@ def test_xvla_inference_reproducibility():
|
||||
print("✔️ Inference is perfectly reproducible!")
|
||||
else:
|
||||
diff = torch.abs(actions_1 - actions_2)
|
||||
print(f"⚠️ Small differences detected:")
|
||||
print("⚠️ Small differences detected:")
|
||||
print(f" Max diff: {diff.max().item():.6e}")
|
||||
print(f" Mean diff: {diff.mean().item():.6e}")
|
||||
|
||||
@@ -406,4 +401,3 @@ if __name__ == "__main__":
|
||||
print(f"❌ Test failed with error: {e}")
|
||||
print("=" * 80)
|
||||
raise
|
||||
|
||||
|
||||
@@ -1,190 +0,0 @@
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from xvla.models.modeling_xvla import XVLA
|
||||
|
||||
# from lerobot.policies.xvla.configuration_xvla import XVLAConfig
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.envs.factory import make_env_config
|
||||
from lerobot.policies.factory import make_policy, make_pre_post_processors
|
||||
from lerobot.utils.constants import OBS_IMAGES, OBS_STATE
|
||||
|
||||
torch.manual_seed(42)
|
||||
random.seed(42)
|
||||
np.random.seed(42)
|
||||
observation_height: int = 224
|
||||
observation_width: int = 224 # todo: jadechoghari, image size is different for the two models
|
||||
# create an observation dict
|
||||
OBS = {
|
||||
f"{OBS_IMAGES}.image": torch.randn(1, 3, observation_height, observation_width),
|
||||
f"{OBS_IMAGES}.image2": torch.randn(1, 3, observation_height, observation_width),
|
||||
OBS_STATE: torch.randn(1, 20), # ONLY if OBS_STATE is already a string
|
||||
"task": "put the object in the box",
|
||||
}
|
||||
|
||||
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
|
||||
IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
|
||||
|
||||
|
||||
def fake_rgb(H, W):  # noqa: N803
    """Return a random RGB image as a float32 tensor of shape (1, 3, H, W).

    Pixel values are whole numbers drawn uniformly from the full uint8 range
    [0, 255] and are left unscaled: no division by 255 and no ImageNet
    mean/std normalization is applied here.

    Args:
        H: image height in pixels.
        W: image width in pixels.
    """
    # The upper bound of np.random.randint is exclusive, so 256 is required
    # for the brightest value (255) to be reachable.
    arr = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
    chw = torch.from_numpy(arr).permute(2, 0, 1)  # HWC -> CHW
    return chw.unsqueeze(0).float()  # add batch dim; values stay in [0, 255]
|
||||
|
||||
|
||||
OBS[f"{OBS_IMAGES}.image"] = fake_rgb(observation_height, observation_width)
|
||||
OBS[f"{OBS_IMAGES}.image2"] = fake_rgb(observation_height, observation_width)
|
||||
|
||||
cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
|
||||
cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
|
||||
env_cfg = make_env_config("libero", task="libero_spatial")
|
||||
policy = make_policy(
|
||||
cfg=cfg,
|
||||
env_cfg=env_cfg,
|
||||
)
|
||||
|
||||
policy.eval()
|
||||
|
||||
preprocessor_overrides = {
|
||||
"device_processor": {"device": str(cfg.device)},
|
||||
}
|
||||
|
||||
preprocessor, postprocessor = make_pre_post_processors(
|
||||
policy_cfg=cfg,
|
||||
pretrained_path=cfg.pretrained_path,
|
||||
preprocessor_overrides=preprocessor_overrides,
|
||||
)
|
||||
|
||||
observation = preprocessor(OBS)
|
||||
inputs = policy._build_model_inputs(observation)
|
||||
|
||||
|
||||
#### now the og model ###########################################################
|
||||
from xvla.models.processing_xvla import XVLAProcessor
|
||||
|
||||
processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero", num_views=2)
|
||||
inputs_1 = processor([OBS[f"{OBS_IMAGES}.image"], OBS[f"{OBS_IMAGES}.image2"]], OBS["task"])
|
||||
domain_id = torch.tensor([3], dtype=torch.long)
|
||||
inputs.update(
|
||||
{
|
||||
"proprio": OBS[OBS_STATE].to("cuda"),
|
||||
"domain_id": domain_id.to("cuda"),
|
||||
}
|
||||
)
|
||||
|
||||
# check the preprocessor
|
||||
for k in inputs.keys() & inputs_1.keys(): # intersection of keys
|
||||
a = inputs[k]
|
||||
b = inputs_1[k].to("cuda")
|
||||
|
||||
print(f"\n🔎 Key: {k}")
|
||||
|
||||
# Check shape
|
||||
print(" shape:", a.shape, b.shape)
|
||||
|
||||
# Check if close
|
||||
if torch.allclose(a, b, atol=1e-5, rtol=1e-5):
|
||||
print(" ✔️ tensors are equal (allclose)")
|
||||
else:
|
||||
diff = torch.abs(a - b)
|
||||
print(" ❌ tensors differ")
|
||||
print(" max diff:", diff.max().item())
|
||||
print(" mean diff:", diff.mean().item())
|
||||
|
||||
|
||||
model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
|
||||
model.eval()
|
||||
model.to("cuda")
|
||||
|
||||
action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
|
||||
action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
|
||||
|
||||
# np all close
|
||||
print(np.allclose(action, action_1, atol=1e-2, rtol=1e-2))
|
||||
print("max diff:", np.max(np.abs(action - action_1)))
|
||||
print("mean diff:", np.mean(np.abs(action - action_1)))
|
||||
|
||||
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from xvla.models.configuration_xvla import XVLAConfig
|
||||
from xvla.models.modeling_xvla import XVLA
|
||||
from xvla.models.processor_xvla import XVLAProcessor
|
||||
|
||||
from lerobot.configs.policies import PreTrainedConfig
|
||||
from lerobot.envs.factory import make_env_config
|
||||
from lerobot.policies.factory import make_policy
|
||||
|
||||
cfg = XVLAConfig.from_pretrained("/raid/jade/models/xvla-libero")
|
||||
model = XVLA.from_pretrained("/raid/jade/models/xvla-libero")
|
||||
model.eval()
|
||||
model.to("cuda")
|
||||
processor = XVLAProcessor.from_pretrained("/raid/jade/models/xvla-libero")
|
||||
# /raid/jade/models/xvla-libero
|
||||
# set seed
|
||||
torch.manual_seed(42)
|
||||
random.seed(42)
|
||||
np.random.seed(42)
|
||||
|
||||
|
||||
def make_random_pil_images(num_images=3, H=480, W=640):
    """Build `num_images` PIL images of size W x H filled with random RGB noise."""
    shape = (H, W, 3)
    # One uint8 draw per image, each value in [0, 255].
    return [
        Image.fromarray(np.random.randint(0, 256, shape, dtype=np.uint8))
        for _ in range(num_images)
    ]
|
||||
|
||||
|
||||
# Example:
|
||||
images = make_random_pil_images()
|
||||
language_instruction = "This is a random image"
|
||||
# Multimodal preprocessing by processor
|
||||
inputs = processor(images, language_instruction)
|
||||
if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
|
||||
raise ValueError("Processor did not return the expected keys.")
|
||||
|
||||
proprio = torch.randn(1, 20)
|
||||
domain_id = torch.tensor([0], dtype=torch.long)
|
||||
|
||||
# Align to model's device/dtype
|
||||
device = model.device
|
||||
dtype = next(model.parameters()).dtype
|
||||
|
||||
|
||||
def to_model(t: torch.Tensor) -> torch.Tensor:
    """Move `t` to the module-level `device`, casting floating tensors to `dtype`.

    Non-tensor inputs are first converted with `torch.as_tensor`. Integer and
    bool tensors keep their dtype and are only moved to `device`.
    """
    tensor = t if isinstance(t, torch.Tensor) else torch.as_tensor(t)
    if tensor.is_floating_point():
        # Floating tensors are cast to the module-level `dtype` as well.
        return tensor.to(device=device, dtype=dtype)
    return tensor.to(device=device)
|
||||
|
||||
|
||||
inputs = {k: to_model(v) for k, v in inputs.items()}
|
||||
inputs.update(
|
||||
{
|
||||
"proprio": to_model(proprio),
|
||||
"domain_id": domain_id.to(device),
|
||||
}
|
||||
)
|
||||
|
||||
# Inference
|
||||
action = model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
|
||||
|
||||
|
||||
#### now for lerobot model #####################################################
|
||||
|
||||
cfg = PreTrainedConfig.from_pretrained("/raid/jade/models/xvla-libero-og_migrated")
|
||||
env_cfg = make_env_config("libero", task="libero_spatial")
|
||||
cfg.pretrained_path = "/raid/jade/models/xvla-libero-og_migrated"
|
||||
policy = make_policy(cfg=cfg, env_cfg=env_cfg)
|
||||
policy.eval()
|
||||
policy.to("cuda")
|
||||
|
||||
action_1 = policy.model.generate_actions(**inputs, steps=10).squeeze(0).float().cpu().numpy()
|
||||
Reference in New Issue
Block a user