chore(evo1): update uv.lock

refactor(policy): evo1 GPU-batched preprocessing + vectorized attention masking + remove dead code
2026-07-02 15:47:05 +00:00 · 2026-07-02 00:03:30 +02:00 · 2026-07-01 20:05:41 +02:00
5 changed files with 197 additions and 159 deletions
@@ -14,12 +14,10 @@

 from __future__ import annotations

-from collections.abc import Sequence
 from typing import Any

 import torch
 import torch.nn as nn
-from PIL import Image

 from .flow_matching import FlowmatchingActionHead
 from .internvl3_embedder import InternVL3Embedder
@@ -73,22 +71,25 @@ class EVO1(nn.Module):
        self.per_action_dim = per_action_dim
        self.action_head = FlowmatchingActionHead(config=config).to(self._device)

-    def _normalize_image_batches(
+    def get_vl_embeddings(
        self,
-        images: Sequence[Image.Image | torch.Tensor] | Sequence[Sequence[Image.Image | torch.Tensor]],
-        prompt: str | list[str] | None,
+        images: list[torch.Tensor],
        image_mask: torch.Tensor,
-    ) -> tuple[list[list[Image.Image | torch.Tensor]], list[str], torch.Tensor]:
+        prompt: str | list[str] | None = None,
+        return_cls_only: bool | None = None,
+    ) -> torch.Tensor:
+        """Fused VL embeddings from per-camera image batches.
+
+        Args:
+            images: list of per-camera tensors, each shaped ``(B, C, H, W)`` with values in ``[0, 1]``.
+            image_mask: bool tensor ``(B, max_views)`` marking present views.
+        """
+        if return_cls_only is None:
+            return_cls_only = self.return_cls_only
        if not images:
            raise ValueError("EVO1 expects at least one image per sample.")

-        first = images[0]
-        if isinstance(first, (Image.Image, torch.Tensor)):
-            image_batches = [list(images)]  # type: ignore[arg-type]
-        else:
-            image_batches = [list(sample) for sample in images]  # type: ignore[arg-type]
-
-        batch_size = len(image_batches)
+        batch_size = images[0].shape[0]
        if prompt is None:
            prompts = [""] * batch_size
        elif isinstance(prompt, str):
@@ -107,21 +108,8 @@ class EVO1(nn.Module):
                f"image_mask batch size {image_mask.shape[0]} does not match image batch size {batch_size}"
            )

-        return image_batches, prompts, image_mask
-
-    def get_vl_embeddings(
-        self,
-        images: list[Image.Image | torch.Tensor] | list[list[Image.Image | torch.Tensor]],
-        image_mask: torch.Tensor,
-        prompt: str | list[str] | None = None,
-        return_cls_only: bool | None = None,
-    ) -> torch.Tensor:
-        if return_cls_only is None:
-            return_cls_only = self.return_cls_only
-
-        image_batches, prompts, image_mask = self._normalize_image_batches(images, prompt, image_mask)
-        return self.embedder.get_fused_image_text_embedding_from_tensor_images(
-            image_tensors_batch=image_batches,
+        return self.embedder.get_fused_image_text_embedding_batched(
+            camera_images=images,
            image_masks=image_mask,
            text_prompts=prompts,
            return_cls_only=return_cls_only,
@@ -14,7 +14,6 @@

 from __future__ import annotations

-import functools
 import logging
 from collections.abc import Sequence
 from typing import TYPE_CHECKING
@@ -22,8 +21,7 @@ from typing import TYPE_CHECKING
 import torch
 import torch.nn as nn
 import torchvision.transforms.functional as tvf
-from PIL import Image
-from torchvision.transforms.functional import to_pil_image
+from torchvision.transforms.functional import InterpolationMode

 from lerobot.utils.import_utils import _transformers_available, require_package

@@ -42,51 +40,64 @@ IMG_END_TOKEN = "</img>"  # nosec B105
 logger = logging.getLogger(__name__)


-@functools.lru_cache(maxsize=10000)
-def get_target_aspect_ratio(orig_width: int, orig_height: int, image_size: int, min_num: int, max_num: int):
-    aspect_ratio = orig_width / orig_height
-    target_ratios = {
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if i * j <= max_num and i * j >= min_num
-    }
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+def _batched_resize_01(images: torch.Tensor, image_size: int) -> torch.Tensor:
+    """Resize a batch of ``[0, 1]`` images to ``(image_size, image_size)`` on-device.

-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = orig_width * orig_height
-    for ratio in target_ratios:
-        target_ar = ratio[0] / ratio[1]
-        diff = abs(aspect_ratio - target_ar)
-        if diff < best_ratio_diff:
-            best_ratio_diff = diff
-            best_ratio = ratio
-        elif diff == best_ratio_diff and area > 0.5 * image_size**2 * ratio[0] * ratio[1]:
-            best_ratio = ratio
-    return best_ratio
+    Numerically mirrors InternVL3's per-image PIL preprocessing
+    (``to_pil_image`` -> ``Image.resize`` -> ``to_tensor``): the float input is quantized to uint8
+    exactly as ``to_pil_image`` does, then resized with bicubic interpolation and antialiasing,
+    which matches PIL's default resampler. This runs as a single batched op instead of a per-image
+    Python loop with a GPU->CPU->PIL->GPU round-trip.
+
+    Args:
+        images: float tensor of shape ``(N, C, H, W)`` with values in ``[0, 1]``.
+
+    Returns:
+        float32 tensor of shape ``(N, C, image_size, image_size)`` with values in ``[0, 1]``.
+    """
+    # to_pil_image() quantizes float [0, 1] to uint8 (x * 255, truncated); replicate that so the
+    # bicubic resample sees the same integer pixels PIL would.
+    pixels_u8 = (images * 255.0).clamp(0, 255).to(torch.uint8)
+    resized = tvf.resize(
+        pixels_u8, [image_size, image_size], interpolation=InterpolationMode.BICUBIC, antialias=True
+    )
+    return resized.to(torch.float32) / 255.0


-def dynamic_preprocess(image, min_num=1, max_num=1, image_size=448, use_thumbnail=False):
-    orig_width, orig_height = image.size
-    ratio_w, ratio_h = get_target_aspect_ratio(orig_width, orig_height, image_size, min_num, max_num)
-    target_width = image_size * ratio_w
-    target_height = image_size * ratio_h
-    blocks = ratio_w * ratio_h
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        processed_images.append(resized_img.crop(box))
-    if use_thumbnail and len(processed_images) != 1:
-        processed_images.append(image.resize((image_size, image_size)))
-    return processed_images
+def _batched_pixel_values(
+    camera_images: Sequence[torch.Tensor],
+    max_views: int,
+    image_size: int,
+    mean: torch.Tensor,
+    std: torch.Tensor,
+    dtype: torch.dtype,
+    device: torch.device | str,
+) -> torch.Tensor:
+    """Build InternVL3 ``pixel_values`` from per-camera ``[0, 1]`` image batches without leaving the device.
+
+    Equivalent to running the old per-sample/per-image PIL path (resize -> to_tensor -> ImageNet
+    normalize, a single tile per image) but batched across the whole minibatch. Absent views (fewer
+    cameras than ``max_views``) are zero-padded to reproduce the previous ``torch.zeros_like``
+    padding; those views are masked out downstream via the attention mask.
+
+    Returns:
+        ``pixel_values`` of shape ``(B * max_views, C, image_size, image_size)``, ordered row-major
+        over ``(sample, view)`` to match the old preprocessing.
+    """
+    resized: list[torch.Tensor] = []
+    for image in camera_images:
+        resized.append(_batched_resize_01(image.to(device=device), image_size).to(dtype))
+
+    batch_size = resized[0].shape[0]
+    channels = resized[0].shape[1]
+    while len(resized) < max_views:
+        resized.append(torch.zeros(batch_size, channels, image_size, image_size, dtype=dtype, device=device))
+
+    stacked = torch.stack(resized[:max_views], dim=1)  # (B, V, C, H, W)
+    mean = mean.to(device=device, dtype=dtype).view(1, 1, -1, 1, 1)
+    std = std.to(device=device, dtype=dtype).view(1, 1, -1, 1, 1)
+    normalized = (stacked - mean) / std
+    return normalized.reshape(batch_size * max_views, channels, image_size, image_size)


 class InternVL3Embedder(nn.Module):
@@ -191,42 +202,6 @@ class InternVL3Embedder(nn.Module):
                "Requested gradient checkpointing, but model does not expose checkpointing controls."
            )

-    def _preprocess_single_image(self, image: Image.Image | torch.Tensor) -> torch.Tensor:
-        if isinstance(image, torch.Tensor):
-            pil_image = to_pil_image(image.detach().cpu())
-        else:
-            pil_image = image.convert("RGB")
-        tiles = dynamic_preprocess(pil_image, image_size=self.image_size)
-        tile_tensors = torch.stack([tvf.to_tensor(tile) for tile in tiles]).to(
-            device=self.device, dtype=torch.bfloat16
-        )
-        mean = torch.tensor(IMAGENET_MEAN, device=self.device, dtype=torch.bfloat16).view(1, 3, 1, 1)
-        std = torch.tensor(IMAGENET_STD, device=self.device, dtype=torch.bfloat16).view(1, 3, 1, 1)
-        return (tile_tensors - mean) / std
-
-    def _preprocess_images(
-        self,
-        image_tensors_batch: Sequence[Sequence[Image.Image | torch.Tensor]],
-    ) -> tuple[torch.Tensor, list[list[int]]]:
-        pixel_values_list = []
-        batch_num_tiles_list: list[list[int]] = []
-
-        for image_tensors in image_tensors_batch:
-            num_tiles_list: list[int] = []
-            for image in image_tensors:
-                tiles = self._preprocess_single_image(image)
-                pixel_values_list.append(tiles)
-                num_tiles_list.append(int(tiles.shape[0]))
-            batch_num_tiles_list.append(num_tiles_list)
-
-        if pixel_values_list:
-            pixel_values = torch.cat(pixel_values_list, dim=0)
-        else:
-            pixel_values = torch.empty(
-                0, 3, self.image_size, self.image_size, dtype=torch.bfloat16, device=self.device
-            )
-        return pixel_values, batch_num_tiles_list
-
    def _build_multimodal_prompts(
        self,
        batch_num_tiles_list: list[list[int]],
@@ -242,14 +217,70 @@ class InternVL3Embedder(nn.Module):
            prompts.append("".join(prompt_segments) + text_prompt.strip())
        return prompts

-    def get_fused_image_text_embedding_from_tensor_images(
+    def get_fused_image_text_embedding_batched(
        self,
-        image_tensors_batch: Sequence[Sequence[Image.Image | torch.Tensor]],
+        camera_images: Sequence[torch.Tensor],
        image_masks: torch.Tensor,
        text_prompts: Sequence[str],
        return_cls_only: bool = True,
    ):
-        pixel_values, batch_num_tiles_list = self._preprocess_images(image_tensors_batch)
+        """Fused VL embedding from per-camera ``[0, 1]`` image batches (no PIL, no host round-trip).
+
+        Args:
+            camera_images: list of per-camera tensors, each shaped ``(B, C, H, W)`` in ``[0, 1]``.
+            image_masks: bool tensor ``(B, max_views)`` marking present views.
+        """
+        max_views = int(image_masks.shape[1])
+        batch_size = int(image_masks.shape[0])
+        mean = torch.tensor(IMAGENET_MEAN, device=self.device, dtype=torch.bfloat16)
+        std = torch.tensor(IMAGENET_STD, device=self.device, dtype=torch.bfloat16)
+        pixel_values = _batched_pixel_values(
+            camera_images, max_views, self.image_size, mean, std, torch.bfloat16, self.device
+        )
+        # InternVL3 preprocessing uses a single tile per image (max_num=1).
+        batch_num_tiles_list = [[1] * max_views for _ in range(batch_size)]
+        return self._forward_vlm(
+            pixel_values, batch_num_tiles_list, image_masks, text_prompts, return_cls_only
+        )
+
+    def _mask_absent_image_tokens(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        image_masks: torch.Tensor,
+        batch_num_tiles_list: list[list[int]],
+    ) -> torch.Tensor:
+        """Zero attention over the image-context tokens of absent views, fully vectorized.
+
+        Replaces the previous per-sample/per-image Python loop, which called ``.item()`` once per
+        image and forced a device->host sync each time. This version uses only tensor ops on
+        ``input_ids.device``.
+
+        Args:
+            input_ids: token ids, compared against ``self.img_context_token_id`` to locate
+                image-context tokens (shape ``(B, L)`` per the inline comment below).
+            attention_mask: mask to filter; it is not modified in place (``masked_fill`` returns a
+                new tensor).
+            image_masks: per-sample, per-view presence flags; cast to bool and moved to
+                ``input_ids.device``.
+            batch_num_tiles_list: tiles per image for each sample. Only the first entry
+                (``[0][0]``) is read; every image is assumed to use that same tile count.
+
+        Returns:
+            A copy of ``attention_mask`` with zeros at the image-context tokens of absent views.
+        """
+        # A single tile per image (max_num=1), so every image occupies the same number of
+        # context tokens.
+        tiles_per_image = (
+            batch_num_tiles_list[0][0] if batch_num_tiles_list and batch_num_tiles_list[0] else 1
+        )
+        tokens_per_image = self.num_image_token * tiles_per_image
+
+        image_masks = image_masks.to(device=input_ids.device).bool()
+        img_token_mask = input_ids == self.img_context_token_id  # (B, L)
+        # keep[b, k] tells whether the k-th image-context token (ordered view0, view1, ...) survives.
+        per_token_keep = image_masks.repeat_interleave(tokens_per_image, dim=1)  # (B, V * tokens_per_image)
+        # Rank each context token by its running position among the row's context tokens.
+        ctx_index = img_token_mask.to(torch.long).cumsum(dim=1) - 1
+        # Positions before the first context token have rank -1 and are clamped to 0; their gathered
+        # value is irrelevant because ``drop`` is ANDed with ``img_token_mask`` below. Ranks past
+        # the last slot are clamped to it, so surplus context tokens inherit the last view's flag.
+        ctx_index = ctx_index.clamp(min=0, max=per_token_keep.shape[1] - 1)
+        keep_here = torch.gather(per_token_keep, 1, ctx_index)  # (B, L)
+        # Drop only real image-context tokens whose view is marked absent.
+        drop = img_token_mask & ~keep_here
+        return attention_mask.masked_fill(drop, 0)
+
+    def _forward_vlm(
+        self,
+        pixel_values: torch.Tensor,
+        batch_num_tiles_list: list[list[int]],
+        image_masks: torch.Tensor,
+        text_prompts: Sequence[str],
+        return_cls_only: bool,
+    ):
        if pixel_values.shape[0] == 0:
            logger.warning("InternVL3 received an empty image batch after preprocessing.")
            hidden_size = getattr(self.model.config, "hidden_size", None)
@@ -257,8 +288,7 @@ class InternVL3Embedder(nn.Module):
                hidden_size = getattr(self.model.config.text_config, "hidden_size", None)
            if hidden_size is None:
                raise RuntimeError("Unable to infer hidden size for empty InternVL3 batch.")
-            empty = torch.empty(0, hidden_size, device=self.device, dtype=torch.float32)
-            return empty
+            return torch.empty(0, hidden_size, device=self.device, dtype=torch.float32)

        prompts = self._build_multimodal_prompts(batch_num_tiles_list, text_prompts)

@@ -270,23 +300,9 @@ class InternVL3Embedder(nn.Module):
            max_length=self.max_text_length,
        ).to(self.device)
        input_ids = model_inputs["input_ids"]
-        attention_mask = model_inputs["attention_mask"]
-
-        # Zero out attention for absent images
-        img_token_mask = input_ids == self.img_context_token_id
-        tokens_per_tile = self.num_image_token
-        for batch_index in range(input_ids.shape[0]):
-            current_token_idx = 0
-            img_token_locations = torch.where(img_token_mask[batch_index])[0]
-            for image_index, num_tiles in enumerate(batch_num_tiles_list[batch_index]):
-                num_tokens_for_image = num_tiles * tokens_per_tile
-                if not bool(image_masks[batch_index, image_index].item()):
-                    start_offset = current_token_idx
-                    end_offset = min(current_token_idx + num_tokens_for_image, len(img_token_locations))
-                    if start_offset < end_offset:
-                        idxs = img_token_locations[start_offset:end_offset]
-                        attention_mask[batch_index, idxs] = 0
-                current_token_idx += num_tokens_for_image
+        attention_mask = self._mask_absent_image_tokens(
+            input_ids, model_inputs["attention_mask"], image_masks, batch_num_tiles_list
+        )

        outputs = self.model(
            input_ids=input_ids,
@@ -318,17 +318,20 @@ class EVO1Policy(PreTrainedPolicy):
        self._keep_frozen_embedder_eval()
        return self

-    def _collect_image_batches(self, batch: dict[str, Tensor]) -> tuple[list[list[Tensor]], Tensor]:
+    def _collect_image_batches(self, batch: dict[str, Tensor]) -> tuple[list[Tensor], Tensor]:
        camera_keys = self._camera_keys or sorted(key for key in batch if key.startswith(f"{OBS_IMAGES}."))
        if not camera_keys:
            raise ValueError("EVO1 requires at least one visual observation feature.")
+        camera_keys = list(camera_keys)[: self.config.max_views]

-        # Normalize each camera tensor to (B, C, H, W) up-front so that batch_size is read
-        # from a real batch dim and not from C in the unbatched (C, H, W) case.
-        normalized: dict[str, Tensor] = {}
-        for camera_key in camera_keys[: self.config.max_views]:
+        # Keep each present camera as a batched (B, C, H, W) tensor on its current (GPU) device.
+        # Resizing/normalization and zero-padding of absent views happen batched inside the
+        # embedder, so images never leave the device here (no per-sample .cpu() round-trip).
+        camera_images: list[Tensor] = []
+        for camera_key in camera_keys:
            image = batch[camera_key]
            if image.dim() == 3:
+                # Promote an unbatched (C, H, W) frame so batch_size is read from a real batch dim.
                image = image.unsqueeze(0)
            elif image.dim() == 5:
                image = image[:, -1]
@@ -336,24 +339,16 @@ class EVO1Policy(PreTrainedPolicy):
                raise ValueError(
                    f"Unsupported image tensor shape for EVO1: key={camera_key} shape={tuple(image.shape)}"
                )
-            normalized[camera_key] = image
+            camera_images.append(image)

-        batch_size = normalized[camera_keys[0]].shape[0]
-        image_batches: list[list[Tensor]] = []
-        image_masks = torch.zeros(batch_size, self.config.max_views, dtype=torch.bool)
+        batch_size = camera_images[0].shape[0]
+        n_present = len(camera_images)
+        image_masks = torch.zeros(
+            batch_size, self.config.max_views, dtype=torch.bool, device=camera_images[0].device
+        )
+        image_masks[:, :n_present] = True

-        for batch_index in range(batch_size):
-            sample_images: list[Tensor] = []
-            for camera_key in camera_keys[: self.config.max_views]:
-                sample_images.append(normalized[camera_key][batch_index].detach().cpu())
-            if not sample_images:
-                raise ValueError("EVO1 received a batch without any image tensor.")
-            while len(sample_images) < self.config.max_views:
-                sample_images.append(torch.zeros_like(sample_images[0]))
-            image_batches.append(sample_images[: self.config.max_views])
-            image_masks[batch_index, : min(len(camera_keys), self.config.max_views)] = True
-
-        return image_batches, image_masks
+        return camera_images, image_masks

    def _compute_fused_tokens(
        self,
@@ -24,6 +24,11 @@ import lerobot.policies.evo1.modeling_evo1 as modeling_evo1
 from lerobot.configs.types import FeatureType, PolicyFeature
 from lerobot.policies.evo1.configuration_evo1 import Evo1Config
 from lerobot.policies.evo1.flow_matching import FlowmatchingActionHead
+from lerobot.policies.evo1.internvl3_embedder import (
+    IMAGENET_MEAN,
+    IMAGENET_STD,
+    _batched_pixel_values,
+)
 from lerobot.policies.evo1.processor_evo1 import (
    Evo1ActionProcessorStep,
    Evo1PadActionProcessorStep,
@@ -60,7 +65,9 @@ class DummyEVO1(nn.Module):
        self.get_vl_embeddings_calls += 1
        self.grad_enabled_calls.append(torch.is_grad_enabled())
        self.embedder_training_calls.append(self.embedder.training)
-        return torch.ones(len(images), 4, EMBED_DIM, requires_grad=torch.is_grad_enabled())
+        # images is a list of per-camera (B, C, H, W) tensors, so the batch dim is images[0].shape[0].
+        batch_size = images[0].shape[0]
+        return torch.ones(batch_size, 4, EMBED_DIM, requires_grad=torch.is_grad_enabled())

    def forward(
        self,
@@ -397,10 +404,12 @@ def test_collect_image_batches_handles_unbatched_chw(monkeypatch):
        f"{OBS_IMAGES}.front": torch.rand(3, 16, 16),
    }

-    image_batches, image_masks = policy._collect_image_batches(batch)
+    camera_images, image_masks = policy._collect_image_batches(batch)

-    assert len(image_batches) == 1
-    assert len(image_batches[0]) == policy.config.max_views
+    # One present camera, returned as a batched (B, C, H, W) tensor with the unbatched CHW frame
+    # promoted to batch_size=1 (not read as batch_size=C).
+    assert len(camera_images) == 1
+    assert camera_images[0].shape == (1, 3, 16, 16)
    assert image_masks.tolist() == [[True, False]]


@@ -447,3 +456,28 @@ def test_flowmatching_dict_config_enables_state_encoder_for_horizon_one():

    assert pred_velocity.shape == (2, ACTION_DIM)
    assert noise.shape == (2, 1, ACTION_DIM)
+
+
+def test_evo1_batched_pixel_values_shape_and_zero_padding():
+    torch.manual_seed(0)
+    batch_size, image_size, max_views = 2, 448, 3
+    camera_images = [torch.rand(batch_size, 3, 40, 50)]  # a single present camera
+    mean = torch.tensor(IMAGENET_MEAN)
+    std = torch.tensor(IMAGENET_STD)
+
+    pixel_values = _batched_pixel_values(
+        camera_images, max_views, image_size, mean, std, torch.float32, torch.device("cpu")
+    )
+
+    assert pixel_values.shape == (batch_size * max_views, 3, image_size, image_size)
+    grouped = pixel_values.reshape(batch_size, max_views, 3, image_size, image_size)
+    # Absent views (indices 1, 2) are zero images normalized to -mean/std, matching the old padding.
+    expected_pad = (-mean / std).view(1, 3, 1, 1)
+    for view in (1, 2):
+        assert torch.allclose(
+            grouped[:, view], expected_pad.expand(batch_size, 3, image_size, image_size), atol=1e-5
+        )
+    # The present view is genuinely different from the constant pad value.
+    assert not torch.allclose(
+        grouped[:, 0], expected_pad.expand(batch_size, 3, image_size, image_size), atol=1e-3
+    )
@@ -2988,6 +2988,9 @@ test = [
    { name = "pytest-cov" },
    { name = "pytest-timeout" },
 ]
+timm-dep = [
+    { name = "timm" },
+]
 training = [
    { name = "accelerate" },
    { name = "av" },
@@ -3143,6 +3146,8 @@ requires-dist = [
    { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'wallx'" },
    { name = "lerobot", extras = ["smolvla"], marker = "extra == 'all'" },
    { name = "lerobot", extras = ["test"], marker = "extra == 'all'" },
+    { name = "lerobot", extras = ["timm-dep"], marker = "extra == 'evo1'" },
+    { name = "lerobot", extras = ["timm-dep"], marker = "extra == 'groot'" },
    { name = "lerobot", extras = ["training"], marker = "extra == 'all'" },
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'eo1'" },
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'evo1'" },
@@ -3206,8 +3211,7 @@ requires-dist = [
    { name = "setuptools", specifier = ">=71.0.0,<81.0.0" },
    { name = "teleop", marker = "extra == 'phone'", specifier = ">=0.1.0,<0.2.0" },
    { name = "termcolor", specifier = ">=2.4.0,<4.0.0" },
-    { name = "timm", marker = "extra == 'evo1'", specifier = ">=1.0.0,<1.1.0" },
-    { name = "timm", marker = "extra == 'groot'", specifier = ">=1.0.0,<1.1.0" },
+    { name = "timm", marker = "extra == 'timm-dep'", specifier = ">=1.0.0,<1.1.0" },
    { name = "torch", marker = "sys_platform != 'linux'", specifier = ">=2.7,<2.12.0" },
    { name = "torch", marker = "sys_platform == 'linux'", specifier = ">=2.7,<2.12.0", index = "https://download.pytorch.org/whl/cu128" },
    { name = "torchcodec", marker = "(platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l' and sys_platform == 'linux' and extra == 'dataset') or (platform_machine != 'x86_64' and sys_platform == 'darwin' and extra == 'dataset') or (sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32' and extra == 'dataset')", specifier = ">=0.3.0,<0.12.0" },
@@ -3218,7 +3222,7 @@ requires-dist = [
    { name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
    { name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
 ]
-provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "xvla", "eo1", "evo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
+provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "timm-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "xvla", "eo1", "evo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]

 [[package]]
 name = "librt"
@@ -4261,6 +4265,7 @@ dependencies = [
    { name = "protobuf" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/b1/d111b1df656761f980d9e298a60039a9cb66036b1d039e777537743d0ac3/onnxruntime-1.26.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:05b028781b322ad74b57ce5b50aa5280bb1fe96ceec334628ade681e0b24c1ac", size = 18016624, upload-time = "2026-05-12T00:41:01.735Z" },
    { url = "https://files.pythonhosted.org/packages/f6/a0/3f9d896a0385a36bd04345d6d0b802821a5782adde562e7e135f6bb71c73/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91f2bb870a4b9224eba0a6728c1fa7a9e552b8e59e1083c51fbbc3d013f2b5c0", size = 16052692, upload-time = "2026-05-08T19:07:13.829Z" },
    { url = "https://files.pythonhosted.org/packages/7c/43/2a4e04f8dbeffad19bbcced4bcd4289bf478921518437404d6b92bdf213b/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b6dd70599005bd1bf29779f04a91978b92b5e719c11a20068a8f8e535f725b6", size = 18185439, upload-time = "2026-05-08T19:07:36.299Z" },
    { url = "https://files.pythonhosted.org/packages/44/fc/026d0a7162b9c2153dac292baea9e027c42304dc1d9dc6f8ff5b4cfbaedd/onnxruntime-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:a26374dc7fbcaae593601086b242120e13f2310558df0991da6dd8b8fac00414", size = 13026427, upload-time = "2026-05-08T19:08:03.503Z" },
Author	SHA1	Message	Date
Steven Palma	fa813e41d1	chore(evo1): update uv.lock	2026-07-02 00:03:30 +02:00
Steven Palma	33391821d5	refactor(policy): evo1 GPU-batched preprocessing + vectorized attention masking + remove dead code	2026-07-01 20:05:41 +02:00