From 2afe2864e9caae60bf204b9d77dd68634126bf9d Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Thu, 2 Jul 2026 11:51:27 +0200 Subject: [PATCH] refactor(policies): use config for evo1 + local imports --- pyproject.toml | 2 +- .../policies/evo1/configuration_evo1.py | 3 + src/lerobot/policies/evo1/evo1_model.py | 84 +++++++++---------- src/lerobot/policies/evo1/flow_matching.py | 50 ++--------- .../policies/evo1/internvl3_embedder.py | 3 +- src/lerobot/policies/evo1/modeling_evo1.py | 44 +++------- src/lerobot/policies/evo1/processor_evo1.py | 3 +- tests/policies/evo1/test_evo1.py | 52 +++++++----- uv.lock | 3 - 9 files changed, 99 insertions(+), 145 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bdd1e9bdd..8b0fa4af2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -235,7 +235,7 @@ fastwam = [ "lerobot[transformers-dep]", "lerobot[diffusers-dep]", ] -evo1 = ["lerobot[transformers-dep]", "lerobot[timm-dep]"] +evo1 = ["lerobot[transformers-dep]"] hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.14,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] vla_jepa = ["lerobot[transformers-dep]", "lerobot[diffusers-dep]", "lerobot[qwen-vl-utils-dep]"] diff --git a/src/lerobot/policies/evo1/configuration_evo1.py b/src/lerobot/policies/evo1/configuration_evo1.py index a9f6ffe38..66c086fd0 100644 --- a/src/lerobot/policies/evo1/configuration_evo1.py +++ b/src/lerobot/policies/evo1/configuration_evo1.py @@ -80,6 +80,9 @@ class Evo1Config(PreTrainedConfig): vlm_model_name: str = "OpenGVLab/InternVL3-1B-hf" vlm_num_layers: int | None = 14 vlm_dtype: str = "bfloat16" + # Max token length for tokenizing the (image placeholders + instruction) prompt. Prompts longer + # than this are right-truncated, so raise it for tasks with long language instructions or many views. + max_text_length: int = 1024 use_flash_attn: bool = True action_head: str = "flowmatching" embed_dim: int = 896 diff --git a/src/lerobot/policies/evo1/evo1_model.py b/src/lerobot/policies/evo1/evo1_model.py index c4f75af86..64c1d4fa2 100644 --- a/src/lerobot/policies/evo1/evo1_model.py +++ b/src/lerobot/policies/evo1/evo1_model.py @@ -14,62 +14,64 @@ from __future__ import annotations -from typing import Any - import torch import torch.nn as nn +from .configuration_evo1 import Evo1Config from .flow_matching import FlowmatchingActionHead from .internvl3_embedder import InternVL3Embedder -def _cfgget(config: Any, key: str, default=None): - if isinstance(config, dict): - return config.get(key, default) - return getattr(config, key, default) - - class EVO1(nn.Module): - def __init__(self, config: dict): + def __init__(self, config: Evo1Config): super().__init__() self.config = config - self._device = _cfgget(config, "device", "cuda") - self.return_cls_only = _cfgget(config, "return_cls_only", False) - vlm_name = _cfgget(config, "vlm_name", "OpenGVLab/InternVL3-1B") - image_size = _cfgget(config, "image_size", 448) - if image_size is None: - image_resolution = _cfgget(config, "image_resolution", (448, 448)) - image_size = int(image_resolution[0]) + self._device = config.device + self.return_cls_only = config.return_cls_only + + # Gradient checkpointing only pays off when the VLM is actually being trained; keep it off + # whenever every VLM branch is frozen so the frozen forward stays cheap. + tracks_vlm_gradients = bool( + config.finetune_vlm or config.finetune_language_model or config.finetune_vision_model + ) + enable_gradient_checkpointing = config.enable_gradient_checkpointing and tracks_vlm_gradients self.embedder = InternVL3Embedder( - model_name=vlm_name, - image_size=image_size, + model_name=config.vlm_model_name, + image_size=int(config.image_resolution[0]), device=self._device, - num_language_layers=_cfgget(config, "vlm_num_layers", 14), - model_dtype=_cfgget(config, "vlm_dtype", "bfloat16"), - use_flash_attn=_cfgget(config, "use_flash_attn", True), - enable_gradient_checkpointing=_cfgget(config, "enable_gradient_checkpointing", True), - gradient_checkpointing_use_reentrant=_cfgget( - config, "gradient_checkpointing_use_reentrant", False - ), + num_language_layers=config.vlm_num_layers, + model_dtype=config.vlm_dtype, + use_flash_attn=config.use_flash_attn, + max_text_length=config.max_text_length, + enable_gradient_checkpointing=enable_gradient_checkpointing, + gradient_checkpointing_use_reentrant=config.gradient_checkpointing_use_reentrant, ) - action_head_type = _cfgget(config, "action_head", "flowmatching").lower() + action_head_type = config.action_head.lower() if action_head_type != "flowmatching": raise NotImplementedError(f"Unknown action_head: {action_head_type}") - horizon = _cfgget(config, "action_horizon", _cfgget(config, "horizon", 16)) - per_action_dim = _cfgget(config, "per_action_dim", 7) + horizon = config.chunk_size + per_action_dim = config.max_action_dim action_dim = horizon * per_action_dim - if isinstance(config, dict): - config["horizon"] = horizon - config["per_action_dim"] = per_action_dim - config["action_dim"] = action_dim - self.horizon = horizon self.per_action_dim = per_action_dim - self.action_head = FlowmatchingActionHead(config=config).to(self._device) + self.action_head = FlowmatchingActionHead( + embed_dim=config.embed_dim, + hidden_dim=config.hidden_dim, + action_dim=action_dim, + horizon=horizon, + per_action_dim=per_action_dim, + num_heads=config.num_heads, + num_layers=config.num_layers, + dropout=config.dropout, + num_inference_timesteps=config.num_inference_timesteps, + num_categories=config.num_categories, + state_dim=config.max_state_dim, + state_hidden_dim=config.state_hidden_dim, + ).to(self._device) def get_vl_embeddings( self, @@ -166,15 +168,13 @@ class EVO1(nn.Module): param.requires_grad = trainable def set_finetune_flags(self): - finetune_vlm = _cfgget(self.config, "finetune_vlm", False) - finetune_language_model = _cfgget(self.config, "finetune_language_model", False) - finetune_vision_model = _cfgget(self.config, "finetune_vision_model", False) + finetune_vlm = bool(self.config.finetune_vlm) + finetune_language_model = bool(self.config.finetune_language_model) + finetune_vision_model = bool(self.config.finetune_vision_model) has_explicit_branch_flags = any( - flag is not None for flag in (finetune_language_model, finetune_vision_model) + flag is not None + for flag in (self.config.finetune_language_model, self.config.finetune_vision_model) ) - finetune_language_model = bool(finetune_language_model) - finetune_vision_model = bool(finetune_vision_model) - finetune_vlm = bool(finetune_vlm) if has_explicit_branch_flags: self._set_module_trainable(self.embedder, False) @@ -187,5 +187,5 @@ class EVO1(nn.Module): elif not finetune_vlm: self._set_module_trainable(self.embedder, False) - if not _cfgget(self.config, "finetune_action_head", False): + if not self.config.finetune_action_head: self._set_module_trainable(self.action_head, False) diff --git a/src/lerobot/policies/evo1/flow_matching.py b/src/lerobot/policies/evo1/flow_matching.py index 986ea7a72..016d43163 100644 --- a/src/lerobot/policies/evo1/flow_matching.py +++ b/src/lerobot/policies/evo1/flow_matching.py @@ -16,7 +16,6 @@ from __future__ import annotations import logging import math -from types import SimpleNamespace import torch import torch.nn as nn @@ -24,12 +23,6 @@ import torch.nn as nn logger = logging.getLogger(__name__) -def _cfgget(config, key: str, default=None): - if isinstance(config, dict): - return config.get(key, default) - return getattr(config, key, default) - - class SinusoidalPositionalEncoding(nn.Module): def __init__(self, dim: int, max_len: int = 1000): super().__init__() @@ -171,7 +164,6 @@ class BasicTransformerBlock(nn.Module): class FlowmatchingActionHead(nn.Module): def __init__( self, - config=None, embed_dim: int = 896, hidden_dim: int = 1024, action_dim: int = 16 * 7, @@ -182,40 +174,17 @@ class FlowmatchingActionHead(nn.Module): dropout: float = 0.0, num_inference_timesteps: int = 20, num_categories: int = 1, + state_dim: int | None = None, + state_hidden_dim: int | None = None, ): super().__init__() - if config is not None: - embed_dim = _cfgget(config, "embed_dim", embed_dim) - hidden_dim = _cfgget(config, "hidden_dim", hidden_dim) - action_dim = _cfgget(config, "action_dim", action_dim) - horizon = _cfgget(config, "horizon", horizon) - per_action_dim = _cfgget(config, "per_action_dim", per_action_dim) - num_heads = _cfgget(config, "num_heads", num_heads) - num_layers = _cfgget(config, "num_layers", num_layers) - dropout = _cfgget(config, "dropout", dropout) - num_inference_timesteps = _cfgget(config, "num_inference_timesteps", num_inference_timesteps) - num_categories = _cfgget(config, "num_categories", num_categories) - self.config = config - else: - self.config = SimpleNamespace( - embed_dim=embed_dim, - hidden_dim=hidden_dim, - action_dim=action_dim, - horizon=horizon, - per_action_dim=per_action_dim, - num_heads=num_heads, - num_layers=num_layers, - dropout=dropout, - num_inference_timesteps=num_inference_timesteps, - num_categories=num_categories, - ) - logger.info("FlowmatchingActionHead num_inference_timesteps=%s", num_inference_timesteps) self.embed_dim = embed_dim self.horizon = horizon - self.per_action_dim = _cfgget(self.config, "per_action_dim", per_action_dim) - self.action_dim = _cfgget(self.config, "action_dim", action_dim) + self.per_action_dim = per_action_dim + self.action_dim = action_dim + self.num_inference_timesteps = num_inference_timesteps self.time_pos_enc = SinusoidalPositionalEncoding(embed_dim, max_len=1000) self.transformer_blocks = nn.ModuleList( @@ -239,9 +208,8 @@ class FlowmatchingActionHead(nn.Module): ) self.state_encoder = None - state_dim = _cfgget(self.config, "state_dim") if state_dim is not None: - state_hidden = _cfgget(self.config, "state_hidden_dim", embed_dim) + state_hidden = state_hidden_dim if state_hidden_dim is not None else embed_dim self.state_encoder = CategorySpecificMLP( input_dim=state_dim, hidden_dim=state_hidden, @@ -390,8 +358,8 @@ class FlowmatchingActionHead(nn.Module): state_emb = self.state_encoder(state, embodiment_id).unsqueeze(1) context_tokens = torch.cat([context_tokens, state_emb], dim=1) - action_dim_total = _cfgget(self.config, "action_dim", self.action_dim) - per_action_dim = _cfgget(self.config, "per_action_dim", action_dim_total // max(self.horizon, 1)) + action_dim_total = self.action_dim + per_action_dim = self.per_action_dim action = torch.rand(batch_size, action_dim_total, device=device, dtype=context_tokens.dtype) * 2 - 1 action_seq = ( @@ -411,7 +379,7 @@ class FlowmatchingActionHead(nn.Module): target_dtype = self.dtype context_tokens = context_tokens.to(dtype=target_dtype) - num_steps = int(_cfgget(self.config, "num_inference_timesteps", 32)) + num_steps = int(self.num_inference_timesteps) if num_steps <= 0: raise ValueError(f"num_inference_timesteps must be positive, got {num_steps}") dt = 1.0 / num_steps diff --git a/src/lerobot/policies/evo1/internvl3_embedder.py b/src/lerobot/policies/evo1/internvl3_embedder.py index 6040cf759..faed1bf46 100644 --- a/src/lerobot/policies/evo1/internvl3_embedder.py +++ b/src/lerobot/policies/evo1/internvl3_embedder.py @@ -111,6 +111,7 @@ class InternVL3Embedder(nn.Module): num_language_layers: int | None = 14, model_dtype: str | torch.dtype = "bfloat16", use_flash_attn: bool = True, + max_text_length: int = 1024, enable_gradient_checkpointing: bool = True, gradient_checkpointing_use_reentrant: bool = False, ): @@ -118,7 +119,7 @@ class InternVL3Embedder(nn.Module): self._requested_device = device self.image_size = image_size self.num_language_layers = num_language_layers - self.max_text_length = 1024 + self.max_text_length = max_text_length self.enable_gradient_checkpointing = bool(enable_gradient_checkpointing) self.gradient_checkpointing_use_reentrant = bool(gradient_checkpointing_use_reentrant) diff --git a/src/lerobot/policies/evo1/modeling_evo1.py b/src/lerobot/policies/evo1/modeling_evo1.py index ade94bc53..541ca877a 100644 --- a/src/lerobot/policies/evo1/modeling_evo1.py +++ b/src/lerobot/policies/evo1/modeling_evo1.py @@ -23,11 +23,12 @@ import torch from torch import Tensor from lerobot.configs.policies import PreTrainedConfig -from lerobot.policies.evo1.configuration_evo1 import Evo1Config -from lerobot.policies.evo1.evo1_model import EVO1 from lerobot.policies.pretrained import PreTrainedPolicy, T from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE +from .configuration_evo1 import Evo1Config +from .evo1_model import EVO1 + class EVO1Policy(PreTrainedPolicy): config_class = Evo1Config @@ -43,7 +44,7 @@ class EVO1Policy(PreTrainedPolicy): ) self.config = config - self.model = EVO1(self._build_model_config(config)) + self.model = EVO1(config) self.model.set_finetune_flags() self._keep_frozen_embedder_eval() self.reset() @@ -80,37 +81,6 @@ class EVO1Policy(PreTrainedPolicy): **kwargs, ) - @staticmethod - def _build_model_config(config: Evo1Config) -> dict: - return { - "device": config.device, - "return_cls_only": config.return_cls_only, - "vlm_name": config.vlm_model_name, - "image_size": int(config.image_resolution[0]), - "vlm_num_layers": config.vlm_num_layers, - "vlm_dtype": config.vlm_dtype, - "use_flash_attn": config.use_flash_attn, - "action_head": config.action_head, - "action_horizon": config.chunk_size, - "per_action_dim": config.max_action_dim, - "state_dim": config.max_state_dim, - "embed_dim": config.embed_dim, - "hidden_dim": config.hidden_dim, - "state_hidden_dim": config.state_hidden_dim, - "num_heads": config.num_heads, - "num_layers": config.num_layers, - "dropout": config.dropout, - "num_inference_timesteps": config.num_inference_timesteps, - "num_categories": config.num_categories, - "enable_gradient_checkpointing": config.enable_gradient_checkpointing - and bool(config.finetune_vlm or config.finetune_language_model or config.finetune_vision_model), - "gradient_checkpointing_use_reentrant": config.gradient_checkpointing_use_reentrant, - "finetune_vlm": config.finetune_vlm, - "finetune_language_model": config.finetune_language_model, - "finetune_vision_model": config.finetune_vision_model, - "finetune_action_head": config.finetune_action_head, - } - @property def _camera_keys(self) -> list[str]: return list(self.config.image_features) @@ -406,6 +376,9 @@ class EVO1Policy(PreTrainedPolicy): embodiment_ids=embodiment_ids, ) flat_action_mask = action_mask.view(action_mask.shape[0], -1).to(dtype=actions_gt.dtype) + # Flow-matching velocity target. Padded (masked-out) action dims are already zero on both sides + # here (`actions_gt` is zero-padded in `_prepare_actions`, and `noise` is masked inside the head), + # and the whole difference is multiplied by `flat_action_mask`, so padded dims contribute nothing. target_velocity = (actions_gt - noise).view(actions_gt.shape[0], -1) * flat_action_mask loss = self._compute_masked_loss(pred_velocity, target_velocity, action_mask, reduction) loss_mean = loss.mean().item() if loss.ndim > 0 else loss.item() @@ -447,4 +420,7 @@ class EVO1Policy(PreTrainedPolicy): if len(self._action_queue) == 0: action_chunk = self.predict_action_chunk(batch)[:, : self.config.n_action_steps] self._action_queue.extend(action_chunk.transpose(0, 1)) + # Returns one step of shape (B, max_action_dim): actions are emitted at the padded max_action_dim + # width and cropped to the real action dim downstream by the postprocessor (Evo1ActionProcessorStep). + # Callers that bypass the postprocessor receive the padded width. return self._action_queue.popleft() diff --git a/src/lerobot/policies/evo1/processor_evo1.py b/src/lerobot/policies/evo1/processor_evo1.py index 9acc7bb74..702c034ce 100644 --- a/src/lerobot/policies/evo1/processor_evo1.py +++ b/src/lerobot/policies/evo1/processor_evo1.py @@ -21,7 +21,6 @@ from typing import Any import torch from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature -from lerobot.policies.evo1.configuration_evo1 import Evo1Config from lerobot.processor import ( AddBatchDimensionProcessorStep, DeviceProcessorStep, @@ -54,6 +53,8 @@ from lerobot.utils.constants import ( TRUNCATED, ) +from .configuration_evo1 import Evo1Config + def evo1_batch_to_transition(batch: dict[str, Any]): transition = batch_to_transition(batch) diff --git a/tests/policies/evo1/test_evo1.py b/tests/policies/evo1/test_evo1.py index 81070f92a..af2587b8b 100644 --- a/tests/policies/evo1/test_evo1.py +++ b/tests/policies/evo1/test_evo1.py @@ -20,6 +20,7 @@ import pytest import torch from torch import nn +import lerobot.policies.evo1.evo1_model as evo1_model import lerobot.policies.evo1.modeling_evo1 as modeling_evo1 from lerobot.configs.types import FeatureType, PolicyFeature from lerobot.policies.evo1.configuration_evo1 import Evo1Config @@ -225,17 +226,26 @@ def test_evo1_rejects_non_square_image_resolution(): make_config(image_resolution=(448, 320)) -def test_evo1_build_model_config_uses_image_resolution_and_trainable_checkpointing(): - stage1 = make_config(training_stage="stage1", image_resolution=(224, 224)) - stage1_model_config = modeling_evo1.EVO1Policy._build_model_config(stage1) +def test_evo1_model_uses_image_resolution_and_trainable_checkpointing(monkeypatch): + captured: dict = {} - assert stage1_model_config["image_size"] == 224 - assert stage1_model_config["enable_gradient_checkpointing"] is False + class SpyEmbedder(nn.Module): + def __init__(self, **kwargs): + super().__init__() + captured.clear() + captured.update(kwargs) + + monkeypatch.setattr(evo1_model, "InternVL3Embedder", SpyEmbedder) + + stage1 = make_config(training_stage="stage1", image_resolution=(224, 224)) + evo1_model.EVO1(stage1) + assert captured["image_size"] == 224 + # VLM is frozen in stage1, so gradient checkpointing is gated off. + assert captured["enable_gradient_checkpointing"] is False stage2 = make_config(training_stage="stage2", image_resolution=(224, 224)) - stage2_model_config = modeling_evo1.EVO1Policy._build_model_config(stage2) - - assert stage2_model_config["enable_gradient_checkpointing"] is True + evo1_model.EVO1(stage2) + assert captured["enable_gradient_checkpointing"] is True def test_evo1_policy_processors_pad_state_crop_action_and_binarize_gripper(): @@ -429,21 +439,19 @@ def test_evo1_action_mask_accepts_chunk_size_one(monkeypatch): assert not action_mask[:, :, ACTION_DIM:].any() -def test_flowmatching_dict_config_enables_state_encoder_for_horizon_one(): +def test_flowmatching_state_encoder_for_horizon_one(): head = FlowmatchingActionHead( - config={ - "embed_dim": EMBED_DIM, - "hidden_dim": 16, - "action_dim": ACTION_DIM, - "horizon": 1, - "per_action_dim": ACTION_DIM, - "num_heads": 2, - "num_layers": 1, - "num_inference_timesteps": 2, - "state_dim": STATE_DIM, - "state_hidden_dim": 16, - "num_categories": 1, - } + embed_dim=EMBED_DIM, + hidden_dim=16, + action_dim=ACTION_DIM, + horizon=1, + per_action_dim=ACTION_DIM, + num_heads=2, + num_layers=1, + num_inference_timesteps=2, + state_dim=STATE_DIM, + state_hidden_dim=16, + num_categories=1, ) assert head.state_encoder is not None diff --git a/uv.lock b/uv.lock index 172e7059c..57b1d96d1 100644 --- a/uv.lock +++ b/uv.lock @@ -2875,7 +2875,6 @@ all = [ { name = "scikit-image" }, { name = "scipy" }, { name = "teleop" }, - { name = "timm" }, { name = "torchcodec", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'win32'" }, { name = "torchdiffeq" }, { name = "transformers" }, @@ -2980,7 +2979,6 @@ evaluation = [ { name = "av" }, ] evo1 = [ - { name = "timm" }, { name = "transformers" }, ] fastwam = [ @@ -3371,7 +3369,6 @@ requires-dist = [ { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'wallx'" }, { name = "lerobot", extras = ["smolvla"], marker = "extra == 'all'" }, { name = "lerobot", extras = ["test"], marker = "extra == 'all'" }, - { name = "lerobot", extras = ["timm-dep"], marker = "extra == 'evo1'" }, { name = "lerobot", extras = ["timm-dep"], marker = "extra == 'groot'" }, { name = "lerobot", extras = ["topreward"], marker = "extra == 'all'" }, { name = "lerobot", extras = ["training"], marker = "extra == 'all'" },