diff --git a/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py b/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py index 1f54bd322..5cb3a2341 100644 --- a/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py +++ b/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py @@ -51,38 +51,25 @@ class LingBotVAConfig(PreTrainedConfig): cross_attn_norm: bool = True eps: float = 1e-6 rope_max_seq_len: int = 1024 - # "flex" is supported for training only and needs a recent torch build. Inference uses - # "torch" SDPA (always available) or, optionally, "flashattn". + # "flex" = training only (needs recent torch); inference uses "torch" SDPA or "flashattn". attn_mode: str = "torch" # ── Frozen sub-models (VAE + UMT5 text encoder + tokenizer) ── - # These heavy frozen weights (~20 GB) are NOT bundled into the LeRobot safetensors - # checkpoint (only the trainable ~5B transformer is). They are lazily pulled from this - # HF repo / local directory at policy-init time. The directory must contain the - # diffusers-style ``vae/``, ``text_encoder/`` and ``tokenizer/`` sub-folders. + # ~20 GB of frozen weights, NOT bundled in the checkpoint; lazily pulled from this HF repo / + # local dir (must hold diffusers-style ``vae/``, ``text_encoder/``, ``tokenizer/`` sub-folders). wan_pretrained_path: str = "robbyant/lingbot-va-posttrain-libero-long" - # dtype used for the transformer / VAE / text-encoder weights at inference. - dtype: str = "bfloat16" # one of "bfloat16", "float16", "float32" - # Device for the frozen UMT5-XXL text encoder. It encodes the (fixed) instruction once per - # episode, so keeping it on CPU frees ~11 GB of VRAM and lets the 5B transformer + VAE fit on - # a single 24-32 GB GPU. Set to "cuda" if you have the headroom and want faster prompt encoding. + dtype: str = "bfloat16" # transformer / VAE / text-encoder dtype: "bfloat16", "float16", "float32" + # Frozen UMT5-XXL encoder device; "cpu" frees ~11 GB VRAM (it runs once per episode). 
text_encoder_device: str = "cpu" - # ── Observation cameras (order matters: latents are concatenated on width) ── - # Defaults match the LIBERO env feature keys (agentview -> image, eye-in-hand -> image2). + # ── Observation cameras (order matters: latents are concatenated on width; LIBERO defaults) ── obs_cam_keys: list[str] = field( default_factory=lambda: ["observation.images.image", "observation.images.image2"] ) - # Horizontally flip the camera images before encoding. LeRobot's LIBERO env processor rotates - # frames 180° (flip H *and* W; the HuggingFaceVLA convention), but upstream LingBot-VA trains / - # evaluates on vertically-flipped-only frames (``obs[::-1]`` in evaluation/libero/client.py). - # Undoing the extra horizontal flip here realigns the input with the model's training orientation. + # If True, undo the LIBERO env processor's extra horizontal flip to match the training orientation. image_hflip: bool = False - # Latent assembly layout for the observation cameras: - # "width_concat" : encode every camera at (height, width) and concat latents on width (LIBERO). - # "robotwin_tshape" : head camera at full (height, width), the two wrist cameras at half - # resolution, assembled in a "T" (wrists side-by-side on top of the head - # on the height axis) using a second streaming VAE (RoboTwin). + # Camera latent layout: "width_concat" (cameras concatenated on width; LIBERO) or + # "robotwin_tshape" (full-res head + half-res wrists in a "T"; RoboTwin). camera_layout: str = "width_concat" # ── Inference hyperparameters (LIBERO defaults) ── @@ -101,19 +88,15 @@ class LingBotVAConfig(PreTrainedConfig): action_snr_shift: float = 0.05 max_sequence_length: int = 512 # UMT5 prompt length - # Subset of the 30-d action space actually used by the benchmark (LIBERO = 7-DoF). 
- # The fixed action (un)normalization quantiles are not stored here nor hardcoded in the - # processor: they are serialized into the checkpoint's ``policy_postprocessor.json`` - # (``LingBotVAActionUnnormalizeStep``) and restored on load by ``from_pretrained``. + # Subset of the 30-d action space used by the benchmark (LIBERO = 7-DoF). The action + # (un)normalization quantiles live in the checkpoint's ``policy_postprocessor.json``, not here. used_action_channel_ids: list[int] = field(default_factory=lambda: list(range(7))) - # Opt-in: VAE-decode the predicted video latents and stash them on - # ``self.last_predicted_frames`` so eval/train can save predicted-video MP4s. + # Opt-in: VAE-decode predicted video latents to ``self.last_predicted_frames`` for saving MP4s. save_predicted_video: bool = False - # ── Normalization (handled internally / via custom steps, hence IDENTITY here) ── - # Images are scaled to [-1, 1] and VAE-encoded inside the policy; actions are - # quantile-(un)normalized by dedicated processor steps using the fixed quantiles above. + # ── Normalization: IDENTITY here; images are scaled + VAE-encoded and actions are + # quantile-(un)normalized inside the policy / dedicated processor steps. 
── normalization_mapping: dict[str, NormalizationMode] = field( default_factory=lambda: { "VISUAL": NormalizationMode.IDENTITY, diff --git a/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py b/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py index bf66f608f..dae563f91 100644 --- a/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py +++ b/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py @@ -1191,9 +1191,7 @@ class LingBotVAPolicy(PreTrainedPolicy): self.last_predicted_frames: Tensor | None = None self.reset() - # ------------------------------------------------------------------ # Frozen-module lazy loading (VAE + UMT5 + tokenizer) - # ------------------------------------------------------------------ def _ensure_frozen_modules(self): if self._frozen: return @@ -1234,9 +1232,7 @@ class LingBotVAPolicy(PreTrainedPolicy): def _streaming_vae(self): return self._frozen["streaming_vae"] - # ------------------------------------------------------------------ # PreTrainedPolicy API - # ------------------------------------------------------------------ def get_optim_params(self) -> dict: # Only the transformer is trainable; the VAE / text encoder stay frozen (kept outside the # nn.Module registry). With PEFT/LoRA this naturally returns just the adapter params. @@ -1283,9 +1279,7 @@ class LingBotVAPolicy(PreTrainedPolicy): if "streaming_vae_half" in self._frozen: self._frozen["streaming_vae_half"].clear_cache() - # ------------------------------------------------------------------ # Training (flow-matching dual-stream loss). Requires attn_mode="flex". 
- # ------------------------------------------------------------------ def _ensure_train_schedulers(self): if getattr(self, "_train_sched_latent", None) is None: cfg = self.config @@ -1565,9 +1559,7 @@ class LingBotVAPolicy(PreTrainedPolicy): a = a.transpose(1, 2).contiguous() # [B, n_steps, n_used] return a.to(torch.float32) - # ------------------------------------------------------------------ # Prompt / text encoding - # ------------------------------------------------------------------ def _maybe_init_prompt(self, batch): if self._prompt_embeds is not None or batch is None: return @@ -1616,9 +1608,7 @@ class LingBotVAPolicy(PreTrainedPolicy): negative_prompt_embeds = self._get_t5_prompt_embeds("", max_len) return prompt_embeds, negative_prompt_embeds - # ------------------------------------------------------------------ # Observation (image) encoding -> normalized video latents - # ------------------------------------------------------------------ def _extract_raw_obs(self, batch) -> dict[str, Tensor]: """Snapshot the configured camera images from a batch (kept raw for later VAE encoding).""" return {k: batch[k].detach() for k in self.config.obs_cam_keys} @@ -1696,9 +1686,7 @@ class LingBotVAPolicy(PreTrainedPolicy): video_latent = self._normalize_vae_latent(enc_out) return video_latent.to(self.config.device) - # ------------------------------------------------------------------ # KV cache management - # ------------------------------------------------------------------ @property def _latent_hw(self): if self.config.camera_layout == "robotwin_tshape": @@ -1802,9 +1790,7 @@ class LingBotVAPolicy(PreTrainedPolicy): mask[self.config.used_action_channel_ids] = True return mask - # ------------------------------------------------------------------ # Action conditioning (executed action history) (de)normalization - # ------------------------------------------------------------------ def _preprocess_action_state(self, action_norm: Tensor) -> Tensor: """Build the 
action-conditioning tensor from the already-normalized executed actions. @@ -1845,9 +1831,7 @@ class LingBotVAPolicy(PreTrainedPolicy): ) self._frame_st_id += latent_model_input.shape[2] - # ------------------------------------------------------------------ # The core dual-stream denoising loop (one chunk) - # ------------------------------------------------------------------ @torch.no_grad() def _infer(self, init_latent, frame_st_id=0): cfg = self.config @@ -1937,9 +1921,7 @@ class LingBotVAPolicy(PreTrainedPolicy): actions[:, ~self._action_mask] *= 0 return actions, latents - # ------------------------------------------------------------------ # Predicted-video decoding (opt-in) - # ------------------------------------------------------------------ @torch.no_grad() def _decode_predicted_video(self, latents) -> Tensor: """VAE-decode predicted latents into a uint8 frame stack ``[T, H, W, 3]`` on CPU.""" diff --git a/src/lerobot/policies/lingbot_va/processor_lingbot_va.py b/src/lerobot/policies/lingbot_va/processor_lingbot_va.py index 2e616f702..192abf7fd 100644 --- a/src/lerobot/policies/lingbot_va/processor_lingbot_va.py +++ b/src/lerobot/policies/lingbot_va/processor_lingbot_va.py @@ -47,14 +47,9 @@ from lerobot.utils.constants import ( from .configuration_lingbot_va import LingBotVAConfig -# LingBot-VA applies a *fixed* per-channel action quantile (un)normalization rather than -# dataset-derived stats. The benchmark-specific quantiles (LIBERO 7-DoF, RoboTwin 16-d eef) are -# deliberately NOT hardcoded here: they are serialized into each checkpoint's -# ``policy_postprocessor.json`` (via ``LingBotVAActionUnnormalizeStep.get_config``) and restored on -# load by ``PolicyProcessorPipeline.from_pretrained``. A freshly built (unconverted) policy defaults -# to a neutral ``[-1, 1]`` mapping (identity rescale); the real stats always come from the checkpoint -# (or via ``postprocessor_overrides``). 
To regenerate a checkpoint from scratch, source the quantiles -# from the upstream ``wan_va/configs/va_{libero,robotwin}_cfg.py`` and pass them through. +# LingBot-VA uses fixed per-channel action quantile (un)normalization. The benchmark quantiles are +# NOT hardcoded here: they live in each checkpoint's ``policy_postprocessor.json`` and are restored on +# load. A fresh (unconverted) policy defaults to a neutral ``[-1, 1]`` mapping (identity rescale). @dataclass