docs(lingbot_va): trim verbose comments

- configuration_lingbot_va.py: condense multi-line field comments to one or two lines
  (keep the ── section headers).
- processor_lingbot_va.py: shorten the action-quantile explanation block.
- modeling_lingbot_va.py: drop the bare "# ----" separator rules, keeping the one-line
  section headers.

No code changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-02 07:37:10 +00:00 · 2026-06-08 11:31:05 +02:00
parent c764afb8ef
commit 50b20c5bf1
3 changed files with 17 additions and 57 deletions
@@ -51,38 +51,25 @@ class LingBotVAConfig(PreTrainedConfig):
    cross_attn_norm: bool = True
    eps: float = 1e-6
    rope_max_seq_len: int = 1024
-    # "flex" is supported for training only and needs a recent torch build. Inference uses
-    # "torch" SDPA (always available) or, optionally, "flashattn".
+    # "flex" = training only (needs recent torch); inference uses "torch" SDPA or "flashattn".
    attn_mode: str = "torch"

    # ── Frozen sub-models (VAE + UMT5 text encoder + tokenizer) ──
-    # These heavy frozen weights (~20 GB) are NOT bundled into the LeRobot safetensors
-    # checkpoint (only the trainable ~5B transformer is). They are lazily pulled from this
-    # HF repo / local directory at policy-init time. The directory must contain the
-    # diffusers-style ``vae/``, ``text_encoder/`` and ``tokenizer/`` sub-folders.
+    # ~20 GB of frozen weights, NOT bundled in the checkpoint; lazily pulled from this HF repo /
+    # local dir (must hold diffusers-style ``vae/``, ``text_encoder/``, ``tokenizer/`` sub-folders).
    wan_pretrained_path: str = "robbyant/lingbot-va-posttrain-libero-long"
-    # dtype used for the transformer / VAE / text-encoder weights at inference.
-    dtype: str = "bfloat16"  # one of "bfloat16", "float16", "float32"
-    # Device for the frozen UMT5-XXL text encoder. It encodes the (fixed) instruction once per
-    # episode, so keeping it on CPU frees ~11 GB of VRAM and lets the 5B transformer + VAE fit on
-    # a single 24-32 GB GPU. Set to "cuda" if you have the headroom and want faster prompt encoding.
+    dtype: str = "bfloat16"  # transformer / VAE / text-encoder dtype: "bfloat16", "float16", "float32"
+    # Frozen UMT5-XXL encoder device; "cpu" frees ~11 GB VRAM (it runs once per episode).
    text_encoder_device: str = "cpu"

-    # ── Observation cameras (order matters: latents are concatenated on width) ──
-    # Defaults match the LIBERO env feature keys (agentview -> image, eye-in-hand -> image2).
+    # ── Observation cameras (order matters: latents are concatenated on width; LIBERO defaults) ──
    obs_cam_keys: list[str] = field(
        default_factory=lambda: ["observation.images.image", "observation.images.image2"]
    )
-    # Horizontally flip the camera images before encoding. LeRobot's LIBERO env processor rotates
-    # frames 180° (flip H *and* W; the HuggingFaceVLA convention), but upstream LingBot-VA trains /
-    # evaluates on vertically-flipped-only frames (``obs[::-1]`` in evaluation/libero/client.py).
-    # Undoing the extra horizontal flip here realigns the input with the model's training orientation.
+    # If True, undo the LIBERO env processor's extra horizontal flip (matches the training orientation).
    image_hflip: bool = False
-    # Latent assembly layout for the observation cameras:
-    #   "width_concat"    : encode every camera at (height, width) and concat latents on width (LIBERO).
-    #   "robotwin_tshape" : head camera at full (height, width), the two wrist cameras at half
-    #                       resolution, assembled in a "T" (wrists side-by-side on top of the head
-    #                       on the height axis) using a second streaming VAE (RoboTwin).
+    # Camera latent layout: "width_concat" (cameras concatenated on width; LIBERO) or
+    # "robotwin_tshape" (full-res head + half-res wrists in a "T"; RoboTwin).
    camera_layout: str = "width_concat"

    # ── Inference hyperparameters (LIBERO defaults) ──
@@ -101,19 +88,15 @@ class LingBotVAConfig(PreTrainedConfig):
    action_snr_shift: float = 0.05
    max_sequence_length: int = 512  # UMT5 prompt length

-    # Subset of the 30-d action space actually used by the benchmark (LIBERO = 7-DoF).
-    # The fixed action (un)normalization quantiles are not stored here nor hardcoded in the
-    # processor: they are serialized into the checkpoint's ``policy_postprocessor.json``
-    # (``LingBotVAActionUnnormalizeStep``) and restored on load by ``from_pretrained``.
+    # Subset of the 30-d action space used by the benchmark (LIBERO = 7-DoF). The action
+    # (un)normalization quantiles live in the checkpoint's ``policy_postprocessor.json``, not here.
    used_action_channel_ids: list[int] = field(default_factory=lambda: list(range(7)))

-    # Opt-in: VAE-decode the predicted video latents and stash them on
-    # ``self.last_predicted_frames`` so eval/train can save predicted-video MP4s.
+    # Opt-in: VAE-decode predicted video latents to ``self.last_predicted_frames`` for saving MP4s.
    save_predicted_video: bool = False

-    # ── Normalization (handled internally / via custom steps, hence IDENTITY here) ──
-    # Images are scaled to [-1, 1] and VAE-encoded inside the policy; actions are
-    # quantile-(un)normalized by dedicated processor steps using the fixed quantiles above.
+    # ── Normalization (IDENTITY here; handled in the policy / dedicated processor steps) ──
+    # Images are scaled to [-1, 1] and VAE-encoded; actions are quantile-(un)normalized.
    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
            "VISUAL": NormalizationMode.IDENTITY,
@@ -1191,9 +1191,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
        self.last_predicted_frames: Tensor | None = None
        self.reset()

-    # ------------------------------------------------------------------
    # Frozen-module lazy loading (VAE + UMT5 + tokenizer)
-    # ------------------------------------------------------------------
    def _ensure_frozen_modules(self):
        if self._frozen:
            return
@@ -1234,9 +1232,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
    def _streaming_vae(self):
        return self._frozen["streaming_vae"]

-    # ------------------------------------------------------------------
    # PreTrainedPolicy API
-    # ------------------------------------------------------------------
    def get_optim_params(self) -> dict:
        # Only the transformer is trainable; the VAE / text encoder stay frozen (kept outside the
        # nn.Module registry). With PEFT/LoRA this naturally returns just the adapter params.
@@ -1283,9 +1279,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
            if "streaming_vae_half" in self._frozen:
                self._frozen["streaming_vae_half"].clear_cache()

-    # ------------------------------------------------------------------
    # Training (flow-matching dual-stream loss). Requires attn_mode="flex".
-    # ------------------------------------------------------------------
    def _ensure_train_schedulers(self):
        if getattr(self, "_train_sched_latent", None) is None:
            cfg = self.config
@@ -1565,9 +1559,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
        a = a.transpose(1, 2).contiguous()  # [B, n_steps, n_used]
        return a.to(torch.float32)

-    # ------------------------------------------------------------------
    # Prompt / text encoding
-    # ------------------------------------------------------------------
    def _maybe_init_prompt(self, batch):
        if self._prompt_embeds is not None or batch is None:
            return
@@ -1616,9 +1608,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
            negative_prompt_embeds = self._get_t5_prompt_embeds("", max_len)
        return prompt_embeds, negative_prompt_embeds

-    # ------------------------------------------------------------------
    # Observation (image) encoding -> normalized video latents
-    # ------------------------------------------------------------------
    def _extract_raw_obs(self, batch) -> dict[str, Tensor]:
        """Snapshot the configured camera images from a batch (kept raw for later VAE encoding)."""
        return {k: batch[k].detach() for k in self.config.obs_cam_keys}
@@ -1696,9 +1686,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
        video_latent = self._normalize_vae_latent(enc_out)
        return video_latent.to(self.config.device)

-    # ------------------------------------------------------------------
    # KV cache management
-    # ------------------------------------------------------------------
    @property
    def _latent_hw(self):
        if self.config.camera_layout == "robotwin_tshape":
@@ -1802,9 +1790,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
        mask[self.config.used_action_channel_ids] = True
        return mask

-    # ------------------------------------------------------------------
    # Action conditioning (executed action history) (de)normalization
-    # ------------------------------------------------------------------
    def _preprocess_action_state(self, action_norm: Tensor) -> Tensor:
        """Build the action-conditioning tensor from the already-normalized executed actions.

@@ -1845,9 +1831,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
            )
        self._frame_st_id += latent_model_input.shape[2]

-    # ------------------------------------------------------------------
    # The core dual-stream denoising loop (one chunk)
-    # ------------------------------------------------------------------
    @torch.no_grad()
    def _infer(self, init_latent, frame_st_id=0):
        cfg = self.config
@@ -1937,9 +1921,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
        actions[:, ~self._action_mask] *= 0
        return actions, latents

-    # ------------------------------------------------------------------
    # Predicted-video decoding (opt-in)
-    # ------------------------------------------------------------------
    @torch.no_grad()
    def _decode_predicted_video(self, latents) -> Tensor:
        """VAE-decode predicted latents into a uint8 frame stack ``[T, H, W, 3]`` on CPU."""
@@ -47,14 +47,9 @@ from lerobot.utils.constants import (

 from .configuration_lingbot_va import LingBotVAConfig

-# LingBot-VA applies a *fixed* per-channel action quantile (un)normalization rather than
-# dataset-derived stats. The benchmark-specific quantiles (LIBERO 7-DoF, RoboTwin 16-d eef) are
-# deliberately NOT hardcoded here: they are serialized into each checkpoint's
-# ``policy_postprocessor.json`` (via ``LingBotVAActionUnnormalizeStep.get_config``) and restored on
-# load by ``PolicyProcessorPipeline.from_pretrained``. A freshly built (unconverted) policy defaults
-# to a neutral ``[-1, 1]`` mapping (identity rescale); the real stats always come from the checkpoint
-# (or via ``postprocessor_overrides``). To regenerate a checkpoint from scratch, source the quantiles
-# from the upstream ``wan_va/configs/va_{libero,robotwin}_cfg.py`` and pass them through.
+# LingBot-VA uses fixed per-channel action quantile (un)normalization. The benchmark quantiles are
+# NOT hardcoded here: they live in each checkpoint's ``policy_postprocessor.json`` and are restored on
+# load. A fresh (unconverted) policy defaults to a neutral ``[-1, 1]`` mapping (identity rescale).


@dataclass