docs(lingbot_va): trim verbose comments

- configuration_lingbot_va.py: condense multi-line field comments to one-liners
  (keep the ── section headers).
- processor_lingbot_va.py: shorten the action-quantile explanation block.
- modeling_lingbot_va.py: drop the bare "# ----" separator rules, keeping the
  one-line section headers.

No code changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-06-08 11:31:05 +02:00
committed by Maxime Ellerbach
parent c764afb8ef
commit 50b20c5bf1
3 changed files with 17 additions and 57 deletions
@@ -51,38 +51,25 @@ class LingBotVAConfig(PreTrainedConfig):
cross_attn_norm: bool = True
eps: float = 1e-6
rope_max_seq_len: int = 1024
# "flex" is supported for training only and needs a recent torch build. Inference uses
# "torch" SDPA (always available) or, optionally, "flashattn".
# "flex" = training only (needs recent torch); inference uses "torch" SDPA or "flashattn".
attn_mode: str = "torch"
# ── Frozen sub-models (VAE + UMT5 text encoder + tokenizer) ──
# These heavy frozen weights (~20 GB) are NOT bundled into the LeRobot safetensors
# checkpoint (only the trainable ~5B transformer is). They are lazily pulled from this
# HF repo / local directory at policy-init time. The directory must contain the
# diffusers-style ``vae/``, ``text_encoder/`` and ``tokenizer/`` sub-folders.
# ~20 GB of frozen weights, NOT bundled in the checkpoint; lazily pulled from this HF repo /
# local dir (must hold diffusers-style ``vae/``, ``text_encoder/``, ``tokenizer/`` sub-folders).
wan_pretrained_path: str = "robbyant/lingbot-va-posttrain-libero-long"
# dtype used for the transformer / VAE / text-encoder weights at inference.
dtype: str = "bfloat16" # one of "bfloat16", "float16", "float32"
# Device for the frozen UMT5-XXL text encoder. It encodes the (fixed) instruction once per
# episode, so keeping it on CPU frees ~11 GB of VRAM and lets the 5B transformer + VAE fit on
# a single 24-32 GB GPU. Set to "cuda" if you have the headroom and want faster prompt encoding.
dtype: str = "bfloat16" # transformer / VAE / text-encoder dtype: "bfloat16", "float16", "float32"
# Frozen UMT5-XXL encoder device; "cpu" frees ~11 GB VRAM (it runs once per episode).
text_encoder_device: str = "cpu"
# ── Observation cameras (order matters: latents are concatenated on width) ──
# Defaults match the LIBERO env feature keys (agentview -> image, eye-in-hand -> image2).
# ── Observation cameras (order matters: latents are concatenated on width; LIBERO defaults) ──
obs_cam_keys: list[str] = field(
default_factory=lambda: ["observation.images.image", "observation.images.image2"]
)
# Horizontally flip the camera images before encoding. LeRobot's LIBERO env processor rotates
# frames 180° (flip H *and* W; the HuggingFaceVLA convention), but upstream LingBot-VA trains /
# evaluates on vertically-flipped-only frames (``obs[::-1]`` in evaluation/libero/client.py).
# Undoing the extra horizontal flip here realigns the input with the model's training orientation.
# If True, horizontally flip the camera images before encoding, undoing the LIBERO env
# processor's extra horizontal flip so the input matches the model's training orientation.
image_hflip: bool = False
# Latent assembly layout for the observation cameras:
# "width_concat" : encode every camera at (height, width) and concat latents on width (LIBERO).
# "robotwin_tshape" : head camera at full (height, width), the two wrist cameras at half
# resolution, assembled in a "T" (wrists side-by-side on top of the head
# on the height axis) using a second streaming VAE (RoboTwin).
# Camera latent layout: "width_concat" (cameras concatenated on width; LIBERO) or
# "robotwin_tshape" (full-res head + half-res wrists in a "T"; RoboTwin).
camera_layout: str = "width_concat"
# ── Inference hyperparameters (LIBERO defaults) ──
@@ -101,19 +88,15 @@ class LingBotVAConfig(PreTrainedConfig):
action_snr_shift: float = 0.05
max_sequence_length: int = 512 # UMT5 prompt length
# Subset of the 30-d action space actually used by the benchmark (LIBERO = 7-DoF).
# The fixed action (un)normalization quantiles are not stored here nor hardcoded in the
# processor: they are serialized into the checkpoint's ``policy_postprocessor.json``
# (``LingBotVAActionUnnormalizeStep``) and restored on load by ``from_pretrained``.
# Subset of the 30-d action space used by the benchmark (LIBERO = 7-DoF). The action
# (un)normalization quantiles live in the checkpoint's ``policy_postprocessor.json``, not here.
used_action_channel_ids: list[int] = field(default_factory=lambda: list(range(7)))
# Opt-in: VAE-decode the predicted video latents and stash them on
# ``self.last_predicted_frames`` so eval/train can save predicted-video MP4s.
# Opt-in: VAE-decode the predicted video latents and store them on
# ``self.last_predicted_frames`` so predicted-video MP4s can be saved.
save_predicted_video: bool = False
# ── Normalization (handled internally / via custom steps, hence IDENTITY here) ──
# Images are scaled to [-1, 1] and VAE-encoded inside the policy; actions are
# quantile-(un)normalized by dedicated processor steps using the fixed quantiles above.
# ── Normalization: IDENTITY here; images are scaled + VAE-encoded and actions are
# quantile-(un)normalized inside the policy / dedicated processor steps. ──
normalization_mapping: dict[str, NormalizationMode] = field(
default_factory=lambda: {
"VISUAL": NormalizationMode.IDENTITY,
@@ -1191,9 +1191,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
self.last_predicted_frames: Tensor | None = None
self.reset()
# ------------------------------------------------------------------
# Frozen-module lazy loading (VAE + UMT5 + tokenizer)
# ------------------------------------------------------------------
def _ensure_frozen_modules(self):
if self._frozen:
return
@@ -1234,9 +1232,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
def _streaming_vae(self):
return self._frozen["streaming_vae"]
# ------------------------------------------------------------------
# PreTrainedPolicy API
# ------------------------------------------------------------------
def get_optim_params(self) -> dict:
# Only the transformer is trainable; the VAE / text encoder stay frozen (kept outside the
# nn.Module registry). With PEFT/LoRA this naturally returns just the adapter params.
@@ -1283,9 +1279,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
if "streaming_vae_half" in self._frozen:
self._frozen["streaming_vae_half"].clear_cache()
# ------------------------------------------------------------------
# Training (flow-matching dual-stream loss). Requires attn_mode="flex".
# ------------------------------------------------------------------
def _ensure_train_schedulers(self):
if getattr(self, "_train_sched_latent", None) is None:
cfg = self.config
@@ -1565,9 +1559,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
a = a.transpose(1, 2).contiguous() # [B, n_steps, n_used]
return a.to(torch.float32)
# ------------------------------------------------------------------
# Prompt / text encoding
# ------------------------------------------------------------------
def _maybe_init_prompt(self, batch):
if self._prompt_embeds is not None or batch is None:
return
@@ -1616,9 +1608,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
negative_prompt_embeds = self._get_t5_prompt_embeds("", max_len)
return prompt_embeds, negative_prompt_embeds
# ------------------------------------------------------------------
# Observation (image) encoding -> normalized video latents
# ------------------------------------------------------------------
def _extract_raw_obs(self, batch) -> dict[str, Tensor]:
"""Snapshot the configured camera images from a batch (kept raw for later VAE encoding)."""
return {k: batch[k].detach() for k in self.config.obs_cam_keys}
@@ -1696,9 +1686,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
video_latent = self._normalize_vae_latent(enc_out)
return video_latent.to(self.config.device)
# ------------------------------------------------------------------
# KV cache management
# ------------------------------------------------------------------
@property
def _latent_hw(self):
if self.config.camera_layout == "robotwin_tshape":
@@ -1802,9 +1790,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
mask[self.config.used_action_channel_ids] = True
return mask
# ------------------------------------------------------------------
# Action conditioning (executed action history) (de)normalization
# ------------------------------------------------------------------
def _preprocess_action_state(self, action_norm: Tensor) -> Tensor:
"""Build the action-conditioning tensor from the already-normalized executed actions.
@@ -1845,9 +1831,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
)
self._frame_st_id += latent_model_input.shape[2]
# ------------------------------------------------------------------
# The core dual-stream denoising loop (one chunk)
# ------------------------------------------------------------------
@torch.no_grad()
def _infer(self, init_latent, frame_st_id=0):
cfg = self.config
@@ -1937,9 +1921,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
actions[:, ~self._action_mask] *= 0
return actions, latents
# ------------------------------------------------------------------
# Predicted-video decoding (opt-in)
# ------------------------------------------------------------------
@torch.no_grad()
def _decode_predicted_video(self, latents) -> Tensor:
"""VAE-decode predicted latents into a uint8 frame stack ``[T, H, W, 3]`` on CPU."""
@@ -47,14 +47,9 @@ from lerobot.utils.constants import (
from .configuration_lingbot_va import LingBotVAConfig
# LingBot-VA applies a *fixed* per-channel action quantile (un)normalization rather than
# dataset-derived stats. The benchmark-specific quantiles (LIBERO 7-DoF, RoboTwin 16-d eef) are
# deliberately NOT hardcoded here: they are serialized into each checkpoint's
# ``policy_postprocessor.json`` (via ``LingBotVAActionUnnormalizeStep.get_config``) and restored on
# load by ``PolicyProcessorPipeline.from_pretrained``. A freshly built (unconverted) policy defaults
# to a neutral ``[-1, 1]`` mapping (identity rescale); the real stats always come from the checkpoint
# (or via ``postprocessor_overrides``). To regenerate a checkpoint from scratch, source the quantiles
# from the upstream ``wan_va/configs/va_{libero,robotwin}_cfg.py`` and pass them through.
# LingBot-VA uses fixed per-channel action quantile (un)normalization. The benchmark quantiles are
# NOT hardcoded here: they live in each checkpoint's ``policy_postprocessor.json`` and are restored on
# load. A fresh (unconverted) policy defaults to a neutral ``[-1, 1]`` mapping (identity rescale).
@dataclass