mirror of
https://github.com/huggingface/lerobot.git
synced 2026-07-02 07:37:10 +00:00
docs(lingbot_va): trim verbose comments
- configuration_lingbot_va.py: condense multi-line field comments to one-liners (keep the ── section headers).
- processor_lingbot_va.py: shorten the action-quantile explanation block.
- modeling_lingbot_va.py: drop the bare "# ----" separator rules, keeping the one-line section headers.

No code changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -51,38 +51,25 @@ class LingBotVAConfig(PreTrainedConfig):
|
||||
cross_attn_norm: bool = True
|
||||
eps: float = 1e-6
|
||||
rope_max_seq_len: int = 1024
|
||||
# "flex" is supported for training only and needs a recent torch build. Inference uses
|
||||
# "torch" SDPA (always available) or, optionally, "flashattn".
|
||||
# "flex" = training only (needs recent torch); inference uses "torch" SDPA or "flashattn".
|
||||
attn_mode: str = "torch"
|
||||
|
||||
# ── Frozen sub-models (VAE + UMT5 text encoder + tokenizer) ──
|
||||
# These heavy frozen weights (~20 GB) are NOT bundled into the LeRobot safetensors
|
||||
# checkpoint (only the trainable ~5B transformer is). They are lazily pulled from this
|
||||
# HF repo / local directory at policy-init time. The directory must contain the
|
||||
# diffusers-style ``vae/``, ``text_encoder/`` and ``tokenizer/`` sub-folders.
|
||||
# ~20 GB of frozen weights, NOT bundled in the checkpoint; lazily pulled from this HF repo /
|
||||
# local dir (must hold diffusers-style ``vae/``, ``text_encoder/``, ``tokenizer/`` sub-folders).
|
||||
wan_pretrained_path: str = "robbyant/lingbot-va-posttrain-libero-long"
|
||||
# dtype used for the transformer / VAE / text-encoder weights at inference.
|
||||
dtype: str = "bfloat16" # one of "bfloat16", "float16", "float32"
|
||||
# Device for the frozen UMT5-XXL text encoder. It encodes the (fixed) instruction once per
|
||||
# episode, so keeping it on CPU frees ~11 GB of VRAM and lets the 5B transformer + VAE fit on
|
||||
# a single 24-32 GB GPU. Set to "cuda" if you have the headroom and want faster prompt encoding.
|
||||
dtype: str = "bfloat16" # transformer / VAE / text-encoder dtype: "bfloat16", "float16", "float32"
|
||||
# Frozen UMT5-XXL encoder device; "cpu" frees ~11 GB VRAM (it runs once per episode).
|
||||
text_encoder_device: str = "cpu"
|
||||
|
||||
# ── Observation cameras (order matters: latents are concatenated on width) ──
|
||||
# Defaults match the LIBERO env feature keys (agentview -> image, eye-in-hand -> image2).
|
||||
# ── Observation cameras (order matters: latents are concatenated on width; LIBERO defaults) ──
|
||||
obs_cam_keys: list[str] = field(
|
||||
default_factory=lambda: ["observation.images.image", "observation.images.image2"]
|
||||
)
|
||||
# Horizontally flip the camera images before encoding. LeRobot's LIBERO env processor rotates
|
||||
# frames 180° (flip H *and* W; the HuggingFaceVLA convention), but upstream LingBot-VA trains /
|
||||
# evaluates on vertically-flipped-only frames (``obs[::-1]`` in evaluation/libero/client.py).
|
||||
# Undoing the extra horizontal flip here realigns the input with the model's training orientation.
|
||||
# Undo the LIBERO env processor's extra horizontal flip to match the model's training orientation.
|
||||
image_hflip: bool = False
|
||||
# Latent assembly layout for the observation cameras:
|
||||
# "width_concat" : encode every camera at (height, width) and concat latents on width (LIBERO).
|
||||
# "robotwin_tshape" : head camera at full (height, width), the two wrist cameras at half
|
||||
# resolution, assembled in a "T" (wrists side-by-side on top of the head
|
||||
# on the height axis) using a second streaming VAE (RoboTwin).
|
||||
# Camera latent layout: "width_concat" (cameras concatenated on width; LIBERO) or
|
||||
# "robotwin_tshape" (full-res head + half-res wrists in a "T"; RoboTwin).
|
||||
camera_layout: str = "width_concat"
|
||||
|
||||
# ── Inference hyperparameters (LIBERO defaults) ──
|
||||
@@ -101,19 +88,15 @@ class LingBotVAConfig(PreTrainedConfig):
|
||||
action_snr_shift: float = 0.05
|
||||
max_sequence_length: int = 512 # UMT5 prompt length
|
||||
|
||||
# Subset of the 30-d action space actually used by the benchmark (LIBERO = 7-DoF).
|
||||
# The fixed action (un)normalization quantiles are not stored here nor hardcoded in the
|
||||
# processor: they are serialized into the checkpoint's ``policy_postprocessor.json``
|
||||
# (``LingBotVAActionUnnormalizeStep``) and restored on load by ``from_pretrained``.
|
||||
# Subset of the 30-d action space used by the benchmark (LIBERO = 7-DoF). The action
|
||||
# (un)normalization quantiles live in the checkpoint's ``policy_postprocessor.json``, not here.
|
||||
used_action_channel_ids: list[int] = field(default_factory=lambda: list(range(7)))
|
||||
|
||||
# Opt-in: VAE-decode the predicted video latents and stash them on
|
||||
# ``self.last_predicted_frames`` so eval/train can save predicted-video MP4s.
|
||||
# Opt-in: VAE-decode predicted video latents to ``self.last_predicted_frames`` for saving MP4s.
|
||||
save_predicted_video: bool = False
|
||||
|
||||
# ── Normalization (handled internally / via custom steps, hence IDENTITY here) ──
|
||||
# Images are scaled to [-1, 1] and VAE-encoded inside the policy; actions are
|
||||
# quantile-(un)normalized by dedicated processor steps using the fixed quantiles above.
|
||||
# ── Normalization: IDENTITY here; images are scaled + VAE-encoded and actions are
|
||||
# quantile-(un)normalized inside the policy / dedicated processor steps. ──
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
|
||||
@@ -1191,9 +1191,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
self.last_predicted_frames: Tensor | None = None
|
||||
self.reset()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frozen-module lazy loading (VAE + UMT5 + tokenizer)
|
||||
# ------------------------------------------------------------------
|
||||
def _ensure_frozen_modules(self):
|
||||
if self._frozen:
|
||||
return
|
||||
@@ -1234,9 +1232,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
def _streaming_vae(self):
|
||||
return self._frozen["streaming_vae"]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# PreTrainedPolicy API
|
||||
# ------------------------------------------------------------------
|
||||
def get_optim_params(self) -> dict:
|
||||
# Only the transformer is trainable; the VAE / text encoder stay frozen (kept outside the
|
||||
# nn.Module registry). With PEFT/LoRA this naturally returns just the adapter params.
|
||||
@@ -1283,9 +1279,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
if "streaming_vae_half" in self._frozen:
|
||||
self._frozen["streaming_vae_half"].clear_cache()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Training (flow-matching dual-stream loss). Requires attn_mode="flex".
|
||||
# ------------------------------------------------------------------
|
||||
def _ensure_train_schedulers(self):
|
||||
if getattr(self, "_train_sched_latent", None) is None:
|
||||
cfg = self.config
|
||||
@@ -1565,9 +1559,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
a = a.transpose(1, 2).contiguous() # [B, n_steps, n_used]
|
||||
return a.to(torch.float32)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Prompt / text encoding
|
||||
# ------------------------------------------------------------------
|
||||
def _maybe_init_prompt(self, batch):
|
||||
if self._prompt_embeds is not None or batch is None:
|
||||
return
|
||||
@@ -1616,9 +1608,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
negative_prompt_embeds = self._get_t5_prompt_embeds("", max_len)
|
||||
return prompt_embeds, negative_prompt_embeds
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Observation (image) encoding -> normalized video latents
|
||||
# ------------------------------------------------------------------
|
||||
def _extract_raw_obs(self, batch) -> dict[str, Tensor]:
|
||||
"""Snapshot the configured camera images from a batch (kept raw for later VAE encoding)."""
|
||||
return {k: batch[k].detach() for k in self.config.obs_cam_keys}
|
||||
@@ -1696,9 +1686,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
video_latent = self._normalize_vae_latent(enc_out)
|
||||
return video_latent.to(self.config.device)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# KV cache management
|
||||
# ------------------------------------------------------------------
|
||||
@property
|
||||
def _latent_hw(self):
|
||||
if self.config.camera_layout == "robotwin_tshape":
|
||||
@@ -1802,9 +1790,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
mask[self.config.used_action_channel_ids] = True
|
||||
return mask
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Action conditioning (executed action history) (de)normalization
|
||||
# ------------------------------------------------------------------
|
||||
def _preprocess_action_state(self, action_norm: Tensor) -> Tensor:
|
||||
"""Build the action-conditioning tensor from the already-normalized executed actions.
|
||||
|
||||
@@ -1845,9 +1831,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
)
|
||||
self._frame_st_id += latent_model_input.shape[2]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# The core dual-stream denoising loop (one chunk)
|
||||
# ------------------------------------------------------------------
|
||||
@torch.no_grad()
|
||||
def _infer(self, init_latent, frame_st_id=0):
|
||||
cfg = self.config
|
||||
@@ -1937,9 +1921,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
|
||||
actions[:, ~self._action_mask] *= 0
|
||||
return actions, latents
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Predicted-video decoding (opt-in)
|
||||
# ------------------------------------------------------------------
|
||||
@torch.no_grad()
|
||||
def _decode_predicted_video(self, latents) -> Tensor:
|
||||
"""VAE-decode predicted latents into a uint8 frame stack ``[T, H, W, 3]`` on CPU."""
|
||||
|
||||
@@ -47,14 +47,9 @@ from lerobot.utils.constants import (
|
||||
|
||||
from .configuration_lingbot_va import LingBotVAConfig
|
||||
|
||||
# LingBot-VA applies a *fixed* per-channel action quantile (un)normalization rather than
|
||||
# dataset-derived stats. The benchmark-specific quantiles (LIBERO 7-DoF, RoboTwin 16-d eef) are
|
||||
# deliberately NOT hardcoded here: they are serialized into each checkpoint's
|
||||
# ``policy_postprocessor.json`` (via ``LingBotVAActionUnnormalizeStep.get_config``) and restored on
|
||||
# load by ``PolicyProcessorPipeline.from_pretrained``. A freshly built (unconverted) policy defaults
|
||||
# to a neutral ``[-1, 1]`` mapping (identity rescale); the real stats always come from the checkpoint
|
||||
# (or via ``postprocessor_overrides``). To regenerate a checkpoint from scratch, source the quantiles
|
||||
# from the upstream ``wan_va/configs/va_{libero,robotwin}_cfg.py`` and pass them through.
|
||||
# LingBot-VA uses fixed per-channel action quantile (un)normalization. The benchmark quantiles are
|
||||
# NOT hardcoded here: they live in each checkpoint's ``policy_postprocessor.json`` and are restored on
|
||||
# load. A fresh (unconverted) policy defaults to a neutral ``[-1, 1]`` mapping (identity rescale).
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
Reference in New Issue
Block a user