diff --git a/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py b/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py
index 1f54bd322..5cb3a2341 100644
--- a/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py
+++ b/src/lerobot/policies/lingbot_va/configuration_lingbot_va.py
@@ -51,38 +51,25 @@ class LingBotVAConfig(PreTrainedConfig):
     cross_attn_norm: bool = True
     eps: float = 1e-6
     rope_max_seq_len: int = 1024
-    # "flex" is supported for training only and needs a recent torch build. Inference uses
-    # "torch" SDPA (always available) or, optionally, "flashattn".
+    # "flex" = training only (needs recent torch); inference uses "torch" SDPA or "flashattn".
     attn_mode: str = "torch"
 
     # ── Frozen sub-models (VAE + UMT5 text encoder + tokenizer) ──
-    # These heavy frozen weights (~20 GB) are NOT bundled into the LeRobot safetensors
-    # checkpoint (only the trainable ~5B transformer is). They are lazily pulled from this
-    # HF repo / local directory at policy-init time. The directory must contain the
-    # diffusers-style ``vae/``, ``text_encoder/`` and ``tokenizer/`` sub-folders.
+    # ~20 GB of frozen weights, NOT bundled in the checkpoint; lazily pulled from this HF repo /
+    # local dir (must hold diffusers-style ``vae/``, ``text_encoder/``, ``tokenizer/`` sub-folders).
     wan_pretrained_path: str = "robbyant/lingbot-va-posttrain-libero-long"
-    # dtype used for the transformer / VAE / text-encoder weights at inference.
-    dtype: str = "bfloat16"  # one of "bfloat16", "float16", "float32"
-    # Device for the frozen UMT5-XXL text encoder. It encodes the (fixed) instruction once per
-    # episode, so keeping it on CPU frees ~11 GB of VRAM and lets the 5B transformer + VAE fit on
-    # a single 24-32 GB GPU. Set to "cuda" if you have the headroom and want faster prompt encoding.
+    dtype: str = "bfloat16"  # transformer / VAE / text-encoder dtype: "bfloat16", "float16", "float32"
+    # Frozen UMT5-XXL encoder device; "cpu" frees ~11 GB VRAM (it runs once per episode).
     text_encoder_device: str = "cpu"
 
-    # ── Observation cameras (order matters: latents are concatenated on width) ──
-    # Defaults match the LIBERO env feature keys (agentview -> image, eye-in-hand -> image2).
+    # ── Observation cameras (order matters: latents are concatenated on width; LIBERO defaults) ──
     obs_cam_keys: list[str] = field(
         default_factory=lambda: ["observation.images.image", "observation.images.image2"]
     )
-    # Horizontally flip the camera images before encoding. LeRobot's LIBERO env processor rotates
-    # frames 180° (flip H *and* W; the HuggingFaceVLA convention), but upstream LingBot-VA trains /
-    # evaluates on vertically-flipped-only frames (``obs[::-1]`` in evaluation/libero/client.py).
-    # Undoing the extra horizontal flip here realigns the input with the model's training orientation.
+    # If True, undo the LIBERO env processor's extra horizontal flip to match the training orientation.
     image_hflip: bool = False
-    # Latent assembly layout for the observation cameras:
-    #   "width_concat"    : encode every camera at (height, width) and concat latents on width (LIBERO).
-    #   "robotwin_tshape" : head camera at full (height, width), the two wrist cameras at half
-    #                       resolution, assembled in a "T" (wrists side-by-side on top of the head
-    #                       on the height axis) using a second streaming VAE (RoboTwin).
+    # Camera latent layout: "width_concat" (cameras concatenated on width; LIBERO) or
+    # "robotwin_tshape" (full-res head + half-res wrists in a "T"; RoboTwin).
     camera_layout: str = "width_concat"
 
     # ── Inference hyperparameters (LIBERO defaults) ──
@@ -101,19 +88,15 @@ class LingBotVAConfig(PreTrainedConfig):
     action_snr_shift: float = 0.05
     max_sequence_length: int = 512  # UMT5 prompt length
 
-    # Subset of the 30-d action space actually used by the benchmark (LIBERO = 7-DoF).
-    # The fixed action (un)normalization quantiles are not stored here nor hardcoded in the
-    # processor: they are serialized into the checkpoint's ``policy_postprocessor.json``
-    # (``LingBotVAActionUnnormalizeStep``) and restored on load by ``from_pretrained``.
+    # Subset of the 30-d action space used by the benchmark (LIBERO = 7-DoF). The action
+    # (un)normalization quantiles live in the checkpoint's ``policy_postprocessor.json``, not here.
     used_action_channel_ids: list[int] = field(default_factory=lambda: list(range(7)))
 
-    # Opt-in: VAE-decode the predicted video latents and stash them on
-    # ``self.last_predicted_frames`` so eval/train can save predicted-video MP4s.
+    # Opt-in: VAE-decode predicted video latents to ``self.last_predicted_frames`` for saving MP4s.
     save_predicted_video: bool = False
 
-    # ── Normalization (handled internally / via custom steps, hence IDENTITY here) ──
-    # Images are scaled to [-1, 1] and VAE-encoded inside the policy; actions are
-    # quantile-(un)normalized by dedicated processor steps using the fixed quantiles above.
+    # ── Normalization: IDENTITY here; images are scaled + VAE-encoded and actions are
+    # quantile-(un)normalized inside the policy / dedicated processor steps. ──
     normalization_mapping: dict[str, NormalizationMode] = field(
         default_factory=lambda: {
             "VISUAL": NormalizationMode.IDENTITY,
diff --git a/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py b/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py
index bf66f608f..dae563f91 100644
--- a/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py
+++ b/src/lerobot/policies/lingbot_va/modeling_lingbot_va.py
@@ -1191,9 +1191,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
         self.last_predicted_frames: Tensor | None = None
         self.reset()
 
-    # ------------------------------------------------------------------
     # Frozen-module lazy loading (VAE + UMT5 + tokenizer)
-    # ------------------------------------------------------------------
     def _ensure_frozen_modules(self):
         if self._frozen:
             return
@@ -1234,9 +1232,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
     def _streaming_vae(self):
         return self._frozen["streaming_vae"]
 
-    # ------------------------------------------------------------------
     # PreTrainedPolicy API
-    # ------------------------------------------------------------------
     def get_optim_params(self) -> dict:
         # Only the transformer is trainable; the VAE / text encoder stay frozen (kept outside the
         # nn.Module registry). With PEFT/LoRA this naturally returns just the adapter params.
@@ -1283,9 +1279,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
             if "streaming_vae_half" in self._frozen:
                 self._frozen["streaming_vae_half"].clear_cache()
 
-    # ------------------------------------------------------------------
     # Training (flow-matching dual-stream loss). Requires attn_mode="flex".
-    # ------------------------------------------------------------------
     def _ensure_train_schedulers(self):
         if getattr(self, "_train_sched_latent", None) is None:
             cfg = self.config
@@ -1565,9 +1559,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
         a = a.transpose(1, 2).contiguous()  # [B, n_steps, n_used]
         return a.to(torch.float32)
 
-    # ------------------------------------------------------------------
     # Prompt / text encoding
-    # ------------------------------------------------------------------
     def _maybe_init_prompt(self, batch):
         if self._prompt_embeds is not None or batch is None:
             return
@@ -1616,9 +1608,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
             negative_prompt_embeds = self._get_t5_prompt_embeds("", max_len)
         return prompt_embeds, negative_prompt_embeds
 
-    # ------------------------------------------------------------------
     # Observation (image) encoding -> normalized video latents
-    # ------------------------------------------------------------------
     def _extract_raw_obs(self, batch) -> dict[str, Tensor]:
         """Snapshot the configured camera images from a batch (kept raw for later VAE encoding)."""
         return {k: batch[k].detach() for k in self.config.obs_cam_keys}
@@ -1696,9 +1686,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
         video_latent = self._normalize_vae_latent(enc_out)
         return video_latent.to(self.config.device)
 
-    # ------------------------------------------------------------------
     # KV cache management
-    # ------------------------------------------------------------------
     @property
     def _latent_hw(self):
         if self.config.camera_layout == "robotwin_tshape":
@@ -1802,9 +1790,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
         mask[self.config.used_action_channel_ids] = True
         return mask
 
-    # ------------------------------------------------------------------
     # Action conditioning (executed action history) (de)normalization
-    # ------------------------------------------------------------------
     def _preprocess_action_state(self, action_norm: Tensor) -> Tensor:
         """Build the action-conditioning tensor from the already-normalized executed actions.
 
@@ -1845,9 +1831,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
             )
         self._frame_st_id += latent_model_input.shape[2]
 
-    # ------------------------------------------------------------------
     # The core dual-stream denoising loop (one chunk)
-    # ------------------------------------------------------------------
     @torch.no_grad()
     def _infer(self, init_latent, frame_st_id=0):
         cfg = self.config
@@ -1937,9 +1921,7 @@ class LingBotVAPolicy(PreTrainedPolicy):
         actions[:, ~self._action_mask] *= 0
         return actions, latents
 
-    # ------------------------------------------------------------------
     # Predicted-video decoding (opt-in)
-    # ------------------------------------------------------------------
     @torch.no_grad()
     def _decode_predicted_video(self, latents) -> Tensor:
         """VAE-decode predicted latents into a uint8 frame stack ``[T, H, W, 3]`` on CPU."""
diff --git a/src/lerobot/policies/lingbot_va/processor_lingbot_va.py b/src/lerobot/policies/lingbot_va/processor_lingbot_va.py
index 2e616f702..192abf7fd 100644
--- a/src/lerobot/policies/lingbot_va/processor_lingbot_va.py
+++ b/src/lerobot/policies/lingbot_va/processor_lingbot_va.py
@@ -47,14 +47,9 @@ from lerobot.utils.constants import (
 
 from .configuration_lingbot_va import LingBotVAConfig
 
-# LingBot-VA applies a *fixed* per-channel action quantile (un)normalization rather than
-# dataset-derived stats. The benchmark-specific quantiles (LIBERO 7-DoF, RoboTwin 16-d eef) are
-# deliberately NOT hardcoded here: they are serialized into each checkpoint's
-# ``policy_postprocessor.json`` (via ``LingBotVAActionUnnormalizeStep.get_config``) and restored on
-# load by ``PolicyProcessorPipeline.from_pretrained``. A freshly built (unconverted) policy defaults
-# to a neutral ``[-1, 1]`` mapping (identity rescale); the real stats always come from the checkpoint
-# (or via ``postprocessor_overrides``). To regenerate a checkpoint from scratch, source the quantiles
-# from the upstream ``wan_va/configs/va_{libero,robotwin}_cfg.py`` and pass them through.
+# LingBot-VA uses fixed per-channel action quantile (un)normalization. The benchmark quantiles are
+# NOT hardcoded here: they live in each checkpoint's ``policy_postprocessor.json`` and are restored on
+# load. A fresh (unconverted) policy defaults to a neutral ``[-1, 1]`` mapping (identity rescale).
 
 
 @dataclass