feat(pi052): FAST action CE loss + knowledge insulation + processor wiring

Three features ported from ``pi05_full`` on branch ``feat/add-pi05``
(plus the config flags that gate them), giving pi052 the paper's
§III.B-C training capabilities alongside the recipe-driven text
supervision it already had:

* **Config flags** in PI052Config:
    - ``enable_fast_action_loss``  default False
    - ``action_tokenizer_name``    default "physical-intelligence/fast"
    - ``max_action_tokens``        default 256
    - ``fast_skip_tokens``         default 128
    - ``fast_action_loss_weight``  default 1.0
    - ``knowledge_insulation``     default False

* **Processor wiring** (processor_pi052.py): when
  ``enable_fast_action_loss=True``, append an
  ``ActionTokenizerProcessorStep`` after the text tokenizer. It
  tokenises the action tensor with the FAST tokenizer and writes
  ACTION_TOKENS / ACTION_TOKEN_MASK into ``COMPLEMENTARY_DATA`` —
  the existing batch-collation pipeline forwards them as
  ``batch['action.tokens']`` / ``batch['action.token_mask']``.
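  For reference, a toy view of the forwarded keys (shapes are
  illustrative, not normative):

     batch["action.tokens"]      # (B, max_action_tokens) int64 FAST ids
     batch["action.token_mask"]  # (B, max_action_tokens) bool, True at real ids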

* **FAST CE loss** (modeling_pi052.py::_compute_fast_action_loss):
  Re-embeds the prefix [images, language], appends the FAST token
  embeddings (using PaliGemma's shared embed_language_tokens),
  forwards through the backbone, slices the trailing
  ``fast_len`` positions, applies the LM head, computes shifted
  next-token CE with the action-mask gating the loss. The loss is
  summed into ``forward()``'s total with ``fast_action_loss_weight``.
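  In toy shapes (hypothetical vocab of 32 and chunk of 5 tokens — not
  the real sizes), the shifted, mask-gated CE reduces to:

     import torch
     import torch.nn.functional as F

     logits  = torch.randn(1, 5, 32)             # (B, fast_len, vocab)
     targets = torch.tensor([[4, 7, 9, 1, 0]])   # FAST ids, last is pad
     mask    = torch.tensor([[1., 1., 1., 1., 0.]])

     s_logits, s_targets, s_mask = logits[:, :-1], targets[:, 1:], mask[:, 1:]
     ce = F.cross_entropy(
         s_logits.reshape(-1, 32), s_targets.reshape(-1), reduction="none"
     ).view(s_targets.shape)
     loss = (ce * s_mask).sum() / s_mask.sum().clamp_min(1.0)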

* **Knowledge insulation** (modeling_pi052.py::_compute_layer_ki +
  _paligemma_forward_ki): port of pi05_full's per-layer attention
  that detaches VLM K/V on the action-query path so action loss
  gradients cannot flow back into the VLM's K/V projections. Bound
  per-instance via ``types.MethodType`` so it doesn't leak into
  stock ``pi05`` policies that share PaliGemmaWithExpertModel.
  Activated automatically when ``config.knowledge_insulation=True``.
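  A standalone toy demo (not pi052 code) of why a per-instance bind
  cannot leak to other instances of the shared class:

     import types

     class Expert:
         def forward(self):
             return "stock"

     a, b = Expert(), Expert()
     a._orig_forward = a.forward                  # keep the original
     a.forward = types.MethodType(lambda self: "insulated", a)
     assert a.forward() == "insulated"
     assert b.forward() == "stock"                # class method untouched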

Combined with the existing recipe-driven text head, pi052 now
supports the full three-loss objective:

   L = text_w·H(text) + fast_w·H(FAST actions) + flow_w·MSE(flow)

matching Eq. (1) of arXiv:2504.16054 §IV.D (α = 10 by default for the
flow term, 1.0 each for the text and FAST CE terms).
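
For example, a pre-training-style run enables all three terms (a
sketch using only the PI052Config fields introduced above):

   cfg = PI052Config(
       enable_fast_action_loss=True,   # adds fast_w·H(FAST actions)
       fast_action_loss_weight=1.0,
       knowledge_insulation=True,      # blocks action→VLM K/V grads
   )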

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
@@ -99,6 +99,50 @@ class PI052Config(PI05Config):
    memory_dropout_prob: float = 0.0
    subtask_dropout_prob: float = 0.0

    # FAST discrete-action supervision — paper §III.B-C ------------------
    # When enabled, actions are *also* tokenised via the FAST tokenizer
    # ("physical-intelligence/fast") and supervised with cross-entropy
    # on the PaliGemma LM head — exactly as in the paper's pre-training
    # objective (Eq. 1 mixes FAST CE + flow MSE + subtask CE). The
    # ActionTokenizerProcessorStep is wired into the preprocessor
    # pipeline when this flag is set; the loss is computed in
    # PI052Policy.forward.
    enable_fast_action_loss: bool = False
    """If True, tokenise actions with the FAST tokenizer and add a
    cross-entropy loss on the LM head. Off by default because most
    fine-tuning runs only need the flow head + text supervision; the
    FAST CE term is most useful when training from a base PaliGemma
    rather than an existing π0.5 checkpoint."""

    action_tokenizer_name: str = "physical-intelligence/fast"
    """HF identifier for the FAST action tokenizer."""

    max_action_tokens: int = 256
    """Maximum number of FAST tokens per action chunk."""

    fast_skip_tokens: int = 128
    """Number of low-vocab tokens the FAST tokenizer skips to avoid
    collisions with PaliGemma's text vocabulary."""

    fast_action_loss_weight: float = 1.0
    """Weight on the FAST-action-token CE loss. Paper §III.C uses 1.0."""

    # Knowledge insulation — paper §III.B --------------------------------
    # When enabled, gradients from the action expert's flow loss are
    # *blocked* from flowing back into the VLM's K/V projections. This
    # prevents the action loss from over-fitting the language backbone
    # to robot-specific features. Implemented as a custom per-layer
    # attention forward that uses ``.detach()`` on VLM K/V when
    # computing attention for action queries — ported from
    # ``pi05_full/modeling_pi05.py::compute_layer_complete_knowledge_insulation``
    # on branch ``feat/add-pi05``.
    knowledge_insulation: bool = False
    """If True, route every transformer layer through the KI
    attention path (blocks action→VLM gradient flow on K/V). The
    custom forward (``_paligemma_forward_ki``) is bound per-instance
    onto the backbone in ``PI052Policy.__init__`` when this is set."""

    def __post_init__(self) -> None:
        super().__post_init__()
        # Backbone needs gradients flowing through the text head when
@@ -39,10 +39,12 @@ from __future__ import annotations
import logging
import math
import types
from typing import Any
import torch
from torch import Tensor
from torch.nn import functional as F
from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
@@ -53,6 +55,190 @@ from .configuration_pi052 import PI052Config
logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------
# Knowledge insulation — ported from pi05_full (branch ``feat/add-pi05``)
# ----------------------------------------------------------------------
#
# Per-layer attention that splits the queries into VLM and action
# parts, computing attention for action queries with .detach()'d VLM
# K/V so the action loss's gradient cannot flow back into the VLM's K
# and V projections. Forward output is bit-equivalent to the standard
# layer; backward differs only on the path action_loss → VLM K/V.
def _compute_layer_ki(
    layer_idx,
    inputs_embeds,
    attention_mask,
    position_ids,
    adarms_cond,
    paligemma,
    gemma_expert,
):
    from transformers.models.gemma import modeling_gemma  # noqa: PLC0415

    models = [paligemma.language_model, gemma_expert.model]
    query_states, key_states, value_states, gates = [], [], [], []
    vlm_len = inputs_embeds[0].shape[1]

    # Project Q/K/V for both experts (VLM first, then action expert).
    for i, hidden_states in enumerate(inputs_embeds):
        layer = models[i].layers[layer_idx]
        hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i])
        gates.append(gate)
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
        q = layer.self_attn.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        k = layer.self_attn.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        v = layer.self_attn.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        query_states.append(q)
        key_states.append(k)
        value_states.append(v)

    query_states = torch.cat(query_states, dim=2)
    key_states = torch.cat(key_states, dim=2)
    value_states = torch.cat(value_states, dim=2)

    dummy = torch.zeros(
        query_states.shape[0],
        query_states.shape[2],
        query_states.shape[-1],
        device=query_states.device,
        dtype=query_states.dtype,
    )
    cos, sin = paligemma.model.language_model.rotary_emb(dummy, position_ids)
    query_states, key_states = modeling_gemma.apply_rotary_pos_emb(
        query_states, key_states, cos, sin, unsqueeze_dim=1
    )

    batch_size = query_states.shape[0]
    scaling = paligemma.language_model.layers[layer_idx].self_attn.scaling

    # Split queries / K / V at the VLM-vs-action boundary.
    Q_vlm = query_states[:, :, :vlm_len, :]
    Q_action = query_states[:, :, vlm_len:, :]
    K_vlm = key_states[:, :, :vlm_len, :]
    K_action = key_states[:, :, vlm_len:, :]
    V_vlm = value_states[:, :, :vlm_len, :]
    V_action = value_states[:, :, vlm_len:, :]

    # Detach VLM K/V *only* on the path the action queries use.
    K_vlm_det = K_vlm.detach()
    V_vlm_det = V_vlm.detach()
    K_for_vlm = key_states  # full (gradients flow)
    V_for_vlm = value_states
    K_for_action = torch.cat([K_vlm_det, K_action], dim=2)
    V_for_action = torch.cat([V_vlm_det, V_action], dim=2)

    mask_for_vlm = attention_mask[:, :, :vlm_len, :]
    mask_for_action = attention_mask[:, :, vlm_len:, :]

    att_vlm, _ = modeling_gemma.eager_attention_forward(
        paligemma.language_model.layers[layer_idx].self_attn,
        Q_vlm, K_for_vlm, V_for_vlm, mask_for_vlm, scaling,
    )
    att_action, _ = modeling_gemma.eager_attention_forward(
        paligemma.language_model.layers[layer_idx].self_attn,
        Q_action, K_for_action, V_for_action, mask_for_action, scaling,
    )
    att = torch.cat([att_vlm, att_action], dim=1)  # concat along sequence dim

    head_dim = paligemma.language_model.layers[layer_idx].self_attn.head_dim
    att = att.reshape(batch_size, -1, 1 * 8 * head_dim)  # 8 = attention heads

    # o_proj + gated residual + MLP, per expert, on its slice of ``att``.
    outputs_embeds = []
    start = 0
    for i, hidden_states in enumerate(inputs_embeds):
        layer = models[i].layers[layer_idx]
        end = start + hidden_states.shape[1]
        if att.dtype != layer.self_attn.o_proj.weight.dtype:
            att = att.to(layer.self_attn.o_proj.weight.dtype)
        out_emb = layer.self_attn.o_proj(att[:, start:end])
        out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i])  # noqa: SLF001
        after_first = out_emb.clone()
        out_emb, gate = layer.post_attention_layernorm(out_emb.clone(), cond=adarms_cond[i])
        if layer.mlp.up_proj.weight.dtype == torch.bfloat16:
            out_emb = out_emb.to(dtype=torch.bfloat16)
        out_emb = layer.mlp(out_emb)
        out_emb = modeling_gemma._gated_residual(after_first, out_emb, gate)  # noqa: SLF001
        outputs_embeds.append(out_emb)
        start = end
    return outputs_embeds
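
# A minimal standalone illustration (not part of this module) of the
# stop-gradient mechanism above — hypothetical toy shapes; action-side
# queries attend to detached VLM K/V, so no gradients reach K_vlm / V_vlm:
#
#     import torch
#     B, H, Tv, Ta, D = 1, 2, 3, 2, 4
#     K_vlm = torch.randn(B, H, Tv, D, requires_grad=True)
#     V_vlm = torch.randn(B, H, Tv, D, requires_grad=True)
#     Q_act = torch.randn(B, H, Ta, D, requires_grad=True)
#     att = torch.softmax(Q_act @ K_vlm.detach().transpose(-1, -2) / D**0.5, dim=-1)
#     (att @ V_vlm.detach()).sum().backward()
#     assert K_vlm.grad is None and V_vlm.grad is None  # insulated
#     assert Q_act.grad is not None                     # action path still learns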
def _paligemma_forward_ki(
    self,
    attention_mask=None,
    position_ids=None,
    past_key_values=None,
    inputs_embeds=None,
    use_cache=None,
    adarms_cond=None,
    fill_kv_cache=None,
):
    """Replacement ``PaliGemmaWithExpertModel.forward`` that routes the
    dual-expert layer pass through :func:`_compute_layer_ki`.

    Bound onto the model instance when ``config.knowledge_insulation``
    is True (see ``PI052Policy.__init__``). Single-expert branches
    (VLM-only or action-only) reuse the original forward saved at bind
    time, because there's no KI signal to add — KI only matters when
    actions and VLM tokens are forwarded together.
    """
    from ..pi05.modeling_pi05 import layernorm_forward  # noqa: PLC0415

    if adarms_cond is None:
        adarms_cond = [None, None]
    # Single-expert paths: defer to the original bound forward, which
    # ``PI052Policy.__init__`` saves as ``_pi052_orig_forward`` before
    # rebinding (so it is always present here).
    if inputs_embeds[0] is None or inputs_embeds[1] is None:
        return self._pi052_orig_forward(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            adarms_cond=adarms_cond,
            fill_kv_cache=fill_kv_cache,
        )
    models = [self.paligemma.model.language_model, self.gemma_expert.model]
    num_layers = self.paligemma.config.text_config.num_hidden_layers
    use_gc = (
        hasattr(self.gemma_expert.model, "gradient_checkpointing")
        and self.gemma_expert.model.gradient_checkpointing
        and self.training
    ) or (
        hasattr(self, "gradient_checkpointing") and self.gradient_checkpointing and self.training
    )

    for layer_idx in range(num_layers):
        if use_gc:
            inputs_embeds = torch.utils.checkpoint.checkpoint(
                _compute_layer_ki,
                layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond,
                use_reentrant=False, preserve_rng_state=False,
                paligemma=self.paligemma, gemma_expert=self.gemma_expert,
            )
        else:
            inputs_embeds = _compute_layer_ki(
                layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond,
                paligemma=self.paligemma, gemma_expert=self.gemma_expert,
            )

    # Final per-expert norm.
    outputs_embeds = []
    for i, hidden_states in enumerate(inputs_embeds):
        out_emb, _ = layernorm_forward(models[i].norm, hidden_states, adarms_cond[i])
        outputs_embeds.append(out_emb)
    return [outputs_embeds[0], outputs_embeds[1]], None
class PI052Policy(PI05Policy):
    """π0.5 with the PaliGemma LM head re-enabled."""
@@ -68,6 +254,20 @@ class PI052Policy(PI05Policy):
        if config.text_loss_weight > 0 and config.unfreeze_lm_head:
            self._unfreeze_lm_head()

        # Knowledge insulation: bind a custom ``forward`` on the
        # PaliGemmaWithExpertModel instance that uses
        # :func:`_compute_layer_ki` for the dual-expert layer pass.
        # The bind is per-instance, so this doesn't leak into stock
        # ``pi05`` policies that share the same class.
        if getattr(config, "knowledge_insulation", False):
            backbone = self.model.paligemma_with_expert
            backbone._pi052_orig_forward = backbone.forward
            backbone.forward = types.MethodType(_paligemma_forward_ki, backbone)
            logger.info(
                "PI052: knowledge insulation enabled — action→VLM K/V "
                "gradients are blocked in attention."
            )

    # ------------------------------------------------------------------
    # Head unfreeze helper
    # ------------------------------------------------------------------
@@ -143,6 +343,22 @@ class PI052Policy(PI05Policy):
            else total + self.config.text_loss_weight * text_loss
        )

        # FAST action-token CE loss (paper §III.C). When
        # ``enable_fast_action_loss=True`` the preprocessor wrote
        # ACTION_TOKENS / ACTION_TOKEN_MASK into the batch — we
        # forward them through the PaliGemma backbone alongside the
        # language prefix and compute CE on the action positions.
        if getattr(self.config, "enable_fast_action_loss", False):
            from lerobot.utils.constants import ACTION_TOKEN_MASK, ACTION_TOKENS  # noqa: PLC0415

            action_tokens = batch.get(ACTION_TOKENS)
            action_mask = batch.get(ACTION_TOKEN_MASK)
            if action_tokens is not None and action_mask is not None:
                fast_loss = self._compute_fast_action_loss(batch, action_tokens, action_mask)
                loss_dict["fast_action_loss"] = float(fast_loss.detach().item())
                weighted = self.config.fast_action_loss_weight * fast_loss
                total = weighted if total is None else total + weighted

        if total is None:
            # All loss terms (flow, text, FAST) disabled — make this an
            # obvious bug rather than a silent zero loss.
@@ -161,6 +377,82 @@ class PI052Policy(PI05Policy):
    # Text loss
    # ------------------------------------------------------------------
    def _compute_fast_action_loss(
        self,
        batch: dict[str, Tensor],
        action_tokens: Tensor,
        action_mask: Tensor,
    ) -> Tensor:
        """Cross-entropy on FAST-tokenised actions via PaliGemma's LM head.

        Mirrors the paper's §III.C action-token loss: append the FAST
        tokens to the language prefix, forward through the backbone,
        slice the per-token logits at the action positions, and
        compute CE against the FAST targets (shifted for next-token
        prediction). The token mask gates the loss to valid positions.
        """
        from ..pi05.modeling_pi05 import make_att_2d_masks  # noqa: PLC0415

        images, img_masks = self.model._preprocess_images(batch)
        lang_tokens = batch[OBS_LANGUAGE_TOKENS]
        lang_masks = batch[OBS_LANGUAGE_ATTENTION_MASK]

        # Re-embed prefix to get [images, language] embeddings, then
        # append the FAST action token embeddings as additional
        # prefix tokens. PaliGemma's embed_language_tokens is shared
        # between text and FAST tokens so we can re-use it directly.
        prefix_embs, prefix_pad, prefix_att = self.model.embed_prefix(
            images, img_masks, lang_tokens, lang_masks
        )
        emb_dim = prefix_embs.shape[-1]
        fast_emb = self.model.paligemma_with_expert.embed_language_tokens(action_tokens)
        fast_emb = fast_emb * math.sqrt(emb_dim)  # Gemma embedding scaling

        # Concat onto the prefix. Pad masks: language uses
        # ``lang_masks``; FAST uses ``action_mask`` (True at valid
        # token positions). Attention masks add ``True`` (causal)
        # for FAST so they can attend to the bidirectional prefix
        # but only causally among themselves.
        bsize, fast_len = action_tokens.shape
        device = prefix_embs.device
        ones_att = torch.ones((bsize, fast_len), dtype=torch.bool, device=device)
        full_embs = torch.cat([prefix_embs, fast_emb], dim=1)
        full_pad = torch.cat([prefix_pad, action_mask.to(prefix_pad.dtype)], dim=1)
        full_att = torch.cat([prefix_att, ones_att], dim=1)
        att_2d = make_att_2d_masks(full_pad, full_att)
        position_ids = torch.cumsum(full_pad, dim=1) - 1

        (vlm_out, _), _ = self.model.paligemma_with_expert.forward(
            attention_mask=att_2d,
            position_ids=position_ids,
            past_key_values=None,
            inputs_embeds=[full_embs, None],
            use_cache=False,
            fill_kv_cache=True,
        )
        if vlm_out is None:
            raise RuntimeError("PI052 FAST loss: VLM forward returned no hidden states.")

        # Slice the last ``fast_len`` positions — those correspond to
        # the FAST tokens we just appended.
        fast_hidden = vlm_out[:, -fast_len:, :]
        lm_head = self.model.paligemma_with_expert.paligemma.lm_head
        fast_logits = lm_head(fast_hidden.to(lm_head.weight.dtype))

        # Shift for next-token prediction (the first FAST token is not
        # supervised; its prediction would come from the last prefix
        # position, which is dropped here).
        shift_logits = fast_logits[:, :-1, :].contiguous()
        shift_targets = action_tokens[:, 1:].contiguous()
        shift_mask = action_mask[:, 1:].contiguous().float()
        per_tok = F.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_targets.view(-1),
            reduction="none",
        ).view(shift_targets.shape)
        loss = (per_tok * shift_mask).sum() / shift_mask.sum().clamp_min(1.0)
        return loss
    def _compute_text_loss(self, batch: dict[str, Tensor], text_labels: Tensor) -> Tensor:
        """Cross-entropy on PaliGemma's LM head over the supervised span.
@@ -43,6 +43,7 @@ import torch
from lerobot.configs.recipe import TrainingRecipe
from lerobot.processor import (
    AbsoluteActionsProcessorStep,
    ActionTokenizerProcessorStep,
    AddBatchDimensionProcessorStep,
    DeviceProcessorStep,
    NormalizerProcessorStep,
@@ -101,9 +102,25 @@ def make_pi052_pre_post_processors(
            memory_dropout_prob=getattr(config, "memory_dropout_prob", 0.0),
            subtask_dropout_prob=getattr(config, "subtask_dropout_prob", 0.0),
        ),
    ]

    # FAST tokenizer for discrete-action CE supervision (paper §III.C).
    # Only inserted when explicitly enabled — keeps the post-training-
    # style recipe (flow + text) as the default. When on, the step
    # writes ACTION_TOKENS / ACTION_TOKEN_MASK into
    # ``COMPLEMENTARY_DATA`` and the modeling forward picks them up.
    # DeviceProcessorStep is appended last so it stays the final step.
    if getattr(config, "enable_fast_action_loss", False):
        input_steps.append(
            ActionTokenizerProcessorStep(
                action_tokenizer_name=config.action_tokenizer_name,
                max_action_tokens=config.max_action_tokens,
                fast_skip_tokens=config.fast_skip_tokens,
                paligemma_tokenizer_name="google/paligemma-3b-pt-224",
            )
        )
    input_steps.append(DeviceProcessorStep(device=config.device))
    output_steps = [
        UnnormalizerProcessorStep(
            features=config.output_features,