From 88519cb14cdb4c6df0a08526e1a43189dfd4d461 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 19 May 2026 23:02:20 +0200
Subject: [PATCH] fix(pi052): quantile-normalize actions before FAST tokenizer
 fit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

base.fit() rejected the data with "Vocab size 1024 is too small for
the range of tokens 9339": the FAST tokenizer was fit on raw
motor-unit actions, whose DCT-token range vastly exceeds the 1024
codebook.

Two problems, one fix. (1) Raw actions blow up the token range. (2) At
training time ActionTokenizerProcessorStep runs after the QUANTILES
NormalizerProcessorStep, so it encodes normalized actions — fitting on
raw actions mismatches that space. Replicate QUANTILES normalization
(per-dim [q01,q99] -> [-1,1], clipped) before base.fit() so the fit and
the training-time encode see the same distribution and the token range
fits the codebook.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../policies/pi052/fit_fast_tokenizer.py      | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/policies/pi052/fit_fast_tokenizer.py b/src/lerobot/policies/pi052/fit_fast_tokenizer.py
index 14e4217ca..513553c00 100644
--- a/src/lerobot/policies/pi052/fit_fast_tokenizer.py
+++ b/src/lerobot/policies/pi052/fit_fast_tokenizer.py
@@ -189,12 +189,30 @@ def fit_fast_tokenizer(
             "lengths."
         )
 
-    actions = np.stack(actions_buf, axis=0)  # (N, H, D)
+    actions = np.stack(actions_buf, axis=0).astype(np.float32)  # (N, H, D)
     logger.info(
         "FAST fit: collected %d chunks of shape %s from %d episodes",
         actions.shape[0], actions.shape[1:], eps_visited,
     )
 
+    # Quantile-normalize per dimension before fitting.
+    #
+    # The FAST tokenizer DCT-transforms actions, scales by ``scale`` and
+    # rounds to integer tokens; the integer *range* must fit the
+    # codebook (vocab_size, default 1024). Raw motor units (e.g. encoder
+    # ticks) blow that range up — hence "Vocab size 1024 is too small".
+    # More importantly, at training time ``ActionTokenizerProcessorStep``
+    # runs *after* the QUANTILES ``NormalizerProcessorStep``, so it
+    # encodes normalized actions; fitting on raw actions would mismatch
+    # that space. We replicate QUANTILES normalization here (per-dim
+    # [q01, q99] → [-1, 1], clipped; quantiles estimated from the chunks
+    # collected above) so the fit and the training-time encode match.
+    flat = actions.reshape(-1, actions.shape[-1])
+    q01 = np.quantile(flat, 0.01, axis=0)
+    q99 = np.quantile(flat, 0.99, axis=0)
+    span = np.where((q99 - q01) > 1e-6, q99 - q01, 1.0)
+    actions = np.clip((actions - q01) / span * 2.0 - 1.0, -1.0, 1.0).astype(np.float32)
+
     base = AutoProcessor.from_pretrained(base_tokenizer_name, trust_remote_code=True)
     if not hasattr(base, "fit"):
         raise ImportError(