From 88519cb14cdb4c6df0a08526e1a43189dfd4d461 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 19 May 2026 23:02:20 +0200
Subject: [PATCH] fix(pi052): quantile-normalize actions before FAST tokenizer fit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

base.fit() rejected the data with "Vocab size 1024 is too small for the
range of tokens 9339": the FAST tokenizer was fit on raw motor-unit
actions, whose DCT-token range vastly exceeds the 1024 codebook.

Two problems, one fix. (1) Raw actions blow up the token range. (2) At
training time ActionTokenizerProcessorStep runs after the QUANTILES
NormalizerProcessorStep, so it encodes normalized actions — fitting on
raw actions mismatches that space.

Replicate QUANTILES normalization (per-dim [q01,q99] -> [-1,1], clipped)
before base.fit() so the fit and the training-time encode see the same
distribution and the token range fits the codebook.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../policies/pi052/fit_fast_tokenizer.py | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/policies/pi052/fit_fast_tokenizer.py b/src/lerobot/policies/pi052/fit_fast_tokenizer.py
index 14e4217ca..513553c00 100644
--- a/src/lerobot/policies/pi052/fit_fast_tokenizer.py
+++ b/src/lerobot/policies/pi052/fit_fast_tokenizer.py
@@ -189,12 +189,30 @@ def fit_fast_tokenizer(
             "lengths."
         )
 
-    actions = np.stack(actions_buf, axis=0)  # (N, H, D)
+    actions = np.stack(actions_buf, axis=0).astype(np.float32)  # (N, H, D)
     logger.info(
         "FAST fit: collected %d chunks of shape %s from %d episodes",
         actions.shape[0], actions.shape[1:], eps_visited,
     )
 
+    # Quantile-normalise per dimension before fitting.
+    #
+    # The FAST tokenizer DCT-transforms actions, scales by ``scale`` and
+    # rounds to integer tokens; the integer *range* must fit the
+    # codebook (vocab_size, default 1024). Raw motor units (e.g. encoder
+    # ticks) blow that range up — hence "Vocab size 1024 is too small".
+    # More importantly, at training time ``ActionTokenizerProcessorStep``
+    # runs *after* the QUANTILES ``NormalizerProcessorStep``, so it
+    # encodes normalised actions. Fitting on raw actions would mismatch
+    # that space. We replicate QUANTILES normalisation here (per-dim
+    # [q01, q99] → [-1, 1], clipped) so the fit and the training-time
+    # encode see the same distribution.
+    flat = actions.reshape(-1, actions.shape[-1])
+    q01 = np.quantile(flat, 0.01, axis=0)
+    q99 = np.quantile(flat, 0.99, axis=0)
+    span = np.where((q99 - q01) > 1e-6, q99 - q01, 1.0)
+    actions = np.clip((actions - q01) / span * 2.0 - 1.0, -1.0, 1.0).astype(np.float32)
+
     base = AutoProcessor.from_pretrained(base_tokenizer_name, trust_remote_code=True)
     if not hasattr(base, "fit"):
         raise ImportError(