mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 12:09:42 +00:00
fix(pi052): quantile-normalize actions before FAST tokenizer fit
base.fit() rejected the data with "Vocab size 1024 is too small for the range of tokens 9339": the FAST tokenizer was being fit on raw motor-unit actions, whose DCT-token range vastly exceeds the 1024-entry codebook. There are two problems, and one fix addresses both. (1) Raw actions blow up the token range. (2) At training time ActionTokenizerProcessorStep runs after the QUANTILES NormalizerProcessorStep, so it encodes normalized actions — a tokenizer fit on raw actions does not match the space it later encodes. Fix: replicate QUANTILES normalization (per-dim [q01, q99] -> [-1, 1], clipped) before base.fit(), so that the fit and the training-time encode see the same distribution and the token range fits the codebook. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -189,12 +189,30 @@ def fit_fast_tokenizer(
|
|||||||
"lengths."
|
"lengths."
|
||||||
)
|
)
|
||||||
|
|
||||||
actions = np.stack(actions_buf, axis=0) # (N, H, D)
|
actions = np.stack(actions_buf, axis=0).astype(np.float32) # (N, H, D)
|
||||||
logger.info(
|
logger.info(
|
||||||
"FAST fit: collected %d chunks of shape %s from %d episodes",
|
"FAST fit: collected %d chunks of shape %s from %d episodes",
|
||||||
actions.shape[0], actions.shape[1:], eps_visited,
|
actions.shape[0], actions.shape[1:], eps_visited,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Quantile-normalise per dimension before fitting.
|
||||||
|
#
|
||||||
|
# The FAST tokenizer DCT-transforms actions, scales by ``scale`` and
|
||||||
|
# rounds to integer tokens; the integer *range* must fit the
|
||||||
|
# codebook (vocab_size, default 1024). Raw motor units (e.g. encoder
|
||||||
|
# ticks) blow that range up — hence "Vocab size 1024 is too small".
|
||||||
|
# More importantly, at training time ``ActionTokenizerProcessorStep``
|
||||||
|
# runs *after* the QUANTILES ``NormalizerProcessorStep``, so it
|
||||||
|
# encodes normalised actions. Fitting on raw actions would mismatch
|
||||||
|
# that space. We replicate QUANTILES normalisation here (per-dim
|
||||||
|
# [q01, q99] → [-1, 1], clipped) so the fit and the training-time
|
||||||
|
# encode see the same distribution.
|
||||||
|
flat = actions.reshape(-1, actions.shape[-1])
|
||||||
|
q01 = np.quantile(flat, 0.01, axis=0)
|
||||||
|
q99 = np.quantile(flat, 0.99, axis=0)
|
||||||
|
span = np.where((q99 - q01) > 1e-6, q99 - q01, 1.0)
|
||||||
|
actions = np.clip((actions - q01) / span * 2.0 - 1.0, -1.0, 1.0).astype(np.float32)
|
||||||
|
|
||||||
base = AutoProcessor.from_pretrained(base_tokenizer_name, trust_remote_code=True)
|
base = AutoProcessor.from_pretrained(base_tokenizer_name, trust_remote_code=True)
|
||||||
if not hasattr(base, "fit"):
|
if not hasattr(base, "fit"):
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
|
|||||||
Reference in New Issue
Block a user