add more changes

This commit is contained in:
Jade Choghari
2025-12-17 18:03:09 +00:00
parent b229e7df28
commit 18ddc67714
20 changed files with 5417 additions and 13 deletions
+11 -1
@@ -1402,6 +1402,13 @@ def main():
action="store_true",
help="Push modified dataset to HuggingFace Hub",
)
# add image key
parser.add_argument(
"--image-key",
type=str,
default=None,
help="Image observation key to use for image mode (default: None)",
)
args = parser.parse_args()
console = Console()
@@ -1443,7 +1450,10 @@ def main():
)
# Get image keys (for image mode)
image_keys = dataset.meta.camera_keys[:args.num_image_views_per_sample]
if args.image_key:
image_keys = [args.image_key]
else:
image_keys = dataset.meta.camera_keys[:args.num_image_views_per_sample]
if not args.video_mode:
console.print(f"[cyan]Using image keys: {image_keys}[/cyan]")
+7 -6
@@ -1,10 +1,11 @@
python examples/dataset/annotate.py \
--repo-id jadechoghari/collect-data \
--video-key observation.images.base \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--episodes 16 22
# python examples/dataset/annotate.py \
# --repo-id lerobot/svla_so101_pickplace \
# --video-key observation.images.side \
# --model Qwen/Qwen3-VL-30B-A3B-Instruct \
python examples/dataset/annotate.py \
--repo-id lerobot/svla_so101_pickplace \
--video-key observation.images.side \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--episodes 5
# --episodes 5
+3 -2
@@ -4,12 +4,12 @@
# This generates user prompts and robot utterances for hierarchical policy training
# Configuration
REPO_ID="lerobot/svla_so101_pickplace"
REPO_ID="jadechoghari/collect-data"
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
# Alternative: MODEL="Qwen/Qwen2-VL-7B-Instruct"
OUTPUT_DIR="/fsx/jade_choghari/outputs/pgen_annotations1"
OUTPUT_DIR="/fsx/jade_choghari/outputs/collect-data-pgen"
BATCH_SIZE=32
TEMPERATURE=0.9
SAMPLE_INTERVAL=5.0 # Generate dialogue every 5 seconds (all episodes processed)
@@ -22,6 +22,7 @@ python examples/dataset/annotate_pgen.py \
--temperature "$TEMPERATURE" \
--batch-size "$BATCH_SIZE" \
--sample-interval "$SAMPLE_INTERVAL" \
--image-key observation.images.base \
--num-image-views-per-sample 1
# For faster testing, increase sample interval:
+26
@@ -0,0 +1,26 @@
{
"repo_id": "local",
"vocab_size": 1024,
"scale": 10.0,
"encoded_dims": "0:15",
"encoded_dim_ranges": [
[
0,
15
]
],
"total_encoded_dims": 15,
"delta_dims": null,
"delta_dim_list": null,
"use_delta_transform": false,
"state_key": "observation.state",
"action_horizon": 50,
"num_training_chunks": 4900,
"compression_stats": {
"compression_ratio": 15.85791309863622,
"mean_token_length": 47.295,
"p99_token_length": 90.0,
"min_token_length": 9.0,
"max_token_length": 109.0
}
}
@@ -0,0 +1,158 @@
import logging
from typing import ClassVar
import numpy as np
from scipy.fft import dct
from scipy.fft import idct
from tokenizers import ByteLevelBPETokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast
from transformers.processing_utils import ProcessorMixin
class UniversalActionProcessor(ProcessorMixin):
attributes: ClassVar[list[str]] = ["bpe_tokenizer"]
bpe_tokenizer_class: str = "AutoTokenizer"
def __init__(
self,
bpe_tokenizer: PreTrainedTokenizerFast,
scale: float = 10,
vocab_size: int = 1024,
min_token: int = 0,
*,
action_dim: int | None = None,
time_horizon: int | None = None,
):
self.scale = scale
self.vocab_size = vocab_size
self.min_token = min_token
# Action horizon and dimension needed during decoding. These can be specified
# in three ways (in order of priority):
# 1. passed in as kwargs to decode()
# 2. in the constructor
# 3. cached from the last time decode() was called
self.time_horizon = time_horizon
self.action_dim = action_dim
self.called_time_horizon = time_horizon
self.called_action_dim = action_dim
super().__init__(bpe_tokenizer)
def __call__(self, action_chunk: np.array) -> np.array:
assert action_chunk.ndim <= 3, "Only 3 dimensions supported: [batch, timesteps, action_dim]"
if action_chunk.ndim == 2:
action_chunk = action_chunk[None, ...]
# Cache the time horizon and action dimension for decoding
self.called_time_horizon = action_chunk.shape[-2]
self.called_action_dim = action_chunk.shape[-1]
dct_coeff = dct(action_chunk, axis=1, norm="ortho")
dct_coeff = np.around(dct_coeff * self.scale)
tokens = []
for elem in dct_coeff:
token_str = "".join(map(chr, np.maximum(elem.flatten() - self.min_token, 0).astype(int)))
tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
return tokens
def decode(
self,
tokens: list[list[int]],
*,
time_horizon: int | None = None,
action_dim: int | None = None,
) -> np.array:
self.time_horizon = time_horizon or self.time_horizon or self.called_time_horizon
self.action_dim = action_dim or self.action_dim or self.called_action_dim
# Cache the time horizon and action dimension for the next call
self.called_time_horizon = self.time_horizon
self.called_action_dim = self.action_dim
assert (
self.time_horizon is not None and self.action_dim is not None
), "Tokenizer not initialized, call encode() once or pass in time_horizon and action_dim."
decoded_actions = []
for token in tokens:
try:
decoded_tokens = self.bpe_tokenizer.decode(token)
decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.min_token
decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
assert (
decoded_dct_coeff.shape
== (
self.time_horizon,
self.action_dim,
)
), f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
except Exception as e:
print(f"Error decoding tokens: {e}")
print(f"Tokens: {token}")
decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
return np.stack(decoded_actions)
@classmethod
def fit(
cls,
action_data: list[np.array],
scale: float = 10,
vocab_size: int = 1024,
*,
time_horizon: int | None = None,
action_dim: int | None = None,
) -> "UniversalActionProcessor":
# Run DCT over all inputs
dct_tokens = [dct(a, axis=0, norm="ortho").flatten() for a in action_data]
# Quantize and find min token
max_token = int(np.around(np.concatenate(dct_tokens) * scale).max())
min_token = int(np.around(np.concatenate(dct_tokens) * scale).min())
min_vocab_size = max_token - min_token
assert (
min_vocab_size <= vocab_size
), f"Vocab size {vocab_size} is too small for the range of tokens {min_vocab_size}"
if min_vocab_size + 100 > vocab_size:
logging.warning(
f"Initial alphabet size {min_vocab_size} is almost as large as the vocab"
f"size {vocab_size}, consider increasing vocab size"
)
# Make token iterator for BPE training
def _token_iter():
for tokens in dct_tokens:
rounded_tokens = np.around(tokens * scale) - min_token
rounded_tokens = rounded_tokens.astype(int)
string = "".join(map(chr, rounded_tokens))
yield string
# Train BPE tokenizer
bpe = ByteLevelBPETokenizer()
# Set up the entire range of possible tokens as the initial alphabet
alphabet = [chr(i) for i in range(max_token - min_token + 1)]
trainer = BpeTrainer(
vocab_size=vocab_size,
min_frequency=2,
show_progress=True,
special_tokens=[],
initial_alphabet=alphabet,
max_token_length=10000,
)
# Train the inner tokenizer (don't use ByteLevelBPETokenizer.train_from_iterator()
# because it doesn't support custom alphabets)
bpe._tokenizer.train_from_iterator(_token_iter(), trainer=trainer)
return cls(
PreTrainedTokenizerFast(tokenizer_object=bpe, clean_up_tokenization_spaces=False),
scale=scale,
vocab_size=vocab_size,
min_token=min_token,
time_horizon=time_horizon,
action_dim=action_dim,
)
@@ -0,0 +1,11 @@
{
"action_dim": 15,
"auto_map": {
"AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
},
"min_token": -71,
"processor_class": "UniversalActionProcessor",
"scale": 10.0,
"time_horizon": 50,
"vocab_size": 1024
}
@@ -0,0 +1 @@
{}
File diff suppressed because it is too large.
@@ -0,0 +1,11 @@
{
"added_tokens_decoder": {},
"auto_map": {
"AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
},
"clean_up_tokenization_spaces": false,
"extra_special_tokens": {},
"model_max_length": 1000000000000000019884624838656,
"processor_class": "UniversalActionProcessor",
"tokenizer_class": "PreTrainedTokenizerFast"
}
@@ -0,0 +1,196 @@
# FAST Tokenizer Training for LeRobotDataset
This directory contains tools for training a FAST (Frequency-space Action Sequence Tokenization) tokenizer on LeRobot datasets.
## Files
- **`train_fast_tokenizer.py`**: Main training script (refactored for LeRobotDataset)
- **`train_fast_tokenizer_example.md`**: Usage examples and parameter documentation
- **`MIGRATION_NOTES.md`**: Migration guide from B1K to LeRobotDataset
## Quick Start
```bash
# Basic usage
python train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:14"
# With delta transform
python train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:14" \
--delta_dims "0,1,2,3,4,5,6,7,8,9,10,11,12,13" \
--state_key "observation.state" \
--vocab_size 1024
```
## What is FAST?
FAST is a tokenizer for robotic action sequences that:
1. Applies DCT (Discrete Cosine Transform) to action chunks
2. Quantizes DCT coefficients
3. Uses BPE (Byte-Pair Encoding) to compress the quantized sequence
4. Achieves high compression ratios (e.g., 10-20x) while maintaining accuracy
This enables efficient storage and processing of long action sequences in vision-language-action models.
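Concretely, steps 1–2 are a DCT along the time axis followed by integer rounding, and step 3 runs BPE over the resulting integer symbols. Below is a minimal round-trip sketch of the DCT/quantization part only (function names here are illustrative, not part of the codebase; the BPE step is omitted), using the same `scale` convention as the training script:
```python
import numpy as np
from scipy.fft import dct, idct

def quantize_chunk(action_chunk: np.ndarray, scale: float = 10.0) -> np.ndarray:
    """DCT over the time axis, then round scaled coefficients to integers."""
    coeff = dct(action_chunk, axis=0, norm="ortho")   # [horizon, action_dim]
    return np.around(coeff * scale).astype(int)       # integer symbols fed to BPE

def dequantize_chunk(quantized: np.ndarray, scale: float = 10.0) -> np.ndarray:
    """Invert the transform; the only loss comes from the rounding step."""
    return idct(quantized / scale, axis=0, norm="ortho")

chunk = np.random.randn(10, 14)                       # [horizon=10, action_dim=14]
recovered = dequantize_chunk(quantize_chunk(chunk))
print(np.max(np.abs(chunk - recovered)))              # small error; shrinks as scale grows
```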
## Requirements
- Python 3.10+
- LeRobot dataset (either local or from HuggingFace Hub)
- transformers (for AutoProcessor)
- numpy
- torch
- tyro
## Workflow
```
LeRobotDataset → Extract Episodes → Create Chunks (+ optional Delta Transform)
→ Select Encoded Dimensions → Normalize (q01, q99)
→ Train FAST Tokenizer → Compute Stats → Save Tokenizer + Metadata
```
## Parameters Guide
### Essential Parameters
- **`repo_id`**: HuggingFace dataset repository ID
- Example: `"lerobot/aloha_sim_insertion_human"`
- **`action_horizon`**: Length of action sequences to tokenize
- Typical: 10-16 steps
- **`encoded_dims`**: Which action dimensions to encode
- Format: `"start:end,start:end"`
- Example: `"0:7"` = dimensions 0-6
- Example: `"0:3,7:10"` = dimensions 0-2 and 7-9
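The range syntax is parsed as comma-separated, half-open `start:end` intervals. A minimal sketch of the parsing, equivalent to what `train_fast_tokenizer.py` does:
```python
def parse_encoded_dims(spec: str) -> list[tuple[int, int]]:
    """'0:3,7:10' -> [(0, 3), (7, 10)]; each range is half-open, so end is excluded."""
    ranges = []
    for part in spec.split(","):
        start, end = map(int, part.strip().split(":"))
        ranges.append((start, end))
    return ranges

print(parse_encoded_dims("0:3,7:10"))  # [(0, 3), (7, 10)] -> dimensions 0-2 and 7-9
```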
### Optional Parameters
- **`delta_dims`**: Apply delta transform (action - state) to these dimensions (see the sketch after this list)
- Format: `"0,1,2,3,4,5"`
- Use for position-based actions
- **`state_key`**: Dataset key containing state observations
- Default: `"observation.state"`
- **`vocab_size`**: BPE vocabulary size
- Default: 1024
- Larger = better compression but more memory
- **`scale`**: DCT quantization scale
- Default: 10.0
- Larger = finer quantization (less lossy, longer token sequences), smaller = coarser (better compression)
- **`sample_fraction`**: Fraction of action chunks to use per episode
- Default: 0.1 (10%)
- Increase for small datasets, decrease for large datasets
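The delta transform turns the listed dimensions into offsets from the chunk's first state and leaves the remaining dimensions absolute. A minimal sketch, mirroring `apply_delta_transform` in `train_fast_tokenizer.py` (the example values are made up):
```python
import numpy as np

def apply_delta(state: np.ndarray, action: np.ndarray, delta_dims: list[int]) -> np.ndarray:
    """Return action with the listed dimensions expressed relative to state."""
    out = action.copy()
    for dim in delta_dims:
        out[dim] = action[dim] - state[dim]
    return out

state = np.array([0.25, 0.5, 1.0])
action = np.array([0.75, 0.25, 1.0])
print(apply_delta(state, action, delta_dims=[0, 1]))  # [0.5, -0.25, 1.0]; dim 2 stays absolute
```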
## Output
The script creates a directory (default: `./fast_tokenizer_{repo_id}`) containing:
1. **Tokenizer files**: Can be loaded with `AutoProcessor.from_pretrained()`
2. **`metadata.json`**: Contains:
- Training configuration
- Compression statistics
- Dataset information
## Example Output
```
Loading dataset: lerobot/aloha_sim_insertion_human
Dataset loaded: 50 episodes, 5000 frames
Encoding 14 dimensions: 0:14
Delta dimensions: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
Action horizon: 10
Processing 50 episodes...
Collected 4500 action chunks
Extracted 14 encoded dimensions
Before normalization - overall stats:
Min: -2.3451, Max: 3.1234, Mean: 0.0234, Std: 0.8765
Applied quantile normalization [q01, q99] → [-1, 1]
After normalization - overall stats:
Min: -1.0000, Max: 1.0000, Mean: 0.0156, Std: 0.4321
Training FAST tokenizer on 4500 action chunks...
Action chunk shape: (4500, 10, 14)
Vocab size: 1024
DCT scale: 10.0
✓ Tokenizer training complete!
Compression Statistics:
Average compression ratio: 14.23x
Mean token length: 9.8
P99 token length: 15
Min token length: 6
Max token length: 18
✅ Saved FAST tokenizer to ./fast_tokenizer_lerobot_aloha_sim_insertion_human
```
## Using the Trained Tokenizer
```python
import numpy as np

from transformers import AutoProcessor

# Load tokenizer
tokenizer = AutoProcessor.from_pretrained(
    "./fast_tokenizer_lerobot_aloha_sim_insertion_human",
    trust_remote_code=True,
)

# Encode action chunk [horizon, action_dim]
action_chunk = np.random.randn(10, 14)  # Example
tokens = tokenizer(action_chunk[None])[0]  # Token IDs for the single chunk

# Decode expects a batch of token lists; wrap and unwrap accordingly
reconstructed = tokenizer.decode([tokens])[0]  # [horizon, action_dim]
```
## Tips
1. **Start Small**: Use `--max_episodes 10` for initial testing
2. **Check Dimensions**: Verify encoded dimensions match your robot's action space
3. **Delta Transform**: Use for position-based actions, not velocity-based
4. **Normalization**: Ensure dataset has proper statistics computed
5. **Compression Ratio**: Aim for 10-20x for good balance of compression and accuracy
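As a worked example, the `metadata.json` included in this commit uses an action horizon of 50 with 15 encoded dimensions, i.e. 750 values per chunk, at a mean of ~47.3 BPE tokens, which is a compression ratio of ~15.9x.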
## Troubleshooting
**Issue**: "No normalization stats found"
- **Solution**: Compute dataset statistics first, or use raw actions
**Issue**: "Episode too short for action horizon"
- **Solution**: Reduce `--action_horizon` or filter short episodes
**Issue**: "State key not found"
- **Solution**: Check dataset features and use correct `--state_key`
**Issue**: Memory error with large datasets
- **Solution**: Reduce `--sample_fraction` or `--max_episodes`
## Citation
If you use FAST in your research, please cite:
```bibtex
@article{pertsch2025fast,
  title={FAST: Efficient Action Tokenization for Vision-Language-Action Models},
  author={Pertsch, Karl and Stachowicz, Kyle and Ichter, Brian and others},
  journal={arXiv preprint},
  year={2025}
}
```
+21
@@ -0,0 +1,21 @@
lerobot-train \
--dataset.repo_id=lerobot \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0test1 \
--job_name=pi0_training \
--policy.repo_id=jade_choghari/pi0-base \
--policy.path=/fsx/jade_choghari/outputs/pi0_fast_fruit1/checkpoints/last/pretrained_model \
--policy.dtype=bfloat16 \
--steps=3000 \
--save_freq=1000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--batch_size=4 \
--policy.device=cuda \
# --wandb.enable=true \
# --wandb.disable_artifact=true \
# --wandb.project=pi05hi-training \
+1 -1
@@ -893,7 +893,7 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
fast_emb = self.fast_action_embedding(fast_action_tokens)
fast_emb_dim = fast_emb.shape[-1]
return fast_emb * math.sqrt(fast_emb_dim)
fast_action_emb = self._apply_checkpoint(fast_action_embed_func, fast_action_tokens)
embs.append(fast_action_emb)
+1 -1
@@ -178,7 +178,7 @@ def make_pi05_pre_post_processors(
padding="max_length",
),
ActionTokenizerProcessorStep(
tokenizer_name="physical-intelligence/fast",
tokenizer_name="/fsx/jade_choghari/outputs/fast_tokenizer", # TODO: jade put the PI
),
DeviceProcessorStep(device=config.device),
]
+22
@@ -0,0 +1,22 @@
export CUDA_LAUNCH_BLOCKING=1
lerobot-train \
--dataset.repo_id=local \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0_fast_fruit1 \
--job_name=pi0_training \
--policy.repo_id=jade_choghari/pi0-base1 \
--policy.path=lerobot/pi05_base \
--policy.dtype=bfloat16 \
--steps=200000 \
--save_freq=5000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--batch_size=4 \
--policy.device=cuda \
--wandb.enable=true \
--wandb.disable_artifact=true \
--wandb.project=pi05hi-training \
# /fsx/jade_choghari/.cache/huggingface/lerobot/jadechoghari/collect-data
+18
@@ -0,0 +1,18 @@
rm -rf /fsx/jade_choghari/outputs/pi0_multi_training
lerobot-train \
--dataset.repo_id=local \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0_multi_training \
--job_name=pi0_multi_training \
--policy.repo_id=jadechoghari/pi0-base1 \
--policy.path=lerobot/pi05_base \
--policy.dtype=bfloat16 \
--steps=50000 \
--save_freq=5000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--batch_size=32 \
--policy.device=cuda \
+9
@@ -0,0 +1,9 @@
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "local" \
--root "/fsx/jade_choghari/outputs/collect-data-pgen" \
--encoded_dims "0:15" \
--action_horizon 50 \
--vocab_size 1024 \
--scale 10.0 \
--output_dir "/fsx/jade_choghari/outputs/fast_tokenizer"
@@ -0,0 +1,410 @@
"""Train FAST tokenizer for action encoding.
This script:
1. Loads action chunks from LeRobotDataset (with sampling)
2. Applies delta transforms and per-timestamp normalization
3. Trains FAST tokenizer on specified action dimensions
4. Saves the tokenizer and metadata to the output directory
5. Reports compression statistics
"""
import json
import numpy as np
import tyro
from pathlib import Path
from transformers import AutoProcessor
import torch
from lerobot.datasets.lerobot_dataset import LeRobotDataset
def apply_delta_transform(state: np.ndarray, actions: np.ndarray, delta_dims: list[int] | None) -> np.ndarray:
"""Apply delta transform to specified dimensions.
Args:
state: Current state [D]
actions: Future actions [D]
delta_dims: List of dimension indices to apply delta transform to
Returns:
Transformed actions [D]
"""
if delta_dims is None or len(delta_dims) == 0:
return actions
delta_actions = actions.copy()
for dim in delta_dims:
delta_actions[dim] = actions[dim] - state[dim]
return delta_actions
def process_episode(args):
"""Process single episode and return action chunks."""
dataset, ep_idx, action_horizon, delta_dims, sample_fraction, state_key, use_delta_transform = args
try:
# Get episode info
ep_info = dataset.meta.episodes[ep_idx]
from_idx = ep_info["dataset_from_index"]
to_idx = ep_info["dataset_to_index"]
ep_length = to_idx - from_idx
if ep_length < action_horizon:
return None
# Load all frames in episode
# If dataset has episode filtering, we need to use the mapping
states = []
actions = []
for abs_idx in range(from_idx, to_idx):
# Map absolute index to relative index if needed
if dataset._absolute_to_relative_idx is not None:
if abs_idx not in dataset._absolute_to_relative_idx:
# This episode's frames aren't in the filtered dataset
return None
rel_idx = dataset._absolute_to_relative_idx[abs_idx]
else:
rel_idx = abs_idx
frame = dataset.hf_dataset[rel_idx]
# Get state (could be from observation.state or other state key)
if state_key in frame:
state = frame[state_key].numpy() if torch.is_tensor(frame[state_key]) else np.array(frame[state_key])
else:
# If no state key, use zeros (no delta transform)
state = np.zeros_like(frame["action"].numpy() if torch.is_tensor(frame["action"]) else np.array(frame["action"]))
action = frame["action"].numpy() if torch.is_tensor(frame["action"]) else np.array(frame["action"])
states.append(state)
actions.append(action)
states = np.array(states)
actions = np.array(actions)
# Create action chunks (sliding window)
# All actions in a chunk are relative to the FIRST state in that chunk
action_chunks = []
for i in range(len(states) - action_horizon + 1):
current_state = states[i] # First state in chunk
future_absolute_actions = actions[i:i + action_horizon]
if use_delta_transform:
# Relative actions
delta_chunk = np.zeros_like(future_absolute_actions)
for t in range(action_horizon):
delta_chunk[t] = apply_delta_transform(
current_state,
future_absolute_actions[t],
delta_dims,
)
action_chunks.append(delta_chunk)
else:
# Absolute actions (NO delta)
action_chunks.append(future_absolute_actions)
if len(action_chunks) == 0:
return None
action_chunks = np.array(action_chunks)
# Sample chunks
if sample_fraction < 1.0:
n_chunks = len(action_chunks)
n_samples = max(1, int(n_chunks * sample_fraction))
episode_seed = hash(ep_idx) % (2**31)
rng = np.random.RandomState(episode_seed)
indices = rng.choice(n_chunks, size=n_samples, replace=False)
action_chunks = action_chunks[indices]
return action_chunks
except Exception as e:
print(f"Error processing episode {ep_idx}: {e}")
import traceback
traceback.print_exc()
return None
def train_fast_tokenizer(
action_chunks: np.ndarray,
vocab_size: int = 1024,
scale: float = 10.0,
) -> AutoProcessor:
"""
Train FAST tokenizer (BPE on DCT coefficients) on action chunks.
Uses the .fit() method to train a new tokenizer on the provided data.
Args:
action_chunks: Array of action chunks [N, H, D] where N=num_chunks, H=horizon, D=action_dim
vocab_size: BPE vocabulary size
scale: DCT scaling factor for quantization
Returns:
Trained FAST tokenizer
"""
print(f"Training FAST tokenizer on {len(action_chunks)} action chunks...")
print(f"Action chunk shape: {action_chunks.shape}")
print(f"Vocab size: {vocab_size}")
print(f"DCT scale: {scale}")
# Download the tokenizer source code (not pretrained weights)
# We'll train a new tokenizer on our own data
base_tokenizer = AutoProcessor.from_pretrained(
"physical-intelligence/fast",
trust_remote_code=True
)
# Convert action_chunks array to list of arrays (expected by .fit())
action_data_list = [action_chunks[i] for i in range(len(action_chunks))]
# Train the new tokenizer on our action data using .fit()
# This trains the BPE tokenizer on DCT coefficients
print("Training new tokenizer (this may take a few minutes)...")
tokenizer = base_tokenizer.fit(
action_data_list,
scale=scale,
vocab_size=vocab_size,
time_horizon=action_chunks.shape[1], # action_horizon
action_dim=action_chunks.shape[2], # encoded dimensions
)
print("✓ Tokenizer training complete!")
# Validate it works
sample_chunk = action_chunks[0]
encoded = tokenizer(sample_chunk[None])[0]
if isinstance(encoded, list):
encoded = np.array(encoded)
print(f"Sample encoding: {len(encoded)} tokens for chunk shape {sample_chunk.shape}")
return tokenizer
def compute_compression_stats(tokenizer, action_chunks: np.ndarray):
"""Compute compression statistics."""
print("\nComputing compression statistics...")
# Sample for stats (use max 1000 chunks for speed)
sample_size = min(1000, len(action_chunks))
sample_indices = np.random.RandomState(42).choice(len(action_chunks), size=sample_size, replace=False)
sample_chunks = action_chunks[sample_indices]
token_lengths = []
for chunk in sample_chunks:
encoded = tokenizer(chunk[None])[0]
if isinstance(encoded, list):
token_lengths.append(len(encoded))
else:
token_lengths.append(encoded.shape[0] if hasattr(encoded, 'shape') else len(encoded))
token_lengths = np.array(token_lengths)
# Compression ratio: (H * D) / avg_tokens
input_size = action_chunks.shape[1] * action_chunks.shape[2]
avg_tokens = np.mean(token_lengths)
compression_ratio = input_size / avg_tokens
stats = {
'compression_ratio': float(compression_ratio),
'mean_token_length': float(np.mean(token_lengths)),
'p99_token_length': float(np.percentile(token_lengths, 99)),
'min_token_length': float(np.min(token_lengths)),
'max_token_length': float(np.max(token_lengths)),
}
print(f"Compression Statistics:")
print(f" Average compression ratio: {stats['compression_ratio']:.2f}x")
print(f" Mean token length: {stats['mean_token_length']:.1f}")
print(f" P99 token length: {stats['p99_token_length']:.0f}")
print(f" Min token length: {stats['min_token_length']:.0f}")
print(f" Max token length: {stats['max_token_length']:.0f}")
return stats
def main(
repo_id: str,
root: str | None = None,
action_horizon: int = 10,
max_episodes: int | None = None,
sample_fraction: float = 0.1,
encoded_dims: str = "0:6,7:23",
delta_dims: str | None = None,
use_delta_transform: bool = False,
state_key: str = "observation.state",
vocab_size: int = 1024,
scale: float = 10.0,
output_dir: str | None = None,
):
"""
Train FAST tokenizer for action encoding.
Args:
repo_id: LeRobot dataset repository ID
root: Root directory for dataset (default: ~/.cache/huggingface/lerobot)
action_horizon: Number of future actions in each chunk
max_episodes: Max episodes to use (None = all episodes in dataset)
sample_fraction: Fraction of chunks to sample per episode
encoded_dims: Comma-separated dimension ranges to encode (e.g., "0:6,7:23")
delta_dims: Comma-separated dimension indices for delta transform (e.g., "0,1,2,3,4,5")
use_delta_transform: Whether to apply delta transform (relative actions vs absolute actions)
state_key: Dataset key for state observations (default: "observation.state")
vocab_size: FAST vocabulary size (BPE vocab size)
scale: DCT scaling factor (default: 10.0)
output_dir: Directory to save tokenizer (default: ./fast_tokenizer_{repo_id})
"""
# Load dataset
print(f"Loading dataset: {repo_id}")
dataset = LeRobotDataset(repo_id=repo_id, root=root)
print(f"Dataset loaded: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
# Parse encoded dimensions
encoded_dim_ranges = []
for range_str in encoded_dims.split(','):
start, end = map(int, range_str.strip().split(':'))
encoded_dim_ranges.append((start, end))
total_encoded_dims = sum(end - start for start, end in encoded_dim_ranges)
print(f"Encoding {total_encoded_dims} dimensions: {encoded_dims}")
# Parse delta dimensions
delta_dim_list = None
if delta_dims is not None and delta_dims.strip():
delta_dim_list = [int(d.strip()) for d in delta_dims.split(',')]
print(f"Delta dimensions: {delta_dim_list}")
else:
print("No delta dimensions specified")
print(f"Use delta transform: {use_delta_transform}")
if use_delta_transform and (delta_dim_list is None or len(delta_dim_list) == 0):
print("Warning: use_delta_transform=True but no delta_dims specified. No delta will be applied.")
print(f"Action horizon: {action_horizon}")
print(f"State key: {state_key}")
# Determine episodes to process
num_episodes = dataset.num_episodes
if max_episodes is not None:
num_episodes = min(max_episodes, num_episodes)
print(f"Processing {num_episodes} episodes...")
# Process episodes sequentially (to avoid pickling issues with dataset)
all_chunks = []
for ep_idx in range(num_episodes):
if ep_idx % 10 == 0:
print(f" Processing episode {ep_idx}/{num_episodes}...")
chunks = process_episode(
(dataset, ep_idx, action_horizon, delta_dim_list, sample_fraction, state_key, use_delta_transform)
)
if chunks is not None:
all_chunks.append(chunks)
# Concatenate all chunks
all_chunks = np.concatenate(all_chunks, axis=0)
print(f"Collected {len(all_chunks)} action chunks")
# Extract only encoded dimensions FIRST (before normalization)
encoded_chunks = []
for start, end in encoded_dim_ranges:
encoded_chunks.append(all_chunks[:, :, start:end])
encoded_chunks = np.concatenate(encoded_chunks, axis=-1) # [N, H, D_encoded]
print(f"Extracted {encoded_chunks.shape[-1]} encoded dimensions")
# Apply normalization to encoded dimensions only
# NOTE: For FAST, we ALWAYS use QUANTILE normalization (no per-timestamp)
# This clips outliers and provides consistent [-1, 1] range for DCT compression
print(f"\nBefore normalization - overall stats:")
print(f" Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
print(f" Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")
norm_stats = dataset.meta.stats
if norm_stats is not None and "action" in norm_stats:
action_stats = norm_stats["action"]
# Build encoded dimension indices
encoded_dim_indices = []
for start, end in encoded_dim_ranges:
encoded_dim_indices.extend(range(start, end))
encoded_dim_indices = np.array(encoded_dim_indices)
# Use QUANTILE normalization: clip to [q01, q99] and map to [-1, 1]
if "q01" in action_stats and "q99" in action_stats:
q01 = np.array(action_stats["q01"])[encoded_dim_indices] # [D_encoded]
q99 = np.array(action_stats["q99"])[encoded_dim_indices] # [D_encoded]
print(f"\nNormalization stats (q01, q99) for encoded dimensions:")
for i, dim_idx in enumerate(encoded_dim_indices):
print(f" Orig dim {dim_idx}: q01={q01[i]:7.4f}, q99={q99[i]:7.4f}, range={q99[i]-q01[i]:7.4f}")
# Clip to quantile range and normalize to [-1, 1]
encoded_chunks = np.clip(encoded_chunks, q01, q99)
encoded_chunks = 2.0 * (encoded_chunks - q01) / np.maximum(q99 - q01, 1e-6) - 1.0
print(f"\nApplied quantile normalization [q01, q99] → [-1, 1]")
print(f"\nAfter normalization - overall stats:")
print(f" Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
print(f" Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")
print(f"\nPer-dimension stats (after normalization):")
for d in range(encoded_chunks.shape[-1]):
dim_data = encoded_chunks[:, :, d]
print(f" Dim {d}: min={np.min(dim_data):7.4f}, max={np.max(dim_data):7.4f}, "
f"mean={np.mean(dim_data):7.4f}, std={np.std(dim_data):7.4f}")
else:
print("Warning: q01/q99 stats not found, using raw actions")
else:
print("Warning: No normalization stats found, using raw actions")
print(f"Encoded chunks shape: {encoded_chunks.shape}")
# Train FAST tokenizer
tokenizer = train_fast_tokenizer(
encoded_chunks,
vocab_size=vocab_size,
scale=scale,
)
# Compute compression statistics
compression_stats = compute_compression_stats(tokenizer, encoded_chunks)
# Save tokenizer
if output_dir is None:
output_dir = f"fast_tokenizer_{repo_id.replace('/', '_')}"
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(output_path)
# Save metadata
metadata = {
'repo_id': repo_id,
'vocab_size': vocab_size,
'scale': scale,
'encoded_dims': encoded_dims,
'encoded_dim_ranges': encoded_dim_ranges,
'total_encoded_dims': total_encoded_dims,
'delta_dims': delta_dims,
'delta_dim_list': delta_dim_list,
'use_delta_transform': use_delta_transform,
'state_key': state_key,
'action_horizon': action_horizon,
'num_training_chunks': len(encoded_chunks),
'compression_stats': compression_stats,
}
with open(output_path / "metadata.json", 'w') as f:
json.dump(metadata, f, indent=2)
print(f"\n✅ Saved FAST tokenizer to {output_path}")
print(f"Metadata: {json.dumps(metadata, indent=2)}")
if __name__ == "__main__":
tyro.cli(main)
@@ -0,0 +1,101 @@
# Train FAST Tokenizer - Usage Examples
This script trains a FAST (Frequency-space Action Sequence Tokenization) tokenizer on LeRobotDataset action data.
## Basic Usage
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:7" \
--vocab_size 1024 \
--scale 10.0
```
## Parameters
### Required
- `--repo_id`: LeRobot dataset repository ID (e.g., "lerobot/aloha_sim_insertion_human")
### Optional
- `--root`: Root directory for dataset (default: ~/.cache/huggingface/lerobot)
- `--action_horizon`: Number of future actions in each chunk (default: 10)
- `--max_episodes`: Maximum number of episodes to use (default: None = all)
- `--sample_fraction`: Fraction of chunks to sample per episode (default: 0.1)
- `--encoded_dims`: Comma-separated dimension ranges to encode (default: "0:6,7:23")
- Example: "0:7" encodes dimensions 0-6
- Example: "0:3,6:9" encodes dimensions 0-2 and 6-8
- `--delta_dims`: Comma-separated dimension indices for delta transform (default: None)
- Example: "0,1,2,3,4,5" applies delta transform to first 6 dimensions
- Delta transform: action[i] - state[i] for specified dimensions
- `--state_key`: Dataset key for state observations (default: "observation.state")
- `--vocab_size`: FAST vocabulary size / BPE vocab size (default: 1024)
- `--scale`: DCT scaling factor (default: 10.0)
- `--output_dir`: Directory to save tokenizer (default: ./fast_tokenizer_{repo_id})
## Examples
### Example 1: Train on full action space
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/pusht" \
--action_horizon 16 \
--encoded_dims "0:2" \
--vocab_size 512 \
--max_episodes 100
```
### Example 2: Train with delta transform
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:14" \
--delta_dims "0,1,2,3,4,5,6,7,8,9,10,11,12,13" \
--state_key "observation.state" \
--vocab_size 1024 \
--scale 10.0 \
--sample_fraction 0.2
```
### Example 3: Train on subset of dimensions
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:7" \
--vocab_size 1024 \
--output_dir "./my_tokenizer"
```
## Output
The script saves:
1. **Tokenizer files**: Trained FAST tokenizer (can be loaded with `AutoProcessor.from_pretrained()`)
2. **metadata.json**: Contains:
- Configuration parameters
- Compression statistics (compression ratio, token lengths)
- Training dataset information
## Understanding the Process
1. **Load Dataset**: Loads the LeRobotDataset from HuggingFace
2. **Extract Action Chunks**: Creates sliding windows of actions with the specified horizon (see the sketch after this list)
3. **Apply Delta Transform**: (Optional) Computes action deltas relative to current state
4. **Select Encoded Dimensions**: Extracts only the dimensions to be encoded
5. **Normalize**: Applies quantile normalization ([q01, q99] → [-1, 1])
6. **Train Tokenizer**: Trains BPE tokenizer on DCT coefficients
7. **Compute Stats**: Reports compression ratio and token length statistics
8. **Save**: Saves tokenizer and metadata
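Step 2 is a plain sliding window over each episode's actions. A minimal illustrative helper (the script builds the same chunks with an explicit loop inside `process_episode`):
```python
import numpy as np

def sliding_chunks(actions: np.ndarray, horizon: int) -> np.ndarray:
    """[T, D] episode actions -> [T - horizon + 1, horizon, D] overlapping chunks."""
    return np.stack([actions[i:i + horizon] for i in range(len(actions) - horizon + 1)])

episode_actions = np.random.randn(100, 14)   # 100 timesteps, 14-dim actions
chunks = sliding_chunks(episode_actions, horizon=10)
print(chunks.shape)                          # (91, 10, 14)
```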
## Notes
- **Normalization**: The script uses quantile normalization (q01, q99) from the dataset's statistics (see the sketch after this list)
- **Sampling**: To speed up training, you can sample a fraction of chunks per episode
- **Delta Transform**: Applied per-dimension to make actions relative to current state
- **Compression**: FAST uses DCT + BPE to compress action sequences efficiently
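The quantile normalization mentioned in the notes reduces to the formula below (the same expression used in `train_fast_tokenizer.py`; `q01` and `q99` are per-dimension arrays taken from the dataset statistics and broadcast over `[N, H, D]` chunks):
```python
import numpy as np

def quantile_normalize(chunks: np.ndarray, q01: np.ndarray, q99: np.ndarray) -> np.ndarray:
    """Clip each dimension to its [q01, q99] range, then map linearly to [-1, 1]."""
    clipped = np.clip(chunks, q01, q99)
    return 2.0 * (clipped - q01) / np.maximum(q99 - q01, 1e-6) - 1.0

chunks = np.random.randn(4, 10, 3)                         # [N, H, D]
q01, q99 = np.full(3, -2.0), np.full(3, 2.0)
print(quantile_normalize(chunks, q01, q99).min() >= -1.0)  # True
```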
+23
@@ -0,0 +1,23 @@
rm -rf /fsx/jade_choghari/outputs/pi0_multi_training
accelerate launch --multi_gpu --num_processes=2 \
$(which lerobot-train) \
--dataset.repo_id=local \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0_multi_training \
--job_name=pi0_multi_training \
--policy.repo_id=jadechoghari/pi0-base1 \
--policy.path=lerobot/pi05_base \
--policy.dtype=bfloat16 \
--steps=50000 \
--save_freq=5000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--policy.gradient_checkpointing=true \
--batch_size=1 \
--policy.device=cpu
# --wandb.enable=true \
# --wandb.disable_artifact=true \
# --wandb.project=pi05hi-training \
-2
@@ -90,8 +90,6 @@ def update_policy(
# Let accelerator handle mixed precision
with accelerator.autocast():
loss, output_dict = policy.forward(batch)
action = policy.select_action(batch)
breakpoint()
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
# Use accelerator's backward method