add more changes

This commit is contained in:
Jade Choghari
2025-12-17 18:03:09 +00:00
parent b229e7df28
commit 18ddc67714
20 changed files with 5417 additions and 13 deletions
+11 -1
@@ -1402,6 +1402,13 @@ def main():
action="store_true",
help="Push modified dataset to HuggingFace Hub",
)
# add image key
parser.add_argument(
"--image-key",
type=str,
default=None,
help="Image observation key to use for image mode (default: None)",
)
args = parser.parse_args()
console = Console()
@@ -1443,7 +1450,10 @@ def main():
)
# Get image keys (for image mode)
image_keys = dataset.meta.camera_keys[:args.num_image_views_per_sample]
if args.image_key:
image_keys = [args.image_key]
else:
image_keys = dataset.meta.camera_keys[:args.num_image_views_per_sample]
if not args.video_mode:
console.print(f"[cyan]Using image keys: {image_keys}[/cyan]")
+7 -6
@@ -1,10 +1,11 @@
python examples/dataset/annotate.py \
--repo-id jadechoghari/collect-data \
--video-key observation.images.base \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--episodes 16 22
# python examples/dataset/annotate.py \
# --repo-id lerobot/svla_so101_pickplace \
# --video-key observation.images.side \
# --model Qwen/Qwen3-VL-30B-A3B-Instruct \
python examples/dataset/annotate.py \
--repo-id lerobot/svla_so101_pickplace \
--video-key observation.images.side \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--episodes 5
# --episodes 5
+3 -2
@@ -4,12 +4,12 @@
# This generates user prompts and robot utterances for hierarchical policy training
# Configuration
REPO_ID="lerobot/svla_so101_pickplace"
REPO_ID="jadechoghari/collect-data"
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
# Alternative: MODEL="Qwen/Qwen2-VL-7B-Instruct"
OUTPUT_DIR="/fsx/jade_choghari/outputs/pgen_annotations1"
OUTPUT_DIR="/fsx/jade_choghari/outputs/collect-data-pgen"
BATCH_SIZE=32
TEMPERATURE=0.9
SAMPLE_INTERVAL=5.0 # Generate dialogue every 5 seconds (all episodes processed)
@@ -22,6 +22,7 @@ python examples/dataset/annotate_pgen.py \
--temperature "$TEMPERATURE" \
--batch-size "$BATCH_SIZE" \
--sample-interval "$SAMPLE_INTERVAL" \
--image-key observation.images.base \
--num-image-views-per-sample 1
# For faster testing, increase sample interval:
+26
@@ -0,0 +1,26 @@
{
"repo_id": "local",
"vocab_size": 1024,
"scale": 10.0,
"encoded_dims": "0:15",
"encoded_dim_ranges": [
[
0,
15
]
],
"total_encoded_dims": 15,
"delta_dims": null,
"delta_dim_list": null,
"use_delta_transform": false,
"state_key": "observation.state",
"action_horizon": 50,
"num_training_chunks": 4900,
"compression_stats": {
"compression_ratio": 15.85791309863622,
"mean_token_length": 47.295,
"p99_token_length": 90.0,
"min_token_length": 9.0,
"max_token_length": 109.0
}
}
@@ -0,0 +1,158 @@
import logging
from typing import ClassVar
import numpy as np
from scipy.fft import dct
from scipy.fft import idct
from tokenizers import ByteLevelBPETokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast
from transformers.processing_utils import ProcessorMixin
class UniversalActionProcessor(ProcessorMixin):
attributes: ClassVar[list[str]] = ["bpe_tokenizer"]
bpe_tokenizer_class: str = "AutoTokenizer"
def __init__(
self,
bpe_tokenizer: PreTrainedTokenizerFast,
scale: float = 10,
vocab_size: int = 1024,
min_token: int = 0,
*,
action_dim: int | None = None,
time_horizon: int | None = None,
):
self.scale = scale
self.vocab_size = vocab_size
self.min_token = min_token
# Action horizon and dimension needed during decoding. These can be specified
# in three ways (in order of priority):
# 1. passed in as kwargs to decode()
# 2. in the constructor
# 3. cached from the last time decode() was called
self.time_horizon = time_horizon
self.action_dim = action_dim
self.called_time_horizon = time_horizon
self.called_action_dim = action_dim
super().__init__(bpe_tokenizer)
def __call__(self, action_chunk: np.array) -> np.array:
assert action_chunk.ndim <= 3, "Only 3 dimensions supported: [batch, timesteps, action_dim]"
if action_chunk.ndim == 2:
action_chunk = action_chunk[None, ...]
# Cache the time horizon and action dimension for decoding
self.called_time_horizon = action_chunk.shape[-2]
self.called_action_dim = action_chunk.shape[-1]
dct_coeff = dct(action_chunk, axis=1, norm="ortho")
dct_coeff = np.around(dct_coeff * self.scale)
tokens = []
for elem in dct_coeff:
token_str = "".join(map(chr, np.maximum(elem.flatten() - self.min_token, 0).astype(int)))
tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
return tokens
def decode(
self,
tokens: list[list[int]],
*,
time_horizon: int | None = None,
action_dim: int | None = None,
) -> np.array:
self.time_horizon = time_horizon or self.time_horizon or self.called_time_horizon
self.action_dim = action_dim or self.action_dim or self.called_action_dim
# Cache the time horizon and action dimension for the next call
self.called_time_horizon = self.time_horizon
self.called_action_dim = self.action_dim
assert (
self.time_horizon is not None and self.action_dim is not None
), "Tokenizer not initialized, call encode() once or pass in time_horizon and action_dim."
decoded_actions = []
for token in tokens:
try:
decoded_tokens = self.bpe_tokenizer.decode(token)
decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.min_token
decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
assert (
decoded_dct_coeff.shape
== (
self.time_horizon,
self.action_dim,
)
), f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
except Exception as e:
print(f"Error decoding tokens: {e}")
print(f"Tokens: {token}")
decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
return np.stack(decoded_actions)
@classmethod
def fit(
cls,
action_data: list[np.array],
scale: float = 10,
vocab_size: int = 1024,
*,
time_horizon: int | None = None,
action_dim: int | None = None,
) -> "UniversalActionProcessor":
# Run DCT over all inputs
dct_tokens = [dct(a, axis=0, norm="ortho").flatten() for a in action_data]
# Quantize and find min token
max_token = int(np.around(np.concatenate(dct_tokens) * scale).max())
min_token = int(np.around(np.concatenate(dct_tokens) * scale).min())
min_vocab_size = max_token - min_token
assert (
min_vocab_size <= vocab_size
), f"Vocab size {vocab_size} is too small for the range of tokens {min_vocab_size}"
if min_vocab_size + 100 > vocab_size:
logging.warning(
f"Initial alphabet size {min_vocab_size} is almost as large as the vocab"
f"size {vocab_size}, consider increasing vocab size"
)
# Make token iterator for BPE training
def _token_iter():
for tokens in dct_tokens:
rounded_tokens = np.around(tokens * scale) - min_token
rounded_tokens = rounded_tokens.astype(int)
string = "".join(map(chr, rounded_tokens))
yield string
# Train BPE tokenizer
bpe = ByteLevelBPETokenizer()
# Set up the entire range of possible tokens as the initial alphabet
alphabet = [chr(i) for i in range(max_token - min_token + 1)]
trainer = BpeTrainer(
vocab_size=vocab_size,
min_frequency=2,
show_progress=True,
special_tokens=[],
initial_alphabet=alphabet,
max_token_length=10000,
)
# Train the inner tokenizer (don't use ByteLevelBPETokenizer.train_from_iterator()
# because it doesn't support custom alphabets)
bpe._tokenizer.train_from_iterator(_token_iter(), trainer=trainer)
return cls(
PreTrainedTokenizerFast(tokenizer_object=bpe, clean_up_tokenization_spaces=False),
scale=scale,
vocab_size=vocab_size,
min_token=min_token,
time_horizon=time_horizon,
action_dim=action_dim,
)
@@ -0,0 +1,11 @@
{
"action_dim": 15,
"auto_map": {
"AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
},
"min_token": -71,
"processor_class": "UniversalActionProcessor",
"scale": 10.0,
"time_horizon": 50,
"vocab_size": 1024
}
@@ -0,0 +1 @@
{}
File diff suppressed because it is too large.
@@ -0,0 +1,11 @@
{
"added_tokens_decoder": {},
"auto_map": {
"AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
},
"clean_up_tokenization_spaces": false,
"extra_special_tokens": {},
"model_max_length": 1000000000000000019884624838656,
"processor_class": "UniversalActionProcessor",
"tokenizer_class": "PreTrainedTokenizerFast"
}
@@ -0,0 +1,196 @@
# FAST Tokenizer Training for LeRobotDataset
This directory contains tools for training a FAST (Frequency-space Action Sequence Tokenization) tokenizer on LeRobot datasets.
## Files
- **`train_fast_tokenizer.py`**: Main training script (refactored for LeRobotDataset)
- **`train_fast_tokenizer_example.md`**: Usage examples and parameter documentation
- **`MIGRATION_NOTES.md`**: Migration guide from B1K to LeRobotDataset
## Quick Start
```bash
# Basic usage
python train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:14"
# With delta transform
python train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:14" \
--delta_dims "0,1,2,3,4,5,6,7,8,9,10,11,12,13" \
--state_key "observation.state" \
--vocab_size 1024
```
## What is FAST?
FAST is a tokenizer for robotic action sequences that:
1. Applies DCT (Discrete Cosine Transform) to action chunks
2. Quantizes DCT coefficients
3. Uses BPE (Byte-Pair Encoding) to compress the quantized sequence
4. Achieves high compression ratios (e.g., 10-20x) while maintaining accuracy
This enables efficient storage and processing of long action sequences in vision-language-action models.
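Concretely, steps 1–2 are a DCT along the time axis followed by integer rounding, and step 3 runs BPE over the resulting integer symbols. Below is a minimal round-trip sketch of the DCT/quantization part only (function names here are illustrative, not part of the codebase; the BPE step is omitted), using the same `scale` convention as the training script:
```python
import numpy as np
from scipy.fft import dct, idct

def quantize_chunk(action_chunk: np.ndarray, scale: float = 10.0) -> np.ndarray:
    """DCT over the time axis, then round scaled coefficients to integers."""
    coeff = dct(action_chunk, axis=0, norm="ortho")   # [horizon, action_dim]
    return np.around(coeff * scale).astype(int)       # integer symbols fed to BPE

def dequantize_chunk(quantized: np.ndarray, scale: float = 10.0) -> np.ndarray:
    """Invert the transform; the only loss comes from the rounding step."""
    return idct(quantized / scale, axis=0, norm="ortho")

chunk = np.random.randn(10, 14)                       # [horizon=10, action_dim=14]
recovered = dequantize_chunk(quantize_chunk(chunk))
print(np.max(np.abs(chunk - recovered)))              # small error; shrinks as scale grows
```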
## Requirements
- Python 3.10+
- LeRobot dataset (either local or from HuggingFace Hub)
- transformers (for AutoProcessor)
- numpy
- torch
- tyro
## Workflow
```
LeRobotDataset → Extract Episodes → Create Chunks (+ optional Delta Transform)
→ Select Encoded Dimensions → Normalize (q01, q99)
→ Train FAST Tokenizer → Compute Stats → Save Tokenizer + Metadata
```
## Parameters Guide
### Essential Parameters
- **`repo_id`**: HuggingFace dataset repository ID
- Example: `"lerobot/aloha_sim_insertion_human"`
- **`action_horizon`**: Length of action sequences to tokenize
- Typical: 10-16 steps
- **`encoded_dims`**: Which action dimensions to encode
- Format: `"start:end,start:end"`
- Example: `"0:7"` = dimensions 0-6
- Example: `"0:3,7:10"` = dimensions 0-2 and 7-9
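The range syntax is parsed as comma-separated, half-open `start:end` intervals. A minimal sketch of the parsing, equivalent to what `train_fast_tokenizer.py` does:
```python
def parse_encoded_dims(spec: str) -> list[tuple[int, int]]:
    """'0:3,7:10' -> [(0, 3), (7, 10)]; each range is half-open, so end is excluded."""
    ranges = []
    for part in spec.split(","):
        start, end = map(int, part.strip().split(":"))
        ranges.append((start, end))
    return ranges

print(parse_encoded_dims("0:3,7:10"))  # [(0, 3), (7, 10)] -> dimensions 0-2 and 7-9
```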
### Optional Parameters
- **`delta_dims`**: Apply delta transform (action - state) to these dimensions (see the sketch after this list)
- Format: `"0,1,2,3,4,5"`
- Use for position-based actions
- **`state_key`**: Dataset key containing state observations
- Default: `"observation.state"`
- **`vocab_size`**: BPE vocabulary size
- Default: 1024
- Larger = better compression but more memory
- **`scale`**: DCT quantization scale
- Default: 10.0
- Larger = finer quantization (less lossy, longer token sequences), smaller = coarser (better compression)
- **`sample_fraction`**: Fraction of action chunks to use per episode
- Default: 0.1 (10%)
- Increase for small datasets, decrease for large datasets
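The delta transform turns the listed dimensions into offsets from the chunk's first state and leaves the remaining dimensions absolute. A minimal sketch, mirroring `apply_delta_transform` in `train_fast_tokenizer.py` (the example values are made up):
```python
import numpy as np

def apply_delta(state: np.ndarray, action: np.ndarray, delta_dims: list[int]) -> np.ndarray:
    """Return action with the listed dimensions expressed relative to state."""
    out = action.copy()
    for dim in delta_dims:
        out[dim] = action[dim] - state[dim]
    return out

state = np.array([0.25, 0.5, 1.0])
action = np.array([0.75, 0.25, 1.0])
print(apply_delta(state, action, delta_dims=[0, 1]))  # [0.5, -0.25, 1.0]; dim 2 stays absolute
```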
## Output
The script creates a directory (default: `./fast_tokenizer_{repo_id}`) containing:
1. **Tokenizer files**: Can be loaded with `AutoProcessor.from_pretrained()`
2. **`metadata.json`**: Contains:
- Training configuration
- Compression statistics
- Dataset information
## Example Output
```
Loading dataset: lerobot/aloha_sim_insertion_human
Dataset loaded: 50 episodes, 5000 frames
Encoding 14 dimensions: 0:14
Delta dimensions: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
Action horizon: 10
Processing 50 episodes...
Collected 4500 action chunks
Extracted 14 encoded dimensions
Before normalization - overall stats:
Min: -2.3451, Max: 3.1234, Mean: 0.0234, Std: 0.8765
Applied quantile normalization [q01, q99] → [-1, 1]
After normalization - overall stats:
Min: -1.0000, Max: 1.0000, Mean: 0.0156, Std: 0.4321
Training FAST tokenizer on 4500 action chunks...
Action chunk shape: (4500, 10, 14)
Vocab size: 1024
DCT scale: 10.0
✓ Tokenizer training complete!
Compression Statistics:
Average compression ratio: 14.23x
Mean token length: 9.8
P99 token length: 15
Min token length: 6
Max token length: 18
✅ Saved FAST tokenizer to ./fast_tokenizer_lerobot_aloha_sim_insertion_human
```
## Using the Trained Tokenizer
```python
import numpy as np

from transformers import AutoProcessor

# Load tokenizer
tokenizer = AutoProcessor.from_pretrained(
    "./fast_tokenizer_lerobot_aloha_sim_insertion_human",
    trust_remote_code=True,
)

# Encode action chunk [horizon, action_dim]
action_chunk = np.random.randn(10, 14)  # Example
tokens = tokenizer(action_chunk[None])[0]  # Token IDs for the single chunk

# Decode expects a batch of token lists; wrap and unwrap accordingly
reconstructed = tokenizer.decode([tokens])[0]  # [horizon, action_dim]
```
## Tips
1. **Start Small**: Use `--max_episodes 10` for initial testing
2. **Check Dimensions**: Verify encoded dimensions match your robot's action space
3. **Delta Transform**: Use for position-based actions, not velocity-based
4. **Normalization**: Ensure dataset has proper statistics computed
5. **Compression Ratio**: Aim for 10-20x for good balance of compression and accuracy
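As a worked example, the `metadata.json` included in this commit uses an action horizon of 50 with 15 encoded dimensions, i.e. 750 values per chunk, at a mean of ~47.3 BPE tokens, which is a compression ratio of ~15.9x.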
## Troubleshooting
**Issue**: "No normalization stats found"
- **Solution**: Compute dataset statistics first, or use raw actions
**Issue**: "Episode too short for action horizon"
- **Solution**: Reduce `--action_horizon` or filter short episodes
**Issue**: "State key not found"
- **Solution**: Check dataset features and use correct `--state_key`
**Issue**: Memory error with large datasets
- **Solution**: Reduce `--sample_fraction` or `--max_episodes`
## Citation
If you use FAST in your research, please cite:
```bibtex
@article{pertsch2025fast,
  title={FAST: Efficient Action Tokenization for Vision-Language-Action Models},
  author={Pertsch, Karl and Stachowicz, Kyle and Ichter, Brian and others},
  journal={arXiv preprint},
  year={2025}
}
```
+21
@@ -0,0 +1,21 @@
lerobot-train \
--dataset.repo_id=lerobot \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0test1 \
--job_name=pi0_training \
--policy.repo_id=jade_choghari/pi0-base \
--policy.path=/fsx/jade_choghari/outputs/pi0_fast_fruit1/checkpoints/last/pretrained_model \
--policy.dtype=bfloat16 \
--steps=3000 \
--save_freq=1000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--batch_size=4 \
--policy.device=cuda \
# --wandb.enable=true \
# --wandb.disable_artifact=true \
# --wandb.project=pi05hi-training \
+1 -1
@@ -893,7 +893,7 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
fast_emb = self.fast_action_embedding(fast_action_tokens)
fast_emb_dim = fast_emb.shape[-1]
return fast_emb * math.sqrt(fast_emb_dim)
fast_action_emb = self._apply_checkpoint(fast_action_embed_func, fast_action_tokens)
embs.append(fast_action_emb)
+1 -1
@@ -178,7 +178,7 @@ def make_pi05_pre_post_processors(
padding="max_length",
),
ActionTokenizerProcessorStep(
tokenizer_name="physical-intelligence/fast",
tokenizer_name="/fsx/jade_choghari/outputs/fast_tokenizer", # TODO: jade put the PI
),
DeviceProcessorStep(device=config.device),
]
+22
@@ -0,0 +1,22 @@
export CUDA_LAUNCH_BLOCKING=1
lerobot-train \
--dataset.repo_id=local \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0_fast_fruit1 \
--job_name=pi0_training \
--policy.repo_id=jade_choghari/pi0-base1 \
--policy.path=lerobot/pi05_base \
--policy.dtype=bfloat16 \
--steps=200000 \
--save_freq=5000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--batch_size=4 \
--policy.device=cuda \
--wandb.enable=true \
--wandb.disable_artifact=true \
--wandb.project=pi05hi-training \
# /fsx/jade_choghari/.cache/huggingface/lerobot/jadechoghari/collect-data
+18
@@ -0,0 +1,18 @@
rm -rf /fsx/jade_choghari/outputs/pi0_multi_training
lerobot-train \
--dataset.repo_id=local \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0_multi_training \
--job_name=pi0_multi_training \
--policy.repo_id=jadechoghari/pi0-base1 \
--policy.path=lerobot/pi05_base \
--policy.dtype=bfloat16 \
--steps=50000 \
--save_freq=5000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--batch_size=32 \
--policy.device=cuda \
+9
@@ -0,0 +1,9 @@
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "local" \
--root "/fsx/jade_choghari/outputs/collect-data-pgen" \
--encoded_dims "0:15" \
--action_horizon 50 \
--vocab_size 1024 \
--scale 10.0 \
--output_dir "/fsx/jade_choghari/outputs/fast_tokenizer"
@@ -0,0 +1,410 @@
"""Train FAST tokenizer for action encoding.
This script:
1. Loads action chunks from LeRobotDataset (with sampling)
2. Applies delta transforms and per-timestamp normalization
3. Trains FAST tokenizer on specified action dimensions
4. Saves the tokenizer and metadata to the output directory
5. Reports compression statistics
"""
import json
import numpy as np
import tyro
from pathlib import Path
from transformers import AutoProcessor
import torch
from lerobot.datasets.lerobot_dataset import LeRobotDataset
def apply_delta_transform(state: np.ndarray, actions: np.ndarray, delta_dims: list[int] | None) -> np.ndarray:
"""Apply delta transform to specified dimensions.
Args:
state: Current state [D]
actions: Future actions [D]
delta_dims: List of dimension indices to apply delta transform to
Returns:
Transformed actions [D]
"""
if delta_dims is None or len(delta_dims) == 0:
return actions
delta_actions = actions.copy()
for dim in delta_dims:
delta_actions[dim] = actions[dim] - state[dim]
return delta_actions
def process_episode(args):
"""Process single episode and return action chunks."""
dataset, ep_idx, action_horizon, delta_dims, sample_fraction, state_key, use_delta_transform = args
try:
# Get episode info
ep_info = dataset.meta.episodes[ep_idx]
from_idx = ep_info["dataset_from_index"]
to_idx = ep_info["dataset_to_index"]
ep_length = to_idx - from_idx
if ep_length < action_horizon:
return None
# Load all frames in episode
# If dataset has episode filtering, we need to use the mapping
states = []
actions = []
for abs_idx in range(from_idx, to_idx):
# Map absolute index to relative index if needed
if dataset._absolute_to_relative_idx is not None:
if abs_idx not in dataset._absolute_to_relative_idx:
# This episode's frames aren't in the filtered dataset
return None
rel_idx = dataset._absolute_to_relative_idx[abs_idx]
else:
rel_idx = abs_idx
frame = dataset.hf_dataset[rel_idx]
# Get state (could be from observation.state or other state key)
if state_key in frame:
state = frame[state_key].numpy() if torch.is_tensor(frame[state_key]) else np.array(frame[state_key])
else:
# If no state key, use zeros (no delta transform)
state = np.zeros_like(frame["action"].numpy() if torch.is_tensor(frame["action"]) else np.array(frame["action"]))
action = frame["action"].numpy() if torch.is_tensor(frame["action"]) else np.array(frame["action"])
states.append(state)
actions.append(action)
states = np.array(states)
actions = np.array(actions)
# Create action chunks (sliding window)
# All actions in a chunk are relative to the FIRST state in that chunk
action_chunks = []
for i in range(len(states) - action_horizon + 1):
current_state = states[i] # First state in chunk
future_absolute_actions = actions[i:i + action_horizon]
if use_delta_transform:
# Relative actions
delta_chunk = np.zeros_like(future_absolute_actions)
for t in range(action_horizon):
delta_chunk[t] = apply_delta_transform(
current_state,
future_absolute_actions[t],
delta_dims,
)
action_chunks.append(delta_chunk)
else:
# Absolute actions (NO delta)
action_chunks.append(future_absolute_actions)
if len(action_chunks) == 0:
return None
action_chunks = np.array(action_chunks)
# Sample chunks
if sample_fraction < 1.0:
n_chunks = len(action_chunks)
n_samples = max(1, int(n_chunks * sample_fraction))
episode_seed = hash(ep_idx) % (2**31)
rng = np.random.RandomState(episode_seed)
indices = rng.choice(n_chunks, size=n_samples, replace=False)
action_chunks = action_chunks[indices]
return action_chunks
except Exception as e:
print(f"Error processing episode {ep_idx}: {e}")
import traceback
traceback.print_exc()
return None
def train_fast_tokenizer(
action_chunks: np.ndarray,
vocab_size: int = 1024,
scale: float = 10.0,
) -> AutoProcessor:
"""
Train FAST tokenizer (BPE on DCT coefficients) on action chunks.
Uses the .fit() method to train a new tokenizer on the provided data.
Args:
action_chunks: Array of action chunks [N, H, D] where N=num_chunks, H=horizon, D=action_dim
vocab_size: BPE vocabulary size
scale: DCT scaling factor for quantization
Returns:
Trained FAST tokenizer
"""
print(f"Training FAST tokenizer on {len(action_chunks)} action chunks...")
print(f"Action chunk shape: {action_chunks.shape}")
print(f"Vocab size: {vocab_size}")
print(f"DCT scale: {scale}")
# Download the tokenizer source code (not pretrained weights)
# We'll train a new tokenizer on our own data
base_tokenizer = AutoProcessor.from_pretrained(
"physical-intelligence/fast",
trust_remote_code=True
)
# Convert action_chunks array to list of arrays (expected by .fit())
action_data_list = [action_chunks[i] for i in range(len(action_chunks))]
# Train the new tokenizer on our action data using .fit()
# This trains the BPE tokenizer on DCT coefficients
print("Training new tokenizer (this may take a few minutes)...")
tokenizer = base_tokenizer.fit(
action_data_list,
scale=scale,
vocab_size=vocab_size,
time_horizon=action_chunks.shape[1], # action_horizon
action_dim=action_chunks.shape[2], # encoded dimensions
)
print("✓ Tokenizer training complete!")
# Validate it works
sample_chunk = action_chunks[0]
encoded = tokenizer(sample_chunk[None])[0]
if isinstance(encoded, list):
encoded = np.array(encoded)
print(f"Sample encoding: {len(encoded)} tokens for chunk shape {sample_chunk.shape}")
return tokenizer
def compute_compression_stats(tokenizer, action_chunks: np.ndarray):
"""Compute compression statistics."""
print("\nComputing compression statistics...")
# Sample for stats (use max 1000 chunks for speed)
sample_size = min(1000, len(action_chunks))
sample_indices = np.random.RandomState(42).choice(len(action_chunks), size=sample_size, replace=False)
sample_chunks = action_chunks[sample_indices]
token_lengths = []
for chunk in sample_chunks:
encoded = tokenizer(chunk[None])[0]
if isinstance(encoded, list):
token_lengths.append(len(encoded))
else:
token_lengths.append(encoded.shape[0] if hasattr(encoded, 'shape') else len(encoded))
token_lengths = np.array(token_lengths)
# Compression ratio: (H * D) / avg_tokens
input_size = action_chunks.shape[1] * action_chunks.shape[2]
avg_tokens = np.mean(token_lengths)
compression_ratio = input_size / avg_tokens
stats = {
'compression_ratio': float(compression_ratio),
'mean_token_length': float(np.mean(token_lengths)),
'p99_token_length': float(np.percentile(token_lengths, 99)),
'min_token_length': float(np.min(token_lengths)),
'max_token_length': float(np.max(token_lengths)),
}
print(f"Compression Statistics:")
print(f" Average compression ratio: {stats['compression_ratio']:.2f}x")
print(f" Mean token length: {stats['mean_token_length']:.1f}")
print(f" P99 token length: {stats['p99_token_length']:.0f}")
print(f" Min token length: {stats['min_token_length']:.0f}")
print(f" Max token length: {stats['max_token_length']:.0f}")
return stats
def main(
repo_id: str,
root: str | None = None,
action_horizon: int = 10,
max_episodes: int | None = None,
sample_fraction: float = 0.1,
encoded_dims: str = "0:6,7:23",
delta_dims: str | None = None,
use_delta_transform: bool = False,
state_key: str = "observation.state",
vocab_size: int = 1024,
scale: float = 10.0,
output_dir: str | None = None,
):
"""
Train FAST tokenizer for action encoding.
Args:
repo_id: LeRobot dataset repository ID
root: Root directory for dataset (default: ~/.cache/huggingface/lerobot)
action_horizon: Number of future actions in each chunk
max_episodes: Max episodes to use (None = all episodes in dataset)
sample_fraction: Fraction of chunks to sample per episode
encoded_dims: Comma-separated dimension ranges to encode (e.g., "0:6,7:23")
delta_dims: Comma-separated dimension indices for delta transform (e.g., "0,1,2,3,4,5")
use_delta_transform: Whether to apply delta transform (relative actions vs absolute actions)
state_key: Dataset key for state observations (default: "observation.state")
vocab_size: FAST vocabulary size (BPE vocab size)
scale: DCT scaling factor (default: 10.0)
output_dir: Directory to save tokenizer (default: ./fast_tokenizer_{repo_id})
"""
# Load dataset
print(f"Loading dataset: {repo_id}")
dataset = LeRobotDataset(repo_id=repo_id, root=root)
print(f"Dataset loaded: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
# Parse encoded dimensions
encoded_dim_ranges = []
for range_str in encoded_dims.split(','):
start, end = map(int, range_str.strip().split(':'))
encoded_dim_ranges.append((start, end))
total_encoded_dims = sum(end - start for start, end in encoded_dim_ranges)
print(f"Encoding {total_encoded_dims} dimensions: {encoded_dims}")
# Parse delta dimensions
delta_dim_list = None
if delta_dims is not None and delta_dims.strip():
delta_dim_list = [int(d.strip()) for d in delta_dims.split(',')]
print(f"Delta dimensions: {delta_dim_list}")
else:
print("No delta dimensions specified")
print(f"Use delta transform: {use_delta_transform}")
if use_delta_transform and (delta_dim_list is None or len(delta_dim_list) == 0):
print("Warning: use_delta_transform=True but no delta_dims specified. No delta will be applied.")
print(f"Action horizon: {action_horizon}")
print(f"State key: {state_key}")
# Determine episodes to process
num_episodes = dataset.num_episodes
if max_episodes is not None:
num_episodes = min(max_episodes, num_episodes)
print(f"Processing {num_episodes} episodes...")
# Process episodes sequentially (to avoid pickling issues with dataset)
all_chunks = []
for ep_idx in range(num_episodes):
if ep_idx % 10 == 0:
print(f" Processing episode {ep_idx}/{num_episodes}...")
chunks = process_episode(
(dataset, ep_idx, action_horizon, delta_dim_list, sample_fraction, state_key, use_delta_transform)
)
if chunks is not None:
all_chunks.append(chunks)
# Concatenate all chunks
all_chunks = np.concatenate(all_chunks, axis=0)
print(f"Collected {len(all_chunks)} action chunks")
# Extract only encoded dimensions FIRST (before normalization)
encoded_chunks = []
for start, end in encoded_dim_ranges:
encoded_chunks.append(all_chunks[:, :, start:end])
encoded_chunks = np.concatenate(encoded_chunks, axis=-1) # [N, H, D_encoded]
print(f"Extracted {encoded_chunks.shape[-1]} encoded dimensions")
# Apply normalization to encoded dimensions only
# NOTE: For FAST, we ALWAYS use QUANTILE normalization (no per-timestamp)
# This clips outliers and provides consistent [-1, 1] range for DCT compression
print(f"\nBefore normalization - overall stats:")
print(f" Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
print(f" Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")
norm_stats = dataset.meta.stats
if norm_stats is not None and "action" in norm_stats:
action_stats = norm_stats["action"]
# Build encoded dimension indices
encoded_dim_indices = []
for start, end in encoded_dim_ranges:
encoded_dim_indices.extend(range(start, end))
encoded_dim_indices = np.array(encoded_dim_indices)
# Use QUANTILE normalization: clip to [q01, q99] and map to [-1, 1]
if "q01" in action_stats and "q99" in action_stats:
q01 = np.array(action_stats["q01"])[encoded_dim_indices] # [D_encoded]
q99 = np.array(action_stats["q99"])[encoded_dim_indices] # [D_encoded]
print(f"\nNormalization stats (q01, q99) for encoded dimensions:")
for i, dim_idx in enumerate(encoded_dim_indices):
print(f" Orig dim {dim_idx}: q01={q01[i]:7.4f}, q99={q99[i]:7.4f}, range={q99[i]-q01[i]:7.4f}")
# Clip to quantile range and normalize to [-1, 1]
encoded_chunks = np.clip(encoded_chunks, q01, q99)
encoded_chunks = 2.0 * (encoded_chunks - q01) / np.maximum(q99 - q01, 1e-6) - 1.0
print(f"\nApplied quantile normalization [q01, q99] → [-1, 1]")
print(f"\nAfter normalization - overall stats:")
print(f" Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
print(f" Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")
print(f"\nPer-dimension stats (after normalization):")
for d in range(encoded_chunks.shape[-1]):
dim_data = encoded_chunks[:, :, d]
print(f" Dim {d}: min={np.min(dim_data):7.4f}, max={np.max(dim_data):7.4f}, "
f"mean={np.mean(dim_data):7.4f}, std={np.std(dim_data):7.4f}")
else:
print("Warning: q01/q99 stats not found, using raw actions")
else:
print("Warning: No normalization stats found, using raw actions")
print(f"Encoded chunks shape: {encoded_chunks.shape}")
# Train FAST tokenizer
tokenizer = train_fast_tokenizer(
encoded_chunks,
vocab_size=vocab_size,
scale=scale,
)
# Compute compression statistics
compression_stats = compute_compression_stats(tokenizer, encoded_chunks)
# Save tokenizer
if output_dir is None:
output_dir = f"fast_tokenizer_{repo_id.replace('/', '_')}"
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(output_path)
# Save metadata
metadata = {
'repo_id': repo_id,
'vocab_size': vocab_size,
'scale': scale,
'encoded_dims': encoded_dims,
'encoded_dim_ranges': encoded_dim_ranges,
'total_encoded_dims': total_encoded_dims,
'delta_dims': delta_dims,
'delta_dim_list': delta_dim_list,
'use_delta_transform': use_delta_transform,
'state_key': state_key,
'action_horizon': action_horizon,
'num_training_chunks': len(encoded_chunks),
'compression_stats': compression_stats,
}
with open(output_path / "metadata.json", 'w') as f:
json.dump(metadata, f, indent=2)
print(f"\n✅ Saved FAST tokenizer to {output_path}")
print(f"Metadata: {json.dumps(metadata, indent=2)}")
if __name__ == "__main__":
tyro.cli(main)
@@ -0,0 +1,101 @@
# Train FAST Tokenizer - Usage Examples
This script trains a FAST (Frequency-space Action Sequence Tokenization) tokenizer on LeRobotDataset action data.
## Basic Usage
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:7" \
--vocab_size 1024 \
--scale 10.0
```
## Parameters
### Required
- `--repo_id`: LeRobot dataset repository ID (e.g., "lerobot/aloha_sim_insertion_human")
### Optional
- `--root`: Root directory for dataset (default: ~/.cache/huggingface/lerobot)
- `--action_horizon`: Number of future actions in each chunk (default: 10)
- `--max_episodes`: Maximum number of episodes to use (default: None = all)
- `--sample_fraction`: Fraction of chunks to sample per episode (default: 0.1)
- `--encoded_dims`: Comma-separated dimension ranges to encode (default: "0:6,7:23")
- Example: "0:7" encodes dimensions 0-6
- Example: "0:3,6:9" encodes dimensions 0-2 and 6-8
- `--delta_dims`: Comma-separated dimension indices for delta transform (default: None)
- Example: "0,1,2,3,4,5" applies delta transform to first 6 dimensions
- Delta transform: action[i] - state[i] for specified dimensions
- `--state_key`: Dataset key for state observations (default: "observation.state")
- `--vocab_size`: FAST vocabulary size / BPE vocab size (default: 1024)
- `--scale`: DCT scaling factor (default: 10.0)
- `--output_dir`: Directory to save tokenizer (default: ./fast_tokenizer_{repo_id})
## Examples
### Example 1: Train on full action space
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/pusht" \
--action_horizon 16 \
--encoded_dims "0:2" \
--vocab_size 512 \
--max_episodes 100
```
### Example 2: Train with delta transform
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:14" \
--delta_dims "0,1,2,3,4,5,6,7,8,9,10,11,12,13" \
--state_key "observation.state" \
--vocab_size 1024 \
--scale 10.0 \
--sample_fraction 0.2
```
### Example 3: Train on subset of dimensions
```bash
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
--repo_id "lerobot/aloha_sim_insertion_human" \
--action_horizon 10 \
--encoded_dims "0:7" \
--vocab_size 1024 \
--output_dir "./my_tokenizer"
```
## Output
The script saves:
1. **Tokenizer files**: Trained FAST tokenizer (can be loaded with `AutoProcessor.from_pretrained()`)
2. **metadata.json**: Contains:
- Configuration parameters
- Compression statistics (compression ratio, token lengths)
- Training dataset information
## Understanding the Process
1. **Load Dataset**: Loads the LeRobotDataset from HuggingFace
2. **Extract Action Chunks**: Creates sliding windows of actions with the specified horizon (see the sketch after this list)
3. **Apply Delta Transform**: (Optional) Computes action deltas relative to current state
4. **Select Encoded Dimensions**: Extracts only the dimensions to be encoded
5. **Normalize**: Applies quantile normalization ([q01, q99] → [-1, 1])
6. **Train Tokenizer**: Trains BPE tokenizer on DCT coefficients
7. **Compute Stats**: Reports compression ratio and token length statistics
8. **Save**: Saves tokenizer and metadata
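Step 2 is a plain sliding window over each episode's actions. A minimal illustrative helper (the script builds the same chunks with an explicit loop inside `process_episode`):
```python
import numpy as np

def sliding_chunks(actions: np.ndarray, horizon: int) -> np.ndarray:
    """[T, D] episode actions -> [T - horizon + 1, horizon, D] overlapping chunks."""
    return np.stack([actions[i:i + horizon] for i in range(len(actions) - horizon + 1)])

episode_actions = np.random.randn(100, 14)   # 100 timesteps, 14-dim actions
chunks = sliding_chunks(episode_actions, horizon=10)
print(chunks.shape)                          # (91, 10, 14)
```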
## Notes
- **Normalization**: The script uses quantile normalization (q01, q99) from the dataset's statistics (see the sketch after this list)
- **Sampling**: To speed up training, you can sample a fraction of chunks per episode
- **Delta Transform**: Applied per-dimension to make actions relative to current state
- **Compression**: FAST uses DCT + BPE to compress action sequences efficiently
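The quantile normalization mentioned in the notes reduces to the formula below (the same expression used in `train_fast_tokenizer.py`; `q01` and `q99` are per-dimension arrays taken from the dataset statistics and broadcast over `[N, H, D]` chunks):
```python
import numpy as np

def quantile_normalize(chunks: np.ndarray, q01: np.ndarray, q99: np.ndarray) -> np.ndarray:
    """Clip each dimension to its [q01, q99] range, then map linearly to [-1, 1]."""
    clipped = np.clip(chunks, q01, q99)
    return 2.0 * (clipped - q01) / np.maximum(q99 - q01, 1e-6) - 1.0

chunks = np.random.randn(4, 10, 3)                         # [N, H, D]
q01, q99 = np.full(3, -2.0), np.full(3, 2.0)
print(quantile_normalize(chunks, q01, q99).min() >= -1.0)  # True
```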
+23
@@ -0,0 +1,23 @@
rm -rf /fsx/jade_choghari/outputs/pi0_multi_training
accelerate launch --multi_gpu --num_processes=2 \
$(which lerobot-train) \
--dataset.repo_id=local \
--dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
--output_dir=/fsx/jade_choghari/outputs/pi0_multi_training \
--job_name=pi0_multi_training \
--policy.repo_id=jadechoghari/pi0-base1 \
--policy.path=lerobot/pi05_base \
--policy.dtype=bfloat16 \
--steps=50000 \
--save_freq=5000 \
--rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
--policy.gradient_checkpointing=true \
--batch_size=1 \
--policy.device=cpu
# --wandb.enable=true \
# --wandb.disable_artifact=true \
# --wandb.project=pi05hi-training \
-2
@@ -90,8 +90,6 @@ def update_policy(
# Let accelerator handle mixed precision
with accelerator.autocast():
loss, output_dict = policy.forward(batch)
action = policy.select_action(batch)
breakpoint()
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
# Use accelerator's backward method