mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-26 22:20:06 +00:00
add more changes
This commit is contained in:
@@ -1402,6 +1402,13 @@ def main():
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Push modified dataset to HuggingFace Hub",
|
help="Push modified dataset to HuggingFace Hub",
|
||||||
)
|
)
|
||||||
|
# add image key
|
||||||
|
parser.add_argument(
|
||||||
|
"--image-key",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Image observation key to use for image mode (default: None)",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
console = Console()
|
console = Console()
|
||||||
@@ -1443,7 +1450,10 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Get image keys (for image mode)
|
# Get image keys (for image mode)
|
||||||
image_keys = dataset.meta.camera_keys[:args.num_image_views_per_sample]
|
if args.image_key:
|
||||||
|
image_keys = [args.image_key]
|
||||||
|
else:
|
||||||
|
image_keys = dataset.meta.camera_keys[:args.num_image_views_per_sample]
|
||||||
if not args.video_mode:
|
if not args.video_mode:
|
||||||
console.print(f"[cyan]Using image keys: {image_keys}[/cyan]")
|
console.print(f"[cyan]Using image keys: {image_keys}[/cyan]")
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
|
python examples/dataset/annotate.py \
|
||||||
|
--repo-id jadechoghari/collect-data \
|
||||||
|
--video-key observation.images.base \
|
||||||
|
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
|
||||||
|
--episodes 16 22
|
||||||
|
|
||||||
# python examples/dataset/annotate.py \
|
# python examples/dataset/annotate.py \
|
||||||
# --repo-id lerobot/svla_so101_pickplace \
|
# --repo-id lerobot/svla_so101_pickplace \
|
||||||
# --video-key observation.images.side \
|
# --video-key observation.images.side \
|
||||||
# --model Qwen/Qwen3-VL-30B-A3B-Instruct \
|
# --model Qwen/Qwen3-VL-30B-A3B-Instruct \
|
||||||
|
# --episodes 5
|
||||||
python examples/dataset/annotate.py \
|
|
||||||
--repo-id lerobot/svla_so101_pickplace \
|
|
||||||
--video-key observation.images.side \
|
|
||||||
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
|
|
||||||
--episodes 5
|
|
||||||
@@ -4,12 +4,12 @@
|
|||||||
# This generates user prompts and robot utterances for hierarchical policy training
|
# This generates user prompts and robot utterances for hierarchical policy training
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
REPO_ID="lerobot/svla_so101_pickplace"
|
REPO_ID="jadechoghari/collect-data"
|
||||||
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
|
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
|
||||||
# Alternative: MODEL="Qwen/Qwen2-VL-7B-Instruct"
|
# Alternative: MODEL="Qwen/Qwen2-VL-7B-Instruct"
|
||||||
|
|
||||||
|
|
||||||
OUTPUT_DIR="/fsx/jade_choghari/outputs/pgen_annotations1"
|
OUTPUT_DIR="/fsx/jade_choghari/outputs/collect-data-pgen"
|
||||||
BATCH_SIZE=32
|
BATCH_SIZE=32
|
||||||
TEMPERATURE=0.9
|
TEMPERATURE=0.9
|
||||||
SAMPLE_INTERVAL=5.0 # Generate dialogue every 1 second (all episodes processed)
|
SAMPLE_INTERVAL=5.0 # Generate dialogue every 1 second (all episodes processed)
|
||||||
@@ -22,6 +22,7 @@ python examples/dataset/annotate_pgen.py \
|
|||||||
--temperature "$TEMPERATURE" \
|
--temperature "$TEMPERATURE" \
|
||||||
--batch-size "$BATCH_SIZE" \
|
--batch-size "$BATCH_SIZE" \
|
||||||
--sample-interval "$SAMPLE_INTERVAL" \
|
--sample-interval "$SAMPLE_INTERVAL" \
|
||||||
|
--image-key observation.images.base \
|
||||||
--num-image-views-per-sample 1
|
--num-image-views-per-sample 1
|
||||||
|
|
||||||
# For faster testing, increase sample interval:
|
# For faster testing, increase sample interval:
|
||||||
|
|||||||
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"repo_id": "local",
|
||||||
|
"vocab_size": 1024,
|
||||||
|
"scale": 10.0,
|
||||||
|
"encoded_dims": "0:15",
|
||||||
|
"encoded_dim_ranges": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
15
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"total_encoded_dims": 15,
|
||||||
|
"delta_dims": null,
|
||||||
|
"delta_dim_list": null,
|
||||||
|
"use_delta_transform": false,
|
||||||
|
"state_key": "observation.state",
|
||||||
|
"action_horizon": 50,
|
||||||
|
"num_training_chunks": 4900,
|
||||||
|
"compression_stats": {
|
||||||
|
"compression_ratio": 15.85791309863622,
|
||||||
|
"mean_token_length": 47.295,
|
||||||
|
"p99_token_length": 90.0,
|
||||||
|
"min_token_length": 9.0,
|
||||||
|
"max_token_length": 109.0
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,158 @@
|
|||||||
|
import logging
|
||||||
|
from typing import ClassVar
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from scipy.fft import dct
|
||||||
|
from scipy.fft import idct
|
||||||
|
from tokenizers import ByteLevelBPETokenizer
|
||||||
|
from tokenizers.trainers import BpeTrainer
|
||||||
|
from transformers import PreTrainedTokenizerFast
|
||||||
|
from transformers.processing_utils import ProcessorMixin
|
||||||
|
|
||||||
|
|
||||||
|
class UniversalActionProcessor(ProcessorMixin):
    """Tokenizer for chunks of continuous actions (DCT + BPE).

    Encoding applies an orthonormal DCT along the time axis, multiplies the
    coefficients by ``scale``, rounds them to integers, shifts them by
    ``min_token``, maps every integer to the unicode character with that code
    point, and feeds the resulting string to a BPE tokenizer. Decoding runs
    the same steps in reverse.
    """

    attributes: ClassVar[list[str]] = ["bpe_tokenizer"]
    bpe_tokenizer_class: str = "AutoTokenizer"

    def __init__(
        self,
        bpe_tokenizer: PreTrainedTokenizerFast,
        scale: float = 10,
        vocab_size: int = 1024,
        min_token: int = 0,
        *,
        action_dim: int | None = None,
        time_horizon: int | None = None,
    ):
        """Store the quantization settings and register the BPE tokenizer.

        Args:
            bpe_tokenizer: Tokenizer applied to the character-encoded coefficients.
            scale: Factor the DCT coefficients are multiplied by before rounding.
            vocab_size: Vocabulary size the BPE tokenizer was trained with.
            min_token: Offset subtracted from the rounded coefficients when
                encoding and added back when decoding.
            action_dim: Optional action dimensionality used by ``decode``.
            time_horizon: Optional number of timesteps used by ``decode``.
        """
        self.scale = scale
        self.vocab_size = vocab_size
        self.min_token = min_token

        # Action horizon and dimension needed during decoding. These can be specified
        # in three ways (in order of priority):
        # 1. passed in as kwargs to decode()
        # 2. in the constructor
        # 3. cached from the last time decode() was called
        self.time_horizon = time_horizon
        self.action_dim = action_dim
        self.called_time_horizon = time_horizon
        self.called_action_dim = action_dim

        super().__init__(bpe_tokenizer)

    def __call__(self, action_chunk: np.ndarray) -> list[list[int]]:
        """Encode one chunk or a batch of chunks into BPE token ids.

        Args:
            action_chunk: Array shaped ``[timesteps, action_dim]`` or
                ``[batch, timesteps, action_dim]``; a 2-D input is treated as
                a batch of one.

        Returns:
            One list of token ids per chunk in the batch.
        """
        # Reject 0-D/1-D inputs here; they would otherwise fail below with an
        # unrelated IndexError when the shape is read.
        assert 2 <= action_chunk.ndim <= 3, "Only 3 dimensions supported: [batch, timesteps, action_dim]"
        if action_chunk.ndim == 2:
            action_chunk = action_chunk[None, ...]

        # Cache the time horizon and action dimension for decoding
        self.called_time_horizon = action_chunk.shape[-2]
        self.called_action_dim = action_chunk.shape[-1]

        dct_coeff = dct(action_chunk, axis=1, norm="ortho")
        dct_coeff = np.around(dct_coeff * self.scale)
        tokens = []
        for elem in dct_coeff:
            # Values below min_token are clamped to 0 so chr() never sees a negative code point.
            token_str = "".join(map(chr, np.maximum(elem.flatten() - self.min_token, 0).astype(int)))
            tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
        return tokens

    def decode(
        self,
        tokens: list[list[int]],
        *,
        time_horizon: int | None = None,
        action_dim: int | None = None,
    ) -> np.ndarray:
        """Decode a batch of token-id sequences back into action chunks.

        The resolved ``time_horizon`` and ``action_dim`` are stored on the
        instance, so they also apply to later calls that omit them.

        Args:
            tokens: Batch of token-id sequences (one inner list per chunk).
            time_horizon: Number of timesteps per chunk; falls back to the
                instance value, then to the value cached by the last call.
            action_dim: Action dimensionality; same fallback order.

        Returns:
            Stacked array with one ``(time_horizon, action_dim)`` chunk per
            sequence. A sequence that cannot be decoded is logged and
            replaced by all-zero DCT coefficients.
        """
        self.time_horizon = time_horizon or self.time_horizon or self.called_time_horizon
        self.action_dim = action_dim or self.action_dim or self.called_action_dim

        # Cache the time horizon and action dimension for the next call
        self.called_time_horizon = self.time_horizon
        self.called_action_dim = self.action_dim

        assert (
            self.time_horizon is not None and self.action_dim is not None
        ), "Tokenizer not initialized, call encode() once or pass in time_horizon and action_dim."

        expected_shape = (self.time_horizon, self.action_dim)
        decoded_actions = []
        for token in tokens:
            try:
                decoded_tokens = self.bpe_tokenizer.decode(token)
                decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.min_token
                decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
                # Explicit raise instead of assert so the check survives `python -O`.
                if decoded_dct_coeff.shape != expected_shape:
                    raise ValueError(
                        f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, "
                        f"expected ({self.time_horizon}, {self.action_dim})"
                    )
            except Exception as e:
                # Best effort: a malformed sequence must not abort the whole batch.
                logging.warning("Error decoding tokens: %s", e)
                logging.warning("Tokens: %s", token)
                decoded_dct_coeff = np.zeros(expected_shape)
            decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
        return np.stack(decoded_actions)

    @classmethod
    def fit(
        cls,
        action_data: list[np.ndarray],
        scale: float = 10,
        vocab_size: int = 1024,
        *,
        time_horizon: int | None = None,
        action_dim: int | None = None,
    ) -> "UniversalActionProcessor":
        """Train a new BPE tokenizer on the given action chunks.

        Args:
            action_data: Action chunks; the DCT is taken along axis 0 of each.
            scale: Factor the DCT coefficients are multiplied by before rounding.
            vocab_size: Target BPE vocabulary size.
            time_horizon: Stored on the returned processor for ``decode``.
            action_dim: Stored on the returned processor for ``decode``.

        Returns:
            A processor wrapping the newly trained tokenizer.

        Raises:
            ValueError: If ``action_data`` is empty.
            AssertionError: If the range of quantized coefficients exceeds ``vocab_size``.
        """
        if len(action_data) == 0:
            raise ValueError("action_data must contain at least one action chunk")

        # Run DCT over all inputs
        dct_tokens = [dct(a, axis=0, norm="ortho").flatten() for a in action_data]

        # Quantize once and find the token range
        quantized = np.around(np.concatenate(dct_tokens) * scale)
        max_token = int(quantized.max())
        min_token = int(quantized.min())
        min_vocab_size = max_token - min_token

        assert (
            min_vocab_size <= vocab_size
        ), f"Vocab size {vocab_size} is too small for the range of tokens {min_vocab_size}"
        if min_vocab_size + 100 > vocab_size:
            logging.warning(
                f"Initial alphabet size {min_vocab_size} is almost as large as the vocab "
                f"size {vocab_size}, consider increasing vocab size"
            )

        # Make token iterator for BPE training
        def _token_iter():
            for tokens in dct_tokens:
                rounded_tokens = np.around(tokens * scale) - min_token
                rounded_tokens = rounded_tokens.astype(int)
                string = "".join(map(chr, rounded_tokens))
                yield string

        # Train BPE tokenizer
        bpe = ByteLevelBPETokenizer()

        # Set up the entire range of possible tokens as the initial alphabet
        alphabet = [chr(i) for i in range(max_token - min_token + 1)]
        trainer = BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=2,
            show_progress=True,
            special_tokens=[],
            initial_alphabet=alphabet,
            max_token_length=10000,
        )

        # Train the inner tokenizer (don't use ByteLevelBPETokenizer.train_from_iterator()
        # because it doesn't support custom alphabets)
        bpe._tokenizer.train_from_iterator(_token_iter(), trainer=trainer)

        return cls(
            PreTrainedTokenizerFast(tokenizer_object=bpe, clean_up_tokenization_spaces=False),
            scale=scale,
            vocab_size=vocab_size,
            min_token=min_token,
            time_horizon=time_horizon,
            action_dim=action_dim,
        )
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
"action_dim": 15,
|
||||||
|
"auto_map": {
|
||||||
|
"AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
|
||||||
|
},
|
||||||
|
"min_token": -71,
|
||||||
|
"processor_class": "UniversalActionProcessor",
|
||||||
|
"scale": 10.0,
|
||||||
|
"time_horizon": 50,
|
||||||
|
"vocab_size": 1024
|
||||||
|
}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
"added_tokens_decoder": {},
|
||||||
|
"auto_map": {
|
||||||
|
"AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
|
||||||
|
},
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"extra_special_tokens": {},
|
||||||
|
"model_max_length": 1000000000000000019884624838656,
|
||||||
|
"processor_class": "UniversalActionProcessor",
|
||||||
|
"tokenizer_class": "PreTrainedTokenizerFast"
|
||||||
|
}
|
||||||
@@ -0,0 +1,196 @@
|
|||||||
|
# FAST Tokenizer Training for LeRobotDataset
|
||||||
|
|
||||||
|
This directory contains tools for training a FAST (Frequency-space Action Sequence Tokenization) tokenizer on LeRobot datasets.
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- **`train_fast_tokenizer.py`**: Main training script (refactored for LeRobotDataset)
|
||||||
|
- **`train_fast_tokenizer_example.md`**: Usage examples and parameter documentation
|
||||||
|
- **`MIGRATION_NOTES.md`**: Migration guide from B1K to LeRobotDataset
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Basic usage
|
||||||
|
python train_fast_tokenizer.py \
|
||||||
|
--repo_id "lerobot/aloha_sim_insertion_human" \
|
||||||
|
--action_horizon 10 \
|
||||||
|
--encoded_dims "0:14"
|
||||||
|
|
||||||
|
# With delta transform
|
||||||
|
python train_fast_tokenizer.py \
|
||||||
|
--repo_id "lerobot/aloha_sim_insertion_human" \
|
||||||
|
--action_horizon 10 \
|
||||||
|
--encoded_dims "0:14" \
|
||||||
|
--delta_dims "0,1,2,3,4,5,6,7,8,9,10,11,12,13" \
|
||||||
|
--state_key "observation.state" \
|
||||||
|
--vocab_size 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
## What is FAST?
|
||||||
|
|
||||||
|
FAST is a tokenizer for robotic action sequences that:
|
||||||
|
1. Applies DCT (Discrete Cosine Transform) to action chunks
|
||||||
|
2. Quantizes DCT coefficients
|
||||||
|
3. Uses BPE (Byte-Pair Encoding) to compress the quantized sequence
|
||||||
|
4. Achieves high compression ratios (e.g., 10-20x) while maintaining accuracy
|
||||||
|
|
||||||
|
This enables efficient storage and processing of long action sequences in vision-language-action models.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Python 3.10+
|
||||||
|
- LeRobot dataset (either local or from HuggingFace Hub)
|
||||||
|
- transformers (for AutoProcessor)
|
||||||
|
- numpy
|
||||||
|
- torch
|
||||||
|
- tyro
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
```
|
||||||
|
LeRobotDataset → Extract Episodes → Apply Delta Transform
|
||||||
|
↓
|
||||||
|
Select Dimensions → Normalize (q01, q99) → Create Chunks
|
||||||
|
↓
|
||||||
|
Train FAST Tokenizer → Compute Stats → Save
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parameters Guide
|
||||||
|
|
||||||
|
### Essential Parameters
|
||||||
|
|
||||||
|
- **`repo_id`**: HuggingFace dataset repository ID
|
||||||
|
- Example: `"lerobot/aloha_sim_insertion_human"`
|
||||||
|
|
||||||
|
- **`action_horizon`**: Length of action sequences to tokenize
|
||||||
|
- Typical: 10-16 steps
|
||||||
|
|
||||||
|
- **`encoded_dims`**: Which action dimensions to encode
|
||||||
|
- Format: `"start:end,start:end"`
|
||||||
|
- Example: `"0:7"` = dimensions 0-6
|
||||||
|
- Example: `"0:3,7:10"` = dimensions 0-2 and 7-9
|
||||||
|
|
||||||
|
### Optional Parameters
|
||||||
|
|
||||||
|
- **`delta_dims`**: Apply delta transform (action - state) to these dimensions
|
||||||
|
- Format: `"0,1,2,3,4,5"`
|
||||||
|
- Use for position-based actions
|
||||||
|
|
||||||
|
- **`state_key`**: Dataset key containing state observations
|
||||||
|
- Default: `"observation.state"`
|
||||||
|
|
||||||
|
- **`vocab_size`**: BPE vocabulary size
|
||||||
|
- Default: 1024
|
||||||
|
- Larger = better compression but more memory
|
||||||
|
|
||||||
|
- **`scale`**: DCT quantization scale
|
||||||
|
- Default: 10.0
|
||||||
|
- DCT coefficients are multiplied by `scale` before rounding, so larger = finer quantization (more distinct tokens), smaller = coarser
|
||||||
|
|
||||||
|
- **`sample_fraction`**: Fraction of action chunks to use per episode
|
||||||
|
- Default: 0.1 (10%)
|
||||||
|
- Increase for small datasets, decrease for large datasets
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
The script creates a directory (default: `./fast_tokenizer_{repo_id}`) containing:
|
||||||
|
|
||||||
|
1. **Tokenizer files**: Can be loaded with `AutoProcessor.from_pretrained()`
|
||||||
|
2. **`metadata.json`**: Contains:
|
||||||
|
- Training configuration
|
||||||
|
- Compression statistics
|
||||||
|
- Dataset information
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
|
||||||
|
```
|
||||||
|
Loading dataset: lerobot/aloha_sim_insertion_human
|
||||||
|
Dataset loaded: 50 episodes, 5000 frames
|
||||||
|
Encoding 14 dimensions: 0:14
|
||||||
|
Delta dimensions: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
|
||||||
|
Action horizon: 10
|
||||||
|
Processing 50 episodes...
|
||||||
|
Collected 4500 action chunks
|
||||||
|
Extracted 14 encoded dimensions
|
||||||
|
|
||||||
|
Before normalization - overall stats:
|
||||||
|
Min: -2.3451, Max: 3.1234, Mean: 0.0234, Std: 0.8765
|
||||||
|
|
||||||
|
Applied quantile normalization [q01, q99] → [-1, 1]
|
||||||
|
|
||||||
|
After normalization - overall stats:
|
||||||
|
Min: -1.0000, Max: 1.0000, Mean: 0.0156, Std: 0.4321
|
||||||
|
|
||||||
|
Training FAST tokenizer on 4500 action chunks...
|
||||||
|
Action chunk shape: (4500, 10, 14)
|
||||||
|
Vocab size: 1024
|
||||||
|
DCT scale: 10.0
|
||||||
|
✓ Tokenizer training complete!
|
||||||
|
|
||||||
|
Compression Statistics:
|
||||||
|
Average compression ratio: 14.23x
|
||||||
|
Mean token length: 9.8
|
||||||
|
P99 token length: 15
|
||||||
|
Min token length: 6
|
||||||
|
Max token length: 18
|
||||||
|
|
||||||
|
✅ Saved FAST tokenizer to ./fast_tokenizer_lerobot_aloha_sim_insertion_human
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using the Trained Tokenizer
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy as np
from transformers import AutoProcessor
|
||||||
|
|
||||||
|
# Load tokenizer
|
||||||
|
tokenizer = AutoProcessor.from_pretrained(
|
||||||
|
"./fast_tokenizer_lerobot_aloha_sim_insertion_human",
|
||||||
|
trust_remote_code=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Encode action chunk [horizon, action_dim]
|
||||||
|
action_chunk = np.random.randn(10, 14) # Example
|
||||||
|
tokens = tokenizer(action_chunk[None])[0] # Returns token IDs
|
||||||
|
|
||||||
|
# Decode tokens back to actions
|
||||||
|
reconstructed = tokenizer.decode([tokens])  # decode() expects a batch: a list of token-id lists
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tips
|
||||||
|
|
||||||
|
1. **Start Small**: Use `--max_episodes 10` for initial testing
|
||||||
|
2. **Check Dimensions**: Verify encoded dimensions match your robot's action space
|
||||||
|
3. **Delta Transform**: Use for position-based actions, not velocity-based
|
||||||
|
4. **Normalization**: Ensure dataset has proper statistics computed
|
||||||
|
5. **Compression Ratio**: Aim for 10-20x for good balance of compression and accuracy
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**Issue**: "No normalization stats found"
|
||||||
|
- **Solution**: Compute dataset statistics first, or use raw actions
|
||||||
|
|
||||||
|
**Issue**: "Episode too short for action horizon"
|
||||||
|
- **Solution**: Reduce `--action_horizon` or filter short episodes
|
||||||
|
|
||||||
|
**Issue**: "State key not found"
|
||||||
|
- **Solution**: Check dataset features and use correct `--state_key`
|
||||||
|
|
||||||
|
**Issue**: Memory error with large datasets
|
||||||
|
- **Solution**: Reduce `--sample_fraction` or `--max_episodes`
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
If you use FAST in your research, please cite:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@article{pertsch2025fast,
  title={FAST: Efficient Action Tokenization for Vision-Language-Action Models},
  author={Pertsch, Karl and others},
  journal={arXiv preprint arXiv:2501.09747},
  year={2025}
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
# Short training run (3000 steps) that starts from the checkpoint given in
# --policy.path and reads the dataset from --dataset.root.
# All /fsx paths are machine-specific; adjust them before running.
lerobot-train \
    --dataset.repo_id=lerobot \
    --dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
    --output_dir=/fsx/jade_choghari/outputs/pi0test1 \
    --job_name=pi0_training \
    --policy.repo_id=jade_choghari/pi0-base \
    --policy.path=/fsx/jade_choghari/outputs/pi0_fast_fruit1/checkpoints/last/pretrained_model \
    --policy.dtype=bfloat16 \
    --steps=3000 \
    --save_freq=1000 \
    --rename_map='{
    "observation.images.base": "observation.images.base_0_rgb",
    "observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
    "observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
    }' \
    --batch_size=4 \
    --policy.device=cuda

# The command above ends on its last real flag instead of continuing into
# commented-out lines. To enable Weights & Biases logging, add a trailing
# backslash after --policy.device=cuda and append:
#     --wandb.enable=true \
#     --wandb.disable_artifact=true \
#     --wandb.project=pi05hi-training
|
||||||
|
|
||||||
@@ -178,7 +178,7 @@ def make_pi05_pre_post_processors(
|
|||||||
padding="max_length",
|
padding="max_length",
|
||||||
),
|
),
|
||||||
ActionTokenizerProcessorStep(
|
ActionTokenizerProcessorStep(
|
||||||
tokenizer_name="physical-intelligence/fast",
|
tokenizer_name="/fsx/jade_choghari/outputs/fast_tokenizer", # TODO: jade put the PI
|
||||||
),
|
),
|
||||||
DeviceProcessorStep(device=config.device),
|
DeviceProcessorStep(device=config.device),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
# Long training run (200000 steps) starting from lerobot/pi05_base, with
# Weights & Biases logging enabled.
# All /fsx paths are machine-specific; adjust them before running.

# Makes CUDA kernel launches synchronous so errors surface at the failing call.
# NOTE(review): this is a debugging aid and slows training; confirm it is still wanted.
export CUDA_LAUNCH_BLOCKING=1

lerobot-train \
    --dataset.repo_id=local \
    --dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
    --output_dir=/fsx/jade_choghari/outputs/pi0_fast_fruit1 \
    --job_name=pi0_training \
    --policy.repo_id=jade_choghari/pi0-base1 \
    --policy.path=lerobot/pi05_base \
    --policy.dtype=bfloat16 \
    --steps=200000 \
    --save_freq=5000 \
    --rename_map='{
    "observation.images.base": "observation.images.base_0_rgb",
    "observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
    "observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
    }' \
    --batch_size=4 \
    --policy.device=cuda \
    --wandb.enable=true \
    --wandb.disable_artifact=true \
    --wandb.project=pi05hi-training
# /fsx/jade_choghari/.cache/huggingface/lerobot/jadechoghari/collect-data
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# Training run (50000 steps, batch size 32) starting from lerobot/pi05_base.
# All /fsx paths are machine-specific; adjust them before running.

# Start from a clean output directory: any previous run at this path is deleted.
rm -rf /fsx/jade_choghari/outputs/pi0_multi_training

# Every continued line has a space before its trailing backslash so the flag
# can never fuse with the next one, and the last flag carries no backslash.
lerobot-train \
    --dataset.repo_id=local \
    --dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
    --output_dir=/fsx/jade_choghari/outputs/pi0_multi_training \
    --job_name=pi0_multi_training \
    --policy.repo_id=jadechoghari/pi0-base1 \
    --policy.path=lerobot/pi05_base \
    --policy.dtype=bfloat16 \
    --steps=50000 \
    --save_freq=5000 \
    --rename_map='{
    "observation.images.base": "observation.images.base_0_rgb",
    "observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
    "observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
    }' \
    --batch_size=32 \
    --policy.device=cuda
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
# Train a FAST action tokenizer on the local dataset and write it to --output_dir.
# --action_horizon is passed once: the earlier duplicate value (16) was dropped
# in favour of 50, the horizon recorded in the tokenizer's saved metadata.
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
    --repo_id "local" \
    --root "/fsx/jade_choghari/outputs/collect-data-pgen" \
    --encoded_dims "0:15" \
    --action_horizon 50 \
    --vocab_size 1024 \
    --scale 10.0 \
    --output_dir "/fsx/jade_choghari/outputs/fast_tokenizer"
|
||||||
@@ -0,0 +1,410 @@
|
|||||||
|
"""Train FAST tokenizer for action encoding.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Loads action chunks from LeRobotDataset (with sampling)
|
||||||
|
2. Applies delta transforms and per-timestamp normalization
|
||||||
|
3. Trains FAST tokenizer on specified action dimensions
|
||||||
|
4. Saves tokenizer to assets directory
|
||||||
|
5. Reports compression statistics
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import numpy as np
|
||||||
|
import tyro
|
||||||
|
from pathlib import Path
|
||||||
|
from transformers import AutoProcessor
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||||
|
|
||||||
|
|
||||||
|
def apply_delta_transform(state: np.ndarray, actions: np.ndarray, delta_dims: list[int] | None) -> np.ndarray:
|
||||||
|
"""Apply delta transform to specified dimensions.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
state: Current state [D]
|
||||||
|
actions: Future actions [D]
|
||||||
|
delta_dims: List of dimension indices to apply delta transform to
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Transformed actions [D]
|
||||||
|
"""
|
||||||
|
if delta_dims is None or len(delta_dims) == 0:
|
||||||
|
return actions
|
||||||
|
|
||||||
|
delta_actions = actions.copy()
|
||||||
|
for dim in delta_dims:
|
||||||
|
delta_actions[dim] = actions[dim] - state[dim]
|
||||||
|
|
||||||
|
return delta_actions
|
||||||
|
|
||||||
|
|
||||||
|
def process_episode(args):
    """Build sliding-window action chunks for a single episode.

    Args:
        args: Tuple ``(dataset, ep_idx, action_horizon, delta_dims,
            sample_fraction, state_key, use_delta_transform)``.

    Returns:
        Array of chunks with ``action_horizon`` steps each, or ``None`` when
        the episode is shorter than the horizon, one of its frames is missing
        from the dataset's index mapping, no chunk could be built, or an
        exception occurred (it is printed together with a traceback).
    """
    (
        dataset,
        ep_idx,
        action_horizon,
        delta_dims,
        sample_fraction,
        state_key,
        use_delta_transform,
    ) = args

    def _as_numpy(value):
        # Frames may hold torch tensors or plain sequences.
        if torch.is_tensor(value):
            return value.numpy()
        return np.array(value)

    try:
        # Frame range covered by this episode
        episode_meta = dataset.meta.episodes[ep_idx]
        first_frame = episode_meta["dataset_from_index"]
        last_frame = episode_meta["dataset_to_index"]

        if last_frame - first_frame < action_horizon:
            return None

        episode_states = []
        episode_actions = []
        for absolute in range(first_frame, last_frame):
            # A filtered dataset exposes an absolute -> relative index mapping.
            index_map = dataset._absolute_to_relative_idx
            if index_map is None:
                relative = absolute
            elif absolute in index_map:
                relative = index_map[absolute]
            else:
                # This episode's frames aren't in the filtered dataset
                return None

            frame = dataset.hf_dataset[relative]

            if state_key in frame:
                state = _as_numpy(frame[state_key])
                action = _as_numpy(frame["action"])
            else:
                # Without a state entry, fall back to zeros shaped like the action.
                action = _as_numpy(frame["action"])
                state = np.zeros_like(action)

            episode_states.append(state)
            episode_actions.append(action)

        states = np.array(episode_states)
        actions = np.array(episode_actions)

        # Slide a window of action_horizon steps over the episode; with the
        # delta transform every step is made relative to the window's FIRST state.
        windows = []
        for begin in range(len(states) - action_horizon + 1):
            window = actions[begin:begin + action_horizon]
            if not use_delta_transform:
                # Absolute actions (NO delta)
                windows.append(window)
                continue

            anchor_state = states[begin]
            relative_window = np.zeros_like(window)
            for step in range(action_horizon):
                relative_window[step] = apply_delta_transform(anchor_state, window[step], delta_dims)
            windows.append(relative_window)

        if not windows:
            return None

        chunks = np.array(windows)

        # Keep a per-episode subset of the chunks; the seed is derived from the episode index.
        if sample_fraction < 1.0:
            total = len(chunks)
            keep = max(1, int(total * sample_fraction))
            sampler = np.random.RandomState(hash(ep_idx) % (2**31))
            chunks = chunks[sampler.choice(total, size=keep, replace=False)]

        return chunks

    except Exception as e:
        print(f"Error processing episode {ep_idx}: {e}")
        import traceback

        traceback.print_exc()
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def train_fast_tokenizer(
    action_chunks: np.ndarray,
    vocab_size: int = 1024,
    scale: float = 10.0,
) -> AutoProcessor:
    """Fit a new FAST tokenizer (BPE on DCT coefficients) to action chunks.

    The published ``physical-intelligence/fast`` processor is loaded only to
    reach its ``fit()`` method; the processor returned here is the one that
    ``fit()`` produces from ``action_chunks``.

    Args:
        action_chunks: Array of action chunks [N, H, D] where N=num_chunks, H=horizon, D=action_dim
        vocab_size: BPE vocabulary size
        scale: DCT scaling factor for quantization

    Returns:
        Trained FAST tokenizer
    """
    num_chunks = len(action_chunks)
    print(f"Training FAST tokenizer on {num_chunks} action chunks...")
    print(f"Action chunk shape: {action_chunks.shape}")
    print(f"Vocab size: {vocab_size}")
    print(f"DCT scale: {scale}")

    # Loads remote code from the Hub (trust_remote_code=True).
    processor_template = AutoProcessor.from_pretrained(
        "physical-intelligence/fast",
        trust_remote_code=True,
    )

    # .fit() is given a list with one [H, D] array per chunk.
    per_chunk_arrays = list(action_chunks)

    print("Training new tokenizer (this may take a few minutes)...")
    fitted = processor_template.fit(
        per_chunk_arrays,
        scale=scale,
        vocab_size=vocab_size,
        time_horizon=action_chunks.shape[1],  # action_horizon
        action_dim=action_chunks.shape[2],  # encoded dimensions
    )
    print("✓ Tokenizer training complete!")

    # Smoke test: encode the first chunk and report how many tokens it needs.
    first_chunk = action_chunks[0]
    first_encoding = fitted(first_chunk[None])[0]
    if isinstance(first_encoding, list):
        first_encoding = np.array(first_encoding)
    print(f"Sample encoding: {len(first_encoding)} tokens for chunk shape {first_chunk.shape}")

    return fitted
|
||||||
|
|
||||||
|
|
||||||
|
def compute_compression_stats(tokenizer, action_chunks: np.ndarray):
    """Measure how many tokens the tokenizer needs per action chunk.

    At most 1000 chunks are drawn without replacement using a fixed seed (42);
    each is encoded on its own and its token count recorded.

    Args:
        tokenizer: Callable that takes a ``[1, H, D]`` array and returns a
            sequence whose first element is the encoding of that chunk.
        action_chunks: Array of chunks; ``shape[1] * shape[2]`` is taken as
            the uncompressed size of one chunk.

    Returns:
        Dict of floats with the keys ``compression_ratio``,
        ``mean_token_length``, ``p99_token_length``, ``min_token_length`` and
        ``max_token_length``.
    """
    print("\nComputing compression statistics...")

    # Sample for stats (use max 1000 chunks for speed)
    total_chunks = len(action_chunks)
    eval_count = min(1000, total_chunks)
    picked = np.random.RandomState(42).choice(total_chunks, size=eval_count, replace=False)

    def _num_tokens(encoded):
        # Encodings may come back as plain lists or as array-like objects.
        if isinstance(encoded, list):
            return len(encoded)
        if hasattr(encoded, 'shape'):
            return encoded.shape[0]
        return len(encoded)

    token_lengths = np.array([_num_tokens(tokenizer(chunk[None])[0]) for chunk in action_chunks[picked]])

    # Compression ratio: (H * D) / avg_tokens
    values_per_chunk = action_chunks.shape[1] * action_chunks.shape[2]
    mean_length = np.mean(token_lengths)

    stats = {
        'compression_ratio': float(values_per_chunk / mean_length),
        'mean_token_length': float(mean_length),
        'p99_token_length': float(np.percentile(token_lengths, 99)),
        'min_token_length': float(np.min(token_lengths)),
        'max_token_length': float(np.max(token_lengths)),
    }

    print("Compression Statistics:")
    print(f"  Average compression ratio: {stats['compression_ratio']:.2f}x")
    print(f"  Mean token length: {stats['mean_token_length']:.1f}")
    print(f"  P99 token length: {stats['p99_token_length']:.0f}")
    print(f"  Min token length: {stats['min_token_length']:.0f}")
    print(f"  Max token length: {stats['max_token_length']:.0f}")

    return stats
|
||||||
|
|
||||||
|
|
||||||
|
def main(
    repo_id: str,
    root: str | None = None,
    action_horizon: int = 10,
    max_episodes: int | None = None,
    sample_fraction: float = 0.1,
    encoded_dims: str = "0:6,7:23",
    delta_dims: str | None = None,
    use_delta_transform: bool = False,
    state_key: str = "observation.state",
    vocab_size: int = 1024,
    scale: float = 10.0,
    output_dir: str | None = None,
):
    """
    Train FAST tokenizer for action encoding.

    Loads the dataset, collects action chunks episode by episode, keeps only the
    requested action dimensions, quantile-normalizes them when q01/q99 stats are
    available, trains the tokenizer, reports compression statistics and saves the
    tokenizer together with a ``metadata.json`` file.

    Args:
        repo_id: LeRobot dataset repository ID
        root: Root directory for dataset (default: ~/.cache/huggingface/lerobot)
        action_horizon: Number of future actions in each chunk
        max_episodes: Max episodes to use (None = all episodes in dataset)
        sample_fraction: Fraction of chunks to sample per episode
        encoded_dims: Comma-separated dimension ranges to encode (e.g., "0:6,7:23")
        delta_dims: Comma-separated dimension indices for delta transform (e.g., "0,1,2,3,4,5")
        use_delta_transform: Whether to apply delta transform (relative actions vs absolute actions)
        state_key: Dataset key for state observations (default: "observation.state")
        vocab_size: FAST vocabulary size (BPE vocab size)
        scale: DCT scaling factor (default: 10.0)
        output_dir: Directory to save tokenizer (default: ./fast_tokenizer_{repo_id})

    Raises:
        ValueError: If an ``encoded_dims`` entry is not of the form "start:end",
            if no action chunks could be collected, or if the requested ranges
            do not fit inside the action dimension of the collected chunks.
    """
    # Load dataset
    print(f"Loading dataset: {repo_id}")
    dataset = LeRobotDataset(repo_id=repo_id, root=root)
    print(f"Dataset loaded: {dataset.num_episodes} episodes, {dataset.num_frames} frames")

    # Parse encoded dimensions ("start:end" ranges, end exclusive)
    encoded_dim_ranges = []
    for range_str in encoded_dims.split(","):
        try:
            start, end = map(int, range_str.strip().split(":"))
        except ValueError as err:
            raise ValueError(
                f"Invalid encoded_dims entry {range_str!r}: expected 'start:end' with integer bounds"
            ) from err
        encoded_dim_ranges.append((start, end))

    total_encoded_dims = sum(end - start for start, end in encoded_dim_ranges)
    print(f"Encoding {total_encoded_dims} dimensions: {encoded_dims}")

    # Parse delta dimensions
    delta_dim_list = None
    if delta_dims is not None and delta_dims.strip():
        delta_dim_list = [int(d.strip()) for d in delta_dims.split(",")]
        print(f"Delta dimensions: {delta_dim_list}")
    else:
        print("No delta dimensions specified")

    print(f"Use delta transform: {use_delta_transform}")
    if use_delta_transform and not delta_dim_list:
        print("Warning: use_delta_transform=True but no delta_dims specified. No delta will be applied.")

    print(f"Action horizon: {action_horizon}")
    print(f"State key: {state_key}")

    # Determine episodes to process
    num_episodes = dataset.num_episodes
    if max_episodes is not None:
        num_episodes = min(max_episodes, num_episodes)

    print(f"Processing {num_episodes} episodes...")

    # Process episodes sequentially (to avoid pickling issues with dataset)
    all_chunks = []
    for ep_idx in range(num_episodes):
        if ep_idx % 10 == 0:
            print(f" Processing episode {ep_idx}/{num_episodes}...")

        chunks = process_episode(
            (dataset, ep_idx, action_horizon, delta_dim_list, sample_fraction, state_key, use_delta_transform)
        )
        if chunks is not None:
            all_chunks.append(chunks)

    if not all_chunks:
        # np.concatenate([]) would raise a generic "need at least one array" error.
        raise ValueError(
            f"No action chunks were collected from {num_episodes} episode(s); "
            "check max_episodes, action_horizon and sample_fraction"
        )

    # Concatenate all chunks
    all_chunks = np.concatenate(all_chunks, axis=0)
    print(f"Collected {len(all_chunks)} action chunks")

    # Extract only encoded dimensions FIRST (before normalization)
    encoded_chunks = np.concatenate(
        [all_chunks[:, :, start:end] for start, end in encoded_dim_ranges], axis=-1
    )  # [N, H, D_encoded]
    if encoded_chunks.shape[-1] != total_encoded_dims:
        # Slicing silently truncates out-of-range bounds, which would train on
        # fewer dimensions than requested and record wrong metadata.
        raise ValueError(
            f"encoded_dims={encoded_dims!r} selects {total_encoded_dims} dimensions but only "
            f"{encoded_chunks.shape[-1]} could be extracted from actions with "
            f"{all_chunks.shape[-1]} dimensions"
        )
    print(f"Extracted {encoded_chunks.shape[-1]} encoded dimensions")

    # Apply normalization to encoded dimensions only
    # NOTE: For FAST, we ALWAYS use QUANTILE normalization (no per-timestamp)
    # This clips outliers and provides consistent [-1, 1] range for DCT compression
    print("\nBefore normalization - overall stats:")
    print(f" Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
    print(f" Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")

    norm_stats = dataset.meta.stats
    if norm_stats is not None and "action" in norm_stats:
        action_stats = norm_stats["action"]

        # Build encoded dimension indices
        encoded_dim_indices = []
        for start, end in encoded_dim_ranges:
            encoded_dim_indices.extend(range(start, end))
        encoded_dim_indices = np.array(encoded_dim_indices)

        # Use QUANTILE normalization: clip to [q01, q99] and map to [-1, 1]
        if "q01" in action_stats and "q99" in action_stats:
            q01 = np.array(action_stats["q01"])[encoded_dim_indices]  # [D_encoded]
            q99 = np.array(action_stats["q99"])[encoded_dim_indices]  # [D_encoded]

            print("\nNormalization stats (q01, q99) for encoded dimensions:")
            for i, dim_idx in enumerate(encoded_dim_indices):
                print(f" Orig dim {dim_idx}: q01={q01[i]:7.4f}, q99={q99[i]:7.4f}, range={q99[i]-q01[i]:7.4f}")

            # Clip to quantile range and normalize to [-1, 1]; the 1e-6 floor
            # avoids dividing by zero for dimensions with q01 == q99.
            encoded_chunks = np.clip(encoded_chunks, q01, q99)
            encoded_chunks = 2.0 * (encoded_chunks - q01) / np.maximum(q99 - q01, 1e-6) - 1.0
            print("\nApplied quantile normalization [q01, q99] → [-1, 1]")

            print("\nAfter normalization - overall stats:")
            print(f" Min: {np.min(encoded_chunks):.4f}, Max: {np.max(encoded_chunks):.4f}")
            print(f" Mean: {np.mean(encoded_chunks):.4f}, Std: {np.std(encoded_chunks):.4f}")

            print("\nPer-dimension stats (after normalization):")
            for d in range(encoded_chunks.shape[-1]):
                dim_data = encoded_chunks[:, :, d]
                print(f" Dim {d}: min={np.min(dim_data):7.4f}, max={np.max(dim_data):7.4f}, "
                      f"mean={np.mean(dim_data):7.4f}, std={np.std(dim_data):7.4f}")
        else:
            print("Warning: q01/q99 stats not found, using raw actions")
    else:
        print("Warning: No normalization stats found, using raw actions")

    print(f"Encoded chunks shape: {encoded_chunks.shape}")

    # Train FAST tokenizer
    tokenizer = train_fast_tokenizer(
        encoded_chunks,
        vocab_size=vocab_size,
        scale=scale,
    )

    # Compute compression statistics
    compression_stats = compute_compression_stats(tokenizer, encoded_chunks)

    # Save tokenizer
    if output_dir is None:
        output_dir = f"fast_tokenizer_{repo_id.replace('/', '_')}"
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    tokenizer.save_pretrained(output_path)

    # Save metadata
    metadata = {
        "repo_id": repo_id,
        "vocab_size": vocab_size,
        "scale": scale,
        "encoded_dims": encoded_dims,
        "encoded_dim_ranges": encoded_dim_ranges,
        "total_encoded_dims": total_encoded_dims,
        "delta_dims": delta_dims,
        "delta_dim_list": delta_dim_list,
        "use_delta_transform": use_delta_transform,
        "state_key": state_key,
        "action_horizon": action_horizon,
        "num_training_chunks": len(encoded_chunks),
        "compression_stats": compression_stats,
    }

    # Explicit encoding keeps the file independent of the platform default.
    with open(output_path / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"\n✅ Saved FAST tokenizer to {output_path}")
    print(f"Metadata: {json.dumps(metadata, indent=2)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Expose main()'s parameters as command-line options and run it.
    tyro.cli(main)
|
||||||
@@ -0,0 +1,101 @@
|
|||||||
|
# Train FAST Tokenizer - Usage Examples
|
||||||
|
|
||||||
|
This script trains a FAST (Frequency-space Action Sequence Tokenization) tokenizer on LeRobotDataset action data.
|
||||||
|
|
||||||
|
## Basic Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
|
||||||
|
--repo_id "lerobot/aloha_sim_insertion_human" \
|
||||||
|
--action_horizon 10 \
|
||||||
|
--encoded_dims "0:7" \
|
||||||
|
--vocab_size 1024 \
|
||||||
|
--scale 10.0
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parameters
|
||||||
|
|
||||||
|
### Required
|
||||||
|
- `--repo_id`: LeRobot dataset repository ID (e.g., "lerobot/aloha_sim_insertion_human")
|
||||||
|
|
||||||
|
### Optional
|
||||||
|
- `--root`: Root directory for dataset (default: ~/.cache/huggingface/lerobot)
|
||||||
|
- `--action_horizon`: Number of future actions in each chunk (default: 10)
|
||||||
|
- `--max_episodes`: Maximum number of episodes to use (default: None = all)
|
||||||
|
- `--sample_fraction`: Fraction of chunks to sample per episode (default: 0.1)
|
||||||
|
- `--encoded_dims`: Comma-separated dimension ranges to encode (default: "0:6,7:23")
|
||||||
|
- Example: "0:7" encodes dimensions 0-6
|
||||||
|
- Example: "0:3,6:9" encodes dimensions 0-2 and 6-8
|
||||||
|
- `--delta_dims`: Comma-separated dimension indices for delta transform (default: None)
|
||||||
|
- Example: "0,1,2,3,4,5" applies delta transform to first 6 dimensions
|
||||||
|
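- Delta transform: action[i] - state[i] for specified dimensions
- `--use_delta_transform`: Whether to apply the delta transform to the `--delta_dims` dimensions (default: False). Set it together with `--delta_dims`; when `--use_delta_transform` is given without `--delta_dims`, the script warns that no delta will be applied.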
|
||||||
|
- `--state_key`: Dataset key for state observations (default: "observation.state")
|
||||||
|
- `--vocab_size`: FAST vocabulary size / BPE vocab size (default: 1024)
|
||||||
|
- `--scale`: DCT scaling factor (default: 10.0)
|
||||||
|
- `--output_dir`: Directory to save tokenizer (default: ./fast_tokenizer_{repo_id})
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Example 1: Train on full action space
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
|
||||||
|
--repo_id "lerobot/pusht" \
|
||||||
|
--action_horizon 16 \
|
||||||
|
--encoded_dims "0:2" \
|
||||||
|
--vocab_size 512 \
|
||||||
|
--max_episodes 100
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 2: Train with delta transform
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
|
||||||
|
--repo_id "lerobot/aloha_sim_insertion_human" \
|
||||||
|
--action_horizon 10 \
|
||||||
|
--encoded_dims "0:14" \
|
||||||
|
--delta_dims "0,1,2,3,4,5,6,7,8,9,10,11,12,13" \
|
||||||
|
--state_key "observation.state" \
|
||||||
|
--vocab_size 1024 \
|
||||||
|
--scale 10.0 \
|
||||||
|
--sample_fraction 0.2 \
--use_delta_transform
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 3: Train on subset of dimensions
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python src/lerobot/policies/pi05/train_fast_tokenizer.py \
|
||||||
|
--repo_id "lerobot/aloha_sim_insertion_human" \
|
||||||
|
--action_horizon 10 \
|
||||||
|
--encoded_dims "0:7" \
|
||||||
|
--vocab_size 1024 \
|
||||||
|
--output_dir "./my_tokenizer"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
The script saves:
|
||||||
|
1. **Tokenizer files**: Trained FAST tokenizer (can be loaded with `AutoProcessor.from_pretrained()`)
|
||||||
|
2. **metadata.json**: Contains:
|
||||||
|
- Configuration parameters
|
||||||
|
- Compression statistics (compression ratio, token lengths)
|
||||||
|
- Training dataset information
|
||||||
|
|
||||||
|
## Understanding the Process
|
||||||
|
|
||||||
|
1. **Load Dataset**: Loads the LeRobotDataset from HuggingFace
|
||||||
|
2. **Extract Action Chunks**: Creates sliding windows of actions with specified horizon
|
||||||
|
3. **Apply Delta Transform**: (Optional) Computes action deltas relative to current state
|
||||||
|
4. **Select Encoded Dimensions**: Extracts only the dimensions to be encoded
|
||||||
|
5. **Normalize**: Applies quantile normalization ([q01, q99] → [-1, 1])
|
||||||
|
6. **Train Tokenizer**: Trains BPE tokenizer on DCT coefficients
|
||||||
|
7. **Compute Stats**: Reports compression ratio and token length statistics
|
||||||
|
8. **Save**: Saves tokenizer and metadata
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- **Normalization**: The script uses quantile normalization (q01, q99) from the dataset's statistics
|
||||||
|
- **Sampling**: To speed up training, you can sample a fraction of chunks per episode
|
||||||
|
- **Delta Transform**: Applied per-dimension to make actions relative to current state
|
||||||
|
- **Compression**: FAST uses DCT + BPE to compress action sequences efficiently
|
||||||
|
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
# Launch lerobot-train through accelerate (2 processes) starting from lerobot/pi05_base.
OUTPUT_DIR=/fsx/jade_choghari/outputs/pi0_multi_training
JOB_NAME=pi0_multi_training

# Remove the output directory left over from a previous run before launching.
rm -rf "$OUTPUT_DIR"

accelerate launch --multi_gpu --num_processes=2 \
    $(which lerobot-train) \
    --dataset.repo_id=local \
    --dataset.root=/fsx/jade_choghari/outputs/collect-data-pgen \
    --output_dir="$OUTPUT_DIR" \
    --job_name="$JOB_NAME" \
    --policy.repo_id=jadechoghari/pi0-base1 \
    --policy.path=lerobot/pi05_base \
    --policy.dtype=bfloat16 \
    --steps=50000 \
    --save_freq=5000 \
    --rename_map='{
"observation.images.base": "observation.images.base_0_rgb",
"observation.images.left_wrist": "observation.images.left_wrist_0_rgb",
"observation.images.right_wrist": "observation.images.right_wrist_0_rgb",
}' \
    --policy.gradient_checkpointing=true \
    --batch_size=1 \
    --policy.device=cpu
# Optional Weights & Biases flags (currently disabled):
# --wandb.enable=true \
# --wandb.disable_artifact=true \
# --wandb.project=pi05hi-training \
|
||||||
@@ -90,8 +90,6 @@ def update_policy(
|
|||||||
# Let accelerator handle mixed precision
|
# Let accelerator handle mixed precision
|
||||||
with accelerator.autocast():
|
with accelerator.autocast():
|
||||||
loss, output_dict = policy.forward(batch)
|
loss, output_dict = policy.forward(batch)
|
||||||
action = policy.select_action(batch)
|
|
||||||
breakpoint()
|
|
||||||
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
|
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
|
||||||
|
|
||||||
# Use accelerator's backward method
|
# Use accelerator's backward method
|
||||||
|
|||||||
Reference in New Issue
Block a user