#!/bin/bash
#
# lerobot/examples/dataset/run_pgen.sh
#
# Example script to run synthetic data generation with a Qwen VLM.
# It generates user prompts and robot utterances for hierarchical policy
# training by invoking examples/dataset/annotate_pgen.py.
#
# Usage: the Python script is referenced by a relative path, so run this from
# the directory that contains examples/, e.g.:
#   bash examples/dataset/run_pgen.sh

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any pipeline stage, instead of continuing with bad arguments.
set -euo pipefail

# Configuration
REPO_ID="lerobot/svla_so101_pickplace"
MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
# Alternative: MODEL="Qwen/Qwen2-VL-7B-Instruct"

# NOTE(review): absolute path that looks machine/user-specific; adjust it
# before running this example elsewhere.
OUTPUT_DIR="/fsx/jade_choghari/outputs/pgen_annotations1"
BATCH_SIZE=32
TEMPERATURE=0.9
SAMPLE_INTERVAL=5.0  # Generate dialogue every 5 seconds (all episodes processed)

# Run synthetic data generation (processes ALL episodes)
python examples/dataset/annotate_pgen.py \
    --repo-id "$REPO_ID" \
    --model "$MODEL" \
    --output-dir "$OUTPUT_DIR" \
    --temperature "$TEMPERATURE" \
    --batch-size "$BATCH_SIZE" \
    --sample-interval "$SAMPLE_INTERVAL" \
    --num-image-views-per-sample 1

# For faster testing, increase the sample interval beyond the 5.0 set above, e.g.:
# --sample-interval 10.0  # Samples every 10 seconds (fewer samples, much faster)

# To push to hub after generation:
# Add --push-to-hub flag

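# Efficient batch processing in video mode: --video-batch-size is taken from
# BATCH_SIZE (32 as configured above), presumably the number of episodes per batch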
# python examples/dataset/annotate_pgen.py \
#     --repo-id "$REPO_ID" \
#     --model "$MODEL" \
#     --output-dir "$OUTPUT_DIR" \
#     --video-mode \
#     --video-key observation.images.up \
#     --video-batch-size "$BATCH_SIZE" \
#     --sample-interval 1.0