lerobot/examples/dataset/run_pgen.sh

#!/bin/bash
#
# Example script to run synthetic data generation with a Qwen VLM.
# It generates user prompts and robot utterances for hierarchical policy
# training by invoking examples/dataset/annotate_pgen.py.
#
# Usage:
#   run_pgen.sh [extra annotate_pgen.py flags...]
#
# The annotate_pgen.py path below is relative, so run this script from the
# directory that contains examples/dataset/annotate_pgen.py.
#
# Any arguments given to this script are appended verbatim to the
# annotate_pgen.py command line. For example, to push to the hub after
# generation:
#   run_pgen.sh --push-to-hub
#
# Optional environment overrides (the defaults are used when unset or empty):
#   PGEN_REPO_ID          dataset repo id
#   PGEN_MODEL            VLM checkpoint, e.g. "Qwen/Qwen2-VL-7B-Instruct"
#   PGEN_OUTPUT_DIR       directory passed as --output-dir
#   PGEN_TEMPERATURE      value passed as --temperature
#   PGEN_SAMPLE_INTERVAL  seconds between generated dialogues; for faster
#                         testing use a larger value such as 5.0

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# Configuration
REPO_ID="${PGEN_REPO_ID:-lerobot/svla_so101_pickplace}"
MODEL="${PGEN_MODEL:-Qwen/Qwen3-VL-30B-A3B-Instruct}"
# The default output directory is specific to the original author's machine;
# set PGEN_OUTPUT_DIR to write somewhere else.
OUTPUT_DIR="${PGEN_OUTPUT_DIR:-/fsx/jade_choghari/outputs/pgen_annotations}"
TEMPERATURE="${PGEN_TEMPERATURE:-0.7}"
# Generate dialogue every 1 second by default (all episodes processed).
SAMPLE_INTERVAL="${PGEN_SAMPLE_INTERVAL:-1.0}"

# Run synthetic data generation (processes ALL episodes).
# "$@" expands to nothing when no extra flags are given, so a plain
# invocation runs exactly the fixed command line below.
python examples/dataset/annotate_pgen.py \
    --repo-id "$REPO_ID" \
    --model "$MODEL" \
    --output-dir "$OUTPUT_DIR" \
    --temperature "$TEMPERATURE" \
    --sample-interval "$SAMPLE_INTERVAL" \
    --num-image-views-per-sample 1 \
    "$@"