more changes

2026-07-25 18:56:09 +00:00 · 2025-12-09 08:57:49 +00:00
parent 5fab1ed5cd
commit 6216932fb0
5 changed files with 973 additions and 0 deletions
@@ -0,0 +1,141 @@
+#!/bin/bash
+#SBATCH --time=24:00:00
+#SBATCH --partition=hopper-cpu
+#SBATCH --cpus-per-task=96
+#SBATCH --output=/fsx/jade_choghari/logs/launcher_%j.out
+#SBATCH --error=/fsx/jade_choghari/logs/launcher_%j.err
+# Activate conda environment
+# Load conda
+source /fsx/jade_choghari/miniforge3/etc/profile.d/conda.sh
+conda activate lerobot
+set -e  # Exit on error
+
+# Input dataset
+REPO_ID="lerobot"
+ROOT="/fsx/jade_choghari/vlabench-primitive/"
+
+# Output paths
+OUTPUT_DIR="/fsx/jade_choghari/vlabench-primitive-encoded"
+OUTPUT_REPO_ID="vlabench-primitive-encoded"
+LOGS_DIR="/fsx/jade_choghari/logs/convert_video"
+
+# Video encoding settings
+VCODEC="libsvtav1"
+PIX_FMT="yuv420p"
+GOP_SIZE=2
+CRF=30
+FAST_DECODE=0
+
+# Parallelization settings
+NUM_WORKERS=24           # Number of parallel SLURM workers
+NUM_IMAGE_WORKERS=4       # Threads per worker for image saving
+
+# SLURM settings
+PARTITION="hopper-cpu"  # Change to your CPU partition name
+CPUS_PER_TASK=32           # CPUs per worker
+MEM_PER_CPU="4G"          # Memory per CPU
+TIME_LIMIT="24:00:00"     # Time limit per job
+
+###############################################################################
+# STEP 1: Parallel Video Conversion
+###############################################################################
+rm -rf "${OUTPUT_DIR}"
+mkdir -p "${OUTPUT_DIR}"
+
+echo "=============================================="
+echo "STEP 1: Starting parallel video conversion"
+echo "  Workers: ${NUM_WORKERS}"
+echo "  Input: ${REPO_ID} (root: ${ROOT})"
+echo "  Output: ${OUTPUT_DIR}"
+echo "=============================================="
+
+python /admin/home/jade_choghari/lerobot/examples/port_datasets/slurm_convert_to_video.py\
+    --repo-id "${REPO_ID}" \
+    --root "${ROOT}" \
+    --output-dir "${OUTPUT_DIR}" \
+    --output-repo-id "${OUTPUT_REPO_ID}" \
+    --vcodec "${VCODEC}" \
+    --pix-fmt "${PIX_FMT}" \
+    --g ${GOP_SIZE} \
+    --crf ${CRF} \
+    --fast-decode ${FAST_DECODE} \
+    --num-image-workers ${NUM_IMAGE_WORKERS} \
+    --logs-dir "${LOGS_DIR}" \
+    --job-name "convert_video" \
+    --slurm 1 \
+    --workers ${NUM_WORKERS} \
+    --partition "${PARTITION}" \
+    --cpus-per-task ${CPUS_PER_TASK} \
+    --mem-per-cpu "${MEM_PER_CPU}" \
+    --time-limit "${TIME_LIMIT}"
+
+echo ""
+echo "✓ Parallel conversion jobs submitted!"
+echo "  Monitor with: squeue -u \$USER"
+echo "  Check logs in: ${LOGS_DIR}/convert_video"
+echo ""
+echo "Wait for all jobs to complete before running Step 2."
+echo "You can check completion with: squeue -u \$USER | grep convert_video"
+echo ""
+echo "After all jobs complete, run Step 2 to aggregate shards:"
+echo "  bash convert_to_video_parallel.sh aggregate"
+
+###############################################################################
+# STEP 2: Aggregate Shards (run this after Step 1 completes)
+###############################################################################
+
+if [ "$1" == "aggregate" ]; then
+    echo ""
+    echo "=============================================="
+    echo "STEP 2: Aggregating video shards"
+    echo "  Shards: ${NUM_WORKERS}"
+    echo "  Input: ${OUTPUT_DIR}/shard_XXXX"
+    echo "  Output: ${OUTPUT_DIR}_final"
+    echo "=============================================="
+
+    python slurm_aggregate_video_shards.py \
+        --shards-dir "${OUTPUT_DIR}" \
+        --output-dir "${OUTPUT_DIR}_final" \
+        --output-repo-id "${OUTPUT_REPO_ID}" \
+        --num-shards ${NUM_WORKERS} \
+        --logs-dir "${LOGS_DIR}" \
+        --job-name "aggregate_video" \
+        --slurm 1 \
+        --partition "${PARTITION}" \
+        --cpus-per-task 16 \
+        --mem-per-cpu "8G" \
+        --time-limit "08:00:00"
+
+    echo ""
+    echo "✓ Aggregation job submitted!"
+    echo "  Monitor with: squeue -u \$USER | grep aggregate_video"
+    echo "  Check logs in: ${LOGS_DIR}/aggregate_video"
+    echo ""
+    echo "After completion, your final dataset will be in:"
+    echo "  ${OUTPUT_DIR}_final"
+fi
+
+###############################################################################
+# Helpful information
+###############################################################################
+
+if [ "$1" != "aggregate" ]; then
+    echo ""
+    echo "=============================================="
+    echo "WORKFLOW SUMMARY"
+    echo "=============================================="
+    echo ""
+    echo "1. Step 1 is now running - it will:"
+    echo "   - Split episodes across ${NUM_WORKERS} workers"
+    echo "   - Each worker converts its episodes to video"
+    echo "   - Creates shard datasets in ${OUTPUT_DIR}/shard_XXXX"
+    echo ""
+    echo "2. After Step 1 completes, run Step 2:"
+    echo "   bash convert_to_video_parallel.sh aggregate"
+    echo ""
+    echo "3. Step 2 will merge all shards into a single dataset"
+    echo ""
+    echo "=============================================="
+fi
+
+