diff --git a/examples/behavior_1k/aggregate.slurm b/examples/behavior_1k/aggregate.slurm
new file mode 100644
index 000000000..22c65a51a
--- /dev/null
+++ b/examples/behavior_1k/aggregate.slurm
@@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH -J b1k-aggregate
+#SBATCH -p hopper-cpu
+#SBATCH --qos=high
+#SBATCH -c 2            # More CPUs won't help much (the job is I/O bound)
+#SBATCH -t 20:00:00     # Conservative: the 5.2x pattern suggests ~13h, so 20h leaves comfortable headroom
+#SBATCH --mem=8G        # Peak usage should be ~5-6GB; 8G gives a comfortable margin
+#SBATCH -o logs/aggregate-50-%j.out
+#SBATCH -e logs/aggregate-50-%j.err
+
+set -euo pipefail
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
+export PYTHONUNBUFFERED=1
+
+source "$HOME/.bashrc"
+conda activate lerobot
+
+cd /admin/home/francesco_capuano/lerobot
+
+python examples/behavior_1k/aggregate_tasks_datasets.py
diff --git a/examples/behavior_1k/aggregate_tasks_datasets.py b/examples/behavior_1k/aggregate_tasks_datasets.py
new file mode 100644
index 000000000..4087e329d
--- /dev/null
+++ b/examples/behavior_1k/aggregate_tasks_datasets.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+from lerobot.datasets.aggregate import aggregate_datasets
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+
+def main():
+    """Aggregate all per-task datasets into a single LeRobotDataset and push it to the hub."""
+    task_indices = range(50)
+
+    repo_ids = [f"fracapuano/behavior1k-task{i:04d}" for i in task_indices]
+
+    roots = [Path(f"/fsx/francesco_capuano/behavior1k/behavior1k-task{i:04d}") for i in task_indices]
+
+    aggregated_root = Path("/fsx/francesco_capuano/behavior1k/behavior1k")
+    aggregated_repo_id = "fracapuano/behavior1k"
+
+    aggregate_datasets(
+        repo_ids=repo_ids,
+        roots=roots,
+        aggr_repo_id=aggregated_repo_id,
+        aggr_root=aggregated_root,
+    )
+
+    ds = LeRobotDataset(repo_id=aggregated_repo_id, root=aggregated_root)
+    ds.push_to_hub()
+
+
+if __name__ == "__main__":
+    main()