add: final aggregation utils to obtain a single dataset

This commit is contained in:
fracapuano
2025-11-11 14:20:41 +00:00
parent 31274975f0
commit e70dd620f3
2 changed files with 50 additions and 0 deletions
+20
View File
@@ -0,0 +1,20 @@
#!/bin/bash
# Slurm job: aggregate the per-task BEHAVIOR-1K datasets into a single dataset.
# CPU-only, I/O-bound workload — see resource comments on the directives below.
#SBATCH -J b1k-aggregate
#SBATCH -p hopper-cpu
#SBATCH --qos=high
#SBATCH -c 2 # More CPUs won't help much (I/O bound)
#SBATCH -t 20:00:00 # Expected ~13h from the observed 5.2x scaling pattern; 20h gives comfortable headroom
#SBATCH --mem=8G # Peak will be ~5-6GB, 8G gives comfortable margin
#SBATCH -o logs/aggregate-50-%j.out
#SBATCH -e logs/aggregate-50-%j.err
# Fail fast: exit on any error, on use of unset variables, and on pipeline failures.
set -euo pipefail
# Match OpenMP threads to the allocated CPUs (falls back to 1 outside Slurm).
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
# Unbuffered stdout/stderr so progress shows up in the log files immediately.
export PYTHONUNBUFFERED=1
source "$HOME/.bashrc"
conda activate lerobot
cd /admin/home/francesco_capuano/lerobot
python examples/behavior_1k/aggregate_tasks_datasets.py
@@ -0,0 +1,30 @@
from pathlib import Path
from lerobot.datasets.aggregate import aggregate_datasets
from lerobot.datasets.lerobot_dataset import LeRobotDataset
def main():
    """Merge the 50 per-task datasets into one LeRobotDataset and push it to the hub."""
    num_tasks = 50
    source_repo_ids = []
    source_roots = []
    # Collect the hub repo id and on-disk root for every per-task dataset.
    for task_idx in range(num_tasks):
        source_repo_ids.append(f"fracapuano/behavior1k-task{task_idx:04d}")
        source_roots.append(Path(f"/fsx/francesco_capuano/behavior1k/behavior1k-task{task_idx:04d}"))

    target_repo_id = "fracapuano/behavior1k"
    target_root = Path("/fsx/francesco_capuano/behavior1k/behavior1k")

    # Merge every per-task dataset into a single dataset on disk.
    aggregate_datasets(
        repo_ids=source_repo_ids,
        roots=source_roots,
        aggr_repo_id=target_repo_id,
        aggr_root=target_root,
    )

    # Re-open the merged dataset and upload it to the hub.
    merged = LeRobotDataset(repo_id=target_repo_id, root=target_root)
    merged.push_to_hub()


if __name__ == "__main__":
    main()