#!/usr/bin/env python # Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Mirror a bimanual robot dataset using SLURM for distributed video processing. This script creates a mirrored version of a dataset where: 1. Left and right arm observations/actions are swapped 2. Joint values are inverted according to a mirroring mask 3. Video frames are horizontally flipped (parallelized via SLURM) Example usage: ```shell # SLURM execution python examples/port_datasets/slurm_mirror_dataset.py \ --repo-id pepijn/openarm_bimanual \ --output-repo-id pepijn/openarm_bimanual_mirrored \ --logs-dir /fsx/user/logs \ --partition hopper-cpu # Local execution (for debugging) python examples/port_datasets/slurm_mirror_dataset.py \ --repo-id pepijn/openarm_bimanual \ --output-repo-id pepijn/openarm_bimanual_mirrored \ --slurm 0 \ --push-to-hub ``` """ import argparse import logging from pathlib import Path from datatrove.executor import LocalPipelineExecutor from datatrove.executor.slurm import SlurmPipelineExecutor from datatrove.pipeline.base import PipelineStep logger = logging.getLogger(__name__) OPENARM_MIRRORING_MASK = { "joint_1": -1, "joint_2": -1, "joint_3": -1, "joint_4": 1, "joint_5": -1, "joint_6": -1, "joint_7": -1, "gripper": 1, } class MirrorVideos(PipelineStep): """Pipeline step that mirrors video files for assigned episodes.""" def __init__( self, repo_id: str, output_repo_id: str, root: str | None = None, output_root: str | None = None, vcodec: str = "libsvtav1", ): super().__init__() self.repo_id = repo_id self.output_repo_id = output_repo_id self.root = root self.output_root = output_root self.vcodec = vcodec def run(self, data=None, rank: int = 0, world_size: int = 1): import logging import subprocess from pathlib import Path from datasets.utils.tqdm import disable_progress_bars from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.utils.constants import HF_LEROBOT_HOME from lerobot.utils.utils import init_logging init_logging() disable_progress_bars() logger = logging.getLogger(__name__) def swap_left_right_name(name: str) -> str: result = name.replace("left_", "LEFT_PLACEHOLDER_") result = result.replace("right_", "left_") result = result.replace("LEFT_PLACEHOLDER_", "right_") return result def flip_video_frames(input_path: Path, output_path: Path, fps: float, vcodec: str): output_path.parent.mkdir(parents=True, exist_ok=True) cmd = [ "ffmpeg", "-y", "-i", str(input_path), "-vf", "hflip", "-c:v", vcodec, "-g", "2", "-crf", "30", "-r", str(int(fps)), "-pix_fmt", "yuv420p", "-loglevel", "error", ] if vcodec == "libsvtav1": cmd.extend(["-preset", "12"]) cmd.append(str(output_path)) result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"FFmpeg failed: {result.stderr}") def video_is_valid(path: Path) -> bool: if not path.exists(): return False try: result = subprocess.run( ["ffprobe", "-v", "error", "-select_streams", "v:0", "-show_entries", "stream=nb_frames", "-of", "csv=p=0", str(path)], capture_output=True, text=True, timeout=30 ) return result.returncode == 0 and result.stdout.strip().isdigit() except Exception: return False root = Path(self.root) if self.root else None output_root = Path(self.output_root) if self.output_root else None dataset = LeRobotDataset(self.repo_id, root=root) output_root = output_root or (HF_LEROBOT_HOME / self.output_repo_id) if not dataset.meta.video_keys: logger.info(f"Rank {rank}: No videos to process") return video_tasks = [] for old_video_key in dataset.meta.video_keys: new_video_key = swap_left_right_name(old_video_key) for ep_idx in range(dataset.meta.total_episodes): try: src_path = dataset.root / dataset.meta.get_video_file_path(ep_idx, old_video_key) dst_relative = dataset.meta.get_video_file_path(ep_idx, old_video_key) dst_relative_str = str(dst_relative).replace(old_video_key, new_video_key) dst_path = output_root / dst_relative_str if src_path.exists(): video_tasks.append((src_path, dst_path, ep_idx, old_video_key)) except KeyError: continue my_tasks = [t for i, t in enumerate(video_tasks) if i % world_size == rank] logger.info(f"Rank {rank}/{world_size}: Processing {len(my_tasks)}/{len(video_tasks)} videos") for src_path, dst_path, ep_idx, video_key in my_tasks: if video_is_valid(dst_path): logger.info(f"Rank {rank}: Skipping {dst_path.name} (already done)") continue logger.info(f"Rank {rank}: Processing {src_path.name} -> {dst_path.name}") flip_video_frames(src_path, dst_path, dataset.meta.fps, self.vcodec) class MirrorDataAndMetadata(PipelineStep): """Pipeline step that mirrors parquet data and metadata (runs once on rank 0).""" def __init__( self, repo_id: str, output_repo_id: str, root: str | None = None, output_root: str | None = None, ): super().__init__() self.repo_id = repo_id self.output_repo_id = output_repo_id self.root = root self.output_root = output_root def run(self, data=None, rank: int = 0, world_size: int = 1): if rank != 0: return import logging from pathlib import Path import numpy as np import pandas as pd from datasets.utils.tqdm import disable_progress_bars from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata from lerobot.datasets.utils import DATA_DIR, DEFAULT_DATA_PATH, write_info, write_stats, write_tasks from lerobot.utils.constants import HF_LEROBOT_HOME from lerobot.utils.utils import init_logging init_logging() disable_progress_bars() logger = logging.getLogger(__name__) MIRRORING_MASK = { "joint_1": -1, "joint_2": -1, "joint_3": -1, "joint_4": 1, "joint_5": -1, "joint_6": -1, "joint_7": -1, "gripper": 1, } def get_mirroring_mask(robot_type: str) -> dict[str, int]: if robot_type in ["bi_openarm_follower", "openarm_follower", "bi_openarms_follower", "openarms_follower"]: return MIRRORING_MASK raise ValueError(f"Unknown robot type: {robot_type}. Add a mirroring mask for this robot.") def swap_left_right_name(name: str) -> str: result = name.replace("left_", "LEFT_PLACEHOLDER_") result = result.replace("right_", "left_") result = result.replace("LEFT_PLACEHOLDER_", "right_") return result def mirror_feature_names(names: list[str]) -> tuple[list[str], dict[int, int]]: mirrored_names = [swap_left_right_name(n) for n in names] old_to_new_idx = {} for old_idx, old_name in enumerate(names): new_name = swap_left_right_name(old_name) new_idx = mirrored_names.index(new_name) old_to_new_idx[old_idx] = new_idx return mirrored_names, old_to_new_idx def apply_mirroring_mask(value: float, feature_name: str, mirroring_mask: dict[str, int]) -> float: name_without_prefix = feature_name.split("_", 1)[1] if "_" in feature_name else feature_name joint_name = name_without_prefix.split(".")[0] if joint_name in mirroring_mask: return value * mirroring_mask[joint_name] return value def mirror_array(array: np.ndarray, names: list[str], mirroring_mask: dict[str, int]) -> np.ndarray: mirrored_names, idx_mapping = mirror_feature_names(names) result = np.zeros_like(array) for old_idx, new_idx in idx_mapping.items(): new_name = mirrored_names[new_idx] value = array[old_idx] mirrored_value = apply_mirroring_mask(value, new_name, mirroring_mask) result[new_idx] = mirrored_value return result def mirror_stats(stats: dict) -> dict: mirrored = {} for key, value in stats.items(): new_key = swap_left_right_name(key) if isinstance(value, dict): mirrored[new_key] = mirror_stats(value) else: mirrored[new_key] = value return mirrored import shutil root = Path(self.root) if self.root else None output_root = Path(self.output_root) if self.output_root else None dataset = LeRobotDataset(self.repo_id, root=root) output_root = output_root or (HF_LEROBOT_HOME / self.output_repo_id) done_marker = output_root / ".data_mirrored" if done_marker.exists(): logger.info("Data and metadata already mirrored, skipping") return # Clean up partial output from previous failed runs if output_root.exists(): logger.info(f"Removing existing partial output: {output_root}") shutil.rmtree(output_root) robot_type = dataset.meta.robot_type or "bi_openarms_follower" mirroring_mask = get_mirroring_mask(robot_type) mirrored_features = {} for key, feat in dataset.meta.features.items(): new_key = swap_left_right_name(key) new_feat = feat.copy() if "names" in new_feat and new_feat["names"]: new_feat["names"] = [swap_left_right_name(n) for n in new_feat["names"]] mirrored_features[new_key] = new_feat new_meta = LeRobotDatasetMetadata.create( repo_id=self.output_repo_id, fps=dataset.meta.fps, features=mirrored_features, robot_type=dataset.meta.robot_type, root=output_root, use_videos=len(dataset.meta.video_keys) > 0, ) if dataset.meta.tasks is not None: write_tasks(dataset.meta.tasks, new_meta.root) data_dir = dataset.root / DATA_DIR parquet_files = sorted(data_dir.glob("*/*.parquet")) action_names = dataset.meta.features.get("action", {}).get("names", []) state_names = dataset.meta.features.get("observation.state", {}).get("names", []) for src_path in parquet_files: df = pd.read_parquet(src_path).reset_index(drop=True) relative_path = src_path.relative_to(dataset.root) chunk_dir = relative_path.parts[1] file_name = relative_path.parts[2] chunk_idx = int(chunk_dir.split("-")[1]) file_idx = int(file_name.split("-")[1].split(".")[0]) if "action" in df.columns and action_names: actions = np.stack(df["action"].values) mirrored_actions = np.array([mirror_array(row, action_names, mirroring_mask) for row in actions]) df["action"] = list(mirrored_actions) if "observation.state" in df.columns and state_names: states = np.stack(df["observation.state"].values) mirrored_states = np.array([mirror_array(row, state_names, mirroring_mask) for row in states]) df["observation.state"] = list(mirrored_states) dst_path = new_meta.root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx) dst_path.parent.mkdir(parents=True, exist_ok=True) df.to_parquet(dst_path, index=False) episodes_dir = dataset.root / "meta/episodes" dst_episodes_dir = new_meta.root / "meta/episodes" if episodes_dir.exists(): dst_episodes_dir.mkdir(parents=True, exist_ok=True) for src_parquet in episodes_dir.glob("*/*.parquet"): df = pd.read_parquet(src_parquet) columns_to_rename = {} for col in df.columns: if col.startswith("videos/"): parts = col.split("/") if len(parts) >= 2: video_key = parts[1] new_video_key = swap_left_right_name(video_key) new_col = col.replace(f"videos/{video_key}/", f"videos/{new_video_key}/") columns_to_rename[col] = new_col if columns_to_rename: df = df.rename(columns=columns_to_rename) dst_parquet = dst_episodes_dir / src_parquet.relative_to(episodes_dir) dst_parquet.parent.mkdir(parents=True, exist_ok=True) df.to_parquet(dst_parquet, index=False) new_meta.info.update({ "total_episodes": dataset.meta.info["total_episodes"], "total_frames": dataset.meta.info["total_frames"], "total_tasks": dataset.meta.info["total_tasks"], "splits": dataset.meta.info.get("splits", {}), }) write_info(new_meta.info, new_meta.root) if dataset.meta.stats is not None: mirrored_stats = mirror_stats(dataset.meta.stats) write_stats(mirrored_stats, new_meta.root) done_marker.touch() logger.info(f"Data and metadata mirrored to {output_root}") def swap_left_right_name(name: str) -> str: result = name.replace("left_", "LEFT_PLACEHOLDER_") result = result.replace("right_", "left_") result = result.replace("LEFT_PLACEHOLDER_", "right_") return result def get_num_video_tasks(repo_id: str, root: str | None = None) -> int: from lerobot.datasets.lerobot_dataset import LeRobotDataset root_path = Path(root) if root else None dataset = LeRobotDataset(repo_id, root=root_path) count = 0 for video_key in dataset.meta.video_keys: for ep_idx in range(dataset.meta.total_episodes): try: src_path = dataset.root / dataset.meta.get_video_file_path(ep_idx, video_key) if src_path.exists(): count += 1 except KeyError: continue return count def make_mirror_executor( repo_id: str, output_repo_id: str, root: str | None, output_root: str | None, vcodec: str, job_name: str, logs_dir: Path, workers: int, partition: str, cpus_per_task: int, mem_per_cpu: str, time_limit: str, slurm: bool = True, ): num_tasks = get_num_video_tasks(repo_id, root) if slurm else 1 num_tasks = max(1, num_tasks) kwargs = { "pipeline": [ MirrorDataAndMetadata(repo_id, output_repo_id, root, output_root), MirrorVideos(repo_id, output_repo_id, root, output_root, vcodec), ], "logging_dir": str(logs_dir / job_name), } if slurm: kwargs.update({ "job_name": job_name, "tasks": num_tasks, "workers": min(workers, num_tasks), "time": time_limit, "partition": partition, "cpus_per_task": cpus_per_task, "sbatch_args": { "mem-per-cpu": mem_per_cpu, "requeue": True, "signal": "USR1@30", }, }) return SlurmPipelineExecutor(**kwargs) else: kwargs.update({"tasks": 1, "workers": 1}) return LocalPipelineExecutor(**kwargs) def main(): logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") parser = argparse.ArgumentParser(description="Mirror a bimanual robot dataset using SLURM") parser.add_argument("--repo-id", type=str, required=True, help="Source dataset repo_id") parser.add_argument("--output-repo-id", type=str, required=True, help="Output dataset repo_id") parser.add_argument("--root", type=str, default=None, help="Source dataset root directory") parser.add_argument("--output-root", type=str, default=None, help="Output dataset root directory") parser.add_argument("--vcodec", type=str, default="libsvtav1", help="Video codec") parser.add_argument("--logs-dir", type=Path, default=Path("logs"), help="Directory for datatrove logs") parser.add_argument("--job-name", type=str, default="mirror_dataset", help="SLURM job name") parser.add_argument("--slurm", type=int, default=1, help="Use SLURM (1) or local (0)") parser.add_argument("--workers", type=int, default=64, help="Number of SLURM workers") parser.add_argument("--partition", type=str, default="hopper-cpu", help="SLURM partition") parser.add_argument("--cpus-per-task", type=int, default=4, help="CPUs per task") parser.add_argument("--mem-per-cpu", type=str, default="2G", help="Memory per CPU") parser.add_argument("--time-limit", type=str, default="04:00:00", help="SLURM time limit") parser.add_argument("--push-to-hub", action="store_true", help="Push mirrored dataset to HuggingFace Hub") args = parser.parse_args() executor = make_mirror_executor( repo_id=args.repo_id, output_repo_id=args.output_repo_id, root=args.root, output_root=args.output_root, vcodec=args.vcodec, job_name=args.job_name, logs_dir=args.logs_dir, workers=args.workers, partition=args.partition, cpus_per_task=args.cpus_per_task, mem_per_cpu=args.mem_per_cpu, time_limit=args.time_limit, slurm=args.slurm == 1, ) executor.run() if args.push_to_hub: from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.utils.constants import HF_LEROBOT_HOME output_root = Path(args.output_root) if args.output_root else HF_LEROBOT_HOME / args.output_repo_id logger.info(f"Pushing dataset to HuggingFace Hub: {args.output_repo_id}") dataset = LeRobotDataset(args.output_repo_id, root=output_root) dataset.push_to_hub() if __name__ == "__main__": main()