Modify convert_to_lerobot_v3 script for behaviours dataset to take a single task id and create a dataset out of it

2026-07-23 09:46:00 +00:00 · 2025-10-24 17:06:21 +02:00
parent a52e88d349
commit fd623e0cc5
3 changed files with 203 additions and 238 deletions
@@ -1,46 +1,49 @@
 #!/usr/bin/env python
 import json
 import logging
 from pathlib import Path
 from typing import Any
 import numpy as np
 import torch as th
 from pathlib import Path
 from typing import Dict, Any
 from lerobot.datasets.lerobot_dataset import LeRobotDataset
 from .behaviour_1k_constants import (
    TASK_INDICES_TO_NAMES,
    ROBOT_CAMERA_NAMES,
    PROPRIOCEPTION_INDICES,
    BEHAVIOR_DATASET_FEATURES,
 )
 import logging
 from lerobot.utils.utils import init_logging
 from .behaviour_1k_constants import (
    PROPRIOCEPTION_INDICES,
    ROBOT_CAMERA_NAMES,
    TASK_INDICES_TO_NAMES,
 )
 init_logging()
 class BehaviorLeRobotDatasetV3(LeRobotDataset):
    """
    Extends LeRobotDataset v3.0 for BEHAVIOR-1K specific requirements.
    Handles task-based episode organization and BEHAVIOR-1K metadata.
    """
    @classmethod
    def create(
        cls,
        repo_id: str,
-        fps: int = 30,
+        fps: int,
        features: dict,
        root: str | Path | None = None,
-        robot_type: str = "R1Pro",
+        robot_type: str | None = None,
        use_videos: bool = True,
-        video_backend: str = "pyav",
+        tolerance_s: float = 1e-4,
        batch_encoding_size: int = 1,
        image_writer_processes: int = 0,
-        image_writer_threads: int = 4,
+        image_writer_threads: int = 0,
        video_backend: str | None = None,
        batch_encoding_size: int = 1,
    ) -> "BehaviorLeRobotDatasetV3":
        """
        Create a new BEHAVIOR-1K dataset in v3.0 format.
-        
+
        Args:
            repo_id: HuggingFace repository ID
            fps: Frames per second (default: 30)
@@ -51,7 +54,7 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
            batch_encoding_size: Number of episodes to batch before encoding videos
            image_writer_processes: Number of processes for async image writing
            image_writer_threads: Number of threads per process for image writing
-            
+
        Returns:
            BehaviorLeRobotDatasetV3 instance
        """
@@ -59,7 +62,7 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
        obj = super().create(
            repo_id=repo_id,
            fps=fps,
-            features=BEHAVIOR_DATASET_FEATURES,
+            features=features,
            root=root,
            robot_type=robot_type,
            use_videos=use_videos,
@@ -69,14 +72,14 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
            video_backend=video_backend,
            batch_encoding_size=batch_encoding_size,
        )
-        
+
        # Convert to BehaviorLeRobotDatasetV3 instance
        obj.__class__ = cls
-        
+
        # Initialize BEHAVIOR-1K specific attributes
        obj.task_episode_mapping = {}  # Maps task_id to list of episode indices
        obj.episode_task_mapping = {}  # Maps episode_index to task info
-        
+
        # Additional metadata for BEHAVIOR-1K
        obj.behavior_metadata = {
            "robot_type": robot_type,
@@ -84,41 +87,41 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
            "proprioception_indices": PROPRIOCEPTION_INDICES[robot_type],
            "camera_names": ROBOT_CAMERA_NAMES[robot_type],
        }
-        
+
        logging.info(f"Created BehaviorLeRobotDatasetV3 with repo_id: {repo_id}")
        return obj
-    
+
    def __init__(self, *args, **kwargs):
        """
        Initialize from existing dataset.
        Use the create() classmethod to create a new dataset.
        """
        super().__init__(*args, **kwargs)
-        
+
        # Initialize BEHAVIOR-1K specific attributes for loading existing datasets
        self.task_episode_mapping = {}
        self.episode_task_mapping = {}
        self.behavior_metadata = {}
-        
+
        # Try to load BEHAVIOR-1K metadata if it exists
        metadata_path = self.root / "meta" / "behavior_metadata.json"
        if metadata_path.exists():
-            with open(metadata_path, "r") as f:
+            with open(metadata_path) as f:
                stored_metadata = json.load(f)
                self.behavior_metadata = stored_metadata
                self.task_episode_mapping = stored_metadata.get("task_episode_mapping", {})
                self.episode_task_mapping = stored_metadata.get("episode_task_mapping", {})
-    
+
    def add_episode_from_hdf5(
        self,
-        hdf5_data: Dict[str, Any],
+        hdf5_data: dict[str, Any],
        task_id: int,
        episode_id: int,
        include_videos: bool = True,
    ) -> None:
        """
        Add an episode from HDF5 data to the dataset.
-        
+
        Args:
            hdf5_data: Dictionary containing the HDF5 episode data
            task_id: Task ID for this episode
@@ -127,9 +130,9 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
        """
        task_name = TASK_INDICES_TO_NAMES[task_id]
        num_frames = len(hdf5_data["action"])
-        
+
        logging.info(f"Adding episode {episode_id} (task: {task_name}) with {num_frames} frames")
-        
+
        # Process each frame
        for frame_idx in range(num_frames):
            frame_data = {
@@ -140,44 +143,44 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
                "task": task_name,
                "timestamp": frame_idx / self.fps,
            }
-            
+
            # Add video frames if requested
            if include_videos:
                for modality in ["rgb", "depth_linear", "seg_instance_id"]:
                    # Map depth_linear to depth for consistency
                    output_modality = "depth" if modality == "depth_linear" else modality
-                    
+
                    for camera_name, robot_camera_name in ROBOT_CAMERA_NAMES[self.robot_type].items():
                        key = f"observation.images.{output_modality}.{camera_name}"
                        hdf5_key = f"{robot_camera_name}::{modality}"
-                        
+
                        if hdf5_key in hdf5_data["obs"]:
                            # Get the frame data
                            frame = hdf5_data["obs"][hdf5_key][frame_idx]
-                            
+
                            # Handle different data types
                            if isinstance(frame, th.Tensor):
                                frame = frame.numpy()
-                            
+
                            # Ensure correct shape
                            if modality == "seg_instance_id" and len(frame.shape) == 2:
                                # Add channel dimension for grayscale
                                frame = np.expand_dims(frame, axis=-1)
                            elif modality == "depth_linear" and len(frame.shape) == 2:
                                frame = np.expand_dims(frame, axis=-1)
-                            
+
                            frame_data[key] = frame
-            
+
            # Add frame to dataset
            self.add_frame(frame_data)
-        
+
        # Save episode with metadata
        episode_metadata = {
            "task_id": task_id,
            "task_name": task_name,
            "original_episode_id": episode_id,
        }
-        
+
        # Add any additional HDF5 attributes as metadata
        if "attrs" in hdf5_data:
            for attr_name, attr_value in hdf5_data["attrs"].items():
@@ -185,10 +188,10 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
                    episode_metadata[attr_name] = list(attr_value)
                else:
                    episode_metadata[attr_name] = attr_value
-        
+
        # Save the episode
        self.save_episode(episode_data=None)
-        
+
        # Track task-episode mapping
        if task_id not in self.task_episode_mapping:
            self.task_episode_mapping[task_id] = []
@@ -198,26 +201,30 @@ class BehaviorLeRobotDatasetV3(LeRobotDataset):
            "task_name": task_name,
            "original_episode_id": episode_id,
        }
-    
+
    def finalize(self) -> None:
        """Finalize the dataset and save additional BEHAVIOR-1K metadata."""
        # Save BEHAVIOR-1K specific metadata
        metadata_path = self.root / "meta" / "behavior_metadata.json"
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
-        
+
-        self.behavior_metadata.update({
+        self.behavior_metadata.update(
-            "task_episode_mapping": self.task_episode_mapping,
+            {
-            "episode_task_mapping": self.episode_task_mapping,
+                "task_episode_mapping": self.task_episode_mapping,
-            "total_tasks": len(self.task_episode_mapping),
+                "episode_task_mapping": self.episode_task_mapping,
-            "total_episodes": self.num_episodes,
+                "total_tasks": len(self.task_episode_mapping),
-            "total_frames": self.num_frames,
+                "total_episodes": self.num_episodes,
-        })
+                "total_frames": self.num_frames,
-        
+            }
        )
        with open(metadata_path, "w") as f:
            json.dump(self.behavior_metadata, f, indent=2)
-        
+
        # Finalize the parent dataset
        super().finalize()
-        
+
-        logging.info(f"Finalized dataset with {self.num_episodes} episodes "
+        logging.info(
-                   f"and {self.num_frames} frames across {len(self.task_episode_mapping)} tasks")
+            f"Finalized dataset with {self.num_episodes} episodes "
            f"and {self.num_frames} frames across {len(self.task_episode_mapping)} tasks"
        )
@@ -1,7 +1,10 @@
 import numpy as np
 import torch as th
 from collections import OrderedDict
 import numpy as np
 import torch as th
 ROBOT_TYPE = "R1Pro"
 FPS = 30
 ROBOT_CAMERA_NAMES = {
    "A1": {
@@ -21,13 +24,17 @@ WRIST_RESOLUTION = (480, 480)
 # TODO: Fix A1
 CAMERA_INTRINSICS = {
    "A1": {
-        "external": np.array([[306.0, 0.0, 360.0], [0.0, 306.0, 360.0], [0.0, 0.0, 1.0]], dtype=np.float32),  # 240x240
+        "external": np.array(
            [[306.0, 0.0, 360.0], [0.0, 306.0, 360.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 240x240
        "wrist": np.array(
            [[388.6639, 0.0, 240.0], [0.0, 388.6639, 240.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 240x240
    },
    "R1Pro": {
-        "head": np.array([[306.0, 0.0, 360.0], [0.0, 306.0, 360.0], [0.0, 0.0, 1.0]], dtype=np.float32),  # 720x720
+        "head": np.array(
            [[306.0, 0.0, 360.0], [0.0, 306.0, 360.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 720x720
        "left_wrist": np.array(
            [[388.6639, 0.0, 240.0], [0.0, 388.6639, 240.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 480x480
@@ -48,7 +55,7 @@ BEHAVIOR_DATASET_FEATURES = {
    },
    # Proprioception
    "observation.state": {
-        "dtype": "float32", 
+        "dtype": "float32",
        "shape": (256,),  # Full proprioception state
        "names": None,
    },
@@ -229,7 +236,10 @@ JOINT_RANGE = {
        "gripper": (th.tensor([0.00], dtype=th.float32), th.tensor([0.03], dtype=th.float32)),
    },
    "R1Pro": {
-        "base": (th.tensor([-0.75, -0.75, -1.0], dtype=th.float32), th.tensor([0.75, 0.75, 1.0], dtype=th.float32)),
+        "base": (
            th.tensor([-0.75, -0.75, -1.0], dtype=th.float32),
            th.tensor([0.75, 0.75, 1.0], dtype=th.float32),
        ),
        "torso": (
            th.tensor([-1.1345, -2.7925, -1.8326, -3.0543], dtype=th.float32),
            th.tensor([1.8326, 2.5307, 1.5708, 3.0543], dtype=th.float32),
@@ -253,8 +263,14 @@ EEF_POSITION_RANGE = {
        "0": (th.tensor([0.0, -0.7, 0.0], dtype=th.float32), th.tensor([0.7, 0.7, 0.7], dtype=th.float32)),
    },
    "R1Pro": {
-        "left": (th.tensor([0.0, -0.65, 0.0], dtype=th.float32), th.tensor([0.65, 0.65, 2.5], dtype=th.float32)),
+        "left": (
-        "right": (th.tensor([0.0, -0.65, 0.0], dtype=th.float32), th.tensor([0.65, 0.65, 2.5], dtype=th.float32)),
+            th.tensor([0.0, -0.65, 0.0], dtype=th.float32),
            th.tensor([0.65, 0.65, 2.5], dtype=th.float32),
        ),
        "right": (
            th.tensor([0.0, -0.65, 0.0], dtype=th.float32),
            th.tensor([0.65, 0.65, 2.5], dtype=th.float32),
        ),
    },
 }
@@ -317,4 +333,3 @@ TASK_NAMES_TO_INDICES = {
    "make_pizza": 49,
 }
 TASK_INDICES_TO_NAMES = {v: k for k, v in TASK_NAMES_TO_INDICES.items()}
@@ -1,69 +1,82 @@
 #!/usr/bin/env python
 """
 Convert a single BEHAVIOR-1K task from HDF5 to LeRobotDataset v3.0 format.
 Usage examples:
 # Convert a single task
 python convert_to_lerobot_v3.py \
    --data-folder /path/to/data \
    --repo-id "username/behavior-1k-assembling-gift-baskets" \
    --task-id 0 \
    --push-to-hub
 """
 import argparse
 import logging
 import os
 from pathlib import Path
 import h5py
 import numpy as np
 import os
 import torch as th
 from pathlib import Path
 from tqdm import tqdm
 import logging
 from .behavior_lerobot_dataset_v3 import BehaviorLeRobotDatasetV3
 from .behaviour_1k_constants import TASK_NAMES_TO_INDICES, TASK_INDICES_TO_NAMES, BEHAVIOR_DATASET_FEATURES
 from lerobot.utils.utils import init_logging
 from .behavior_lerobot_dataset_v3 import BehaviorLeRobotDatasetV3
 from .behaviour_1k_constants import BEHAVIOR_DATASET_FEATURES, FPS, ROBOT_TYPE, TASK_INDICES_TO_NAMES
 init_logging()
 def load_hdf5_episode(hdf5_path: str, episode_id: int = 0) -> dict:
    """
    Load episode data from HDF5 file.
-    
+
    Args:
        hdf5_path: Path to the HDF5 file
        episode_id: Episode ID to load (default: 0)
-        
+
    Returns:
        Dictionary containing episode data
    """
    episode_data = {}
-    
+
    with h5py.File(hdf5_path, "r") as f:
        # Find the episode with most samples if episode_id not specified
        if episode_id == -1:
-            num_samples = [f["data"][key].attrs["num_samples"] for key in f["data"].keys()]
+            num_samples = [f["data"][key].attrs["num_samples"] for key in f["data"]]
            episode_id = num_samples.index(max(num_samples))
-        
+
        demo_key = f"demo_{episode_id}"
        if demo_key not in f["data"]:
            raise ValueError(f"Episode {episode_id} not found in {hdf5_path}")
-        
+
        demo_data = f["data"][demo_key]
-        
+
        # Load actions
        episode_data["action"] = np.array(demo_data["action"][:])
-        
+
        # Load observations
        episode_data["obs"] = {}
-        for key in demo_data["obs"].keys():
+        for key in demo_data["obs"]:
            episode_data["obs"][key] = np.array(demo_data["obs"][key][:])
-        
+
        # Load attributes
        episode_data["attrs"] = {}
        for attr_name in demo_data.attrs:
            episode_data["attrs"][attr_name] = demo_data.attrs[attr_name]
-        
+
        # Add global attributes
        for attr_name in f["data"].attrs:
            episode_data["attrs"][f"global_{attr_name}"] = f["data"].attrs[attr_name]
-    
+
    return episode_data
 def convert_episode(
    data_folder: str,
    output_repo_id: str,
    task_id: int,
    demo_id: int,
    dataset: BehaviorLeRobotDatasetV3,
@@ -72,10 +85,10 @@ def convert_episode(
 ) -> None:
    """
    Convert a single episode from HDF5 to LeRobotDataset v3.0 format.
-    
+
    Args:
        data_folder: Base data folder containing HDF5 files
-        output_repo_id: Output repository ID for the dataset
+        repo_id: Repository ID for the dataset
        task_id: Task ID
        demo_id: Demo ID (episode ID)
        dataset: BehaviorLeRobotDatasetV3 instance to add data to
@@ -85,26 +98,22 @@ def convert_episode(
    # Construct paths
    task_name = TASK_INDICES_TO_NAMES[task_id]
    hdf5_path = f"{data_folder}/2025-challenge-rawdata/task-{task_id:04d}/episode_{demo_id:08d}.hdf5"
-    
+
    if not os.path.exists(hdf5_path):
        logging.error(f"HDF5 file not found: {hdf5_path}")
        return
-    
+
    logging.info(f"Converting episode {demo_id} from task {task_name}")
-    
+
    # Load episode data
-    try:
+    episode_data = load_hdf5_episode(hdf5_path, episode_id=0)
-        episode_data = load_hdf5_episode(hdf5_path, episode_id=0)
+
    except Exception as e:
        logging.error(f"Failed to load episode data: {e}")
        return
    # Filter out segmentation if not requested
    if not include_segmentation:
-        keys_to_remove = [k for k in episode_data["obs"].keys() if "seg_instance_id" in k]
+        keys_to_remove = [k for k in episode_data["obs"] if "seg_instance_id" in k]
        for key in keys_to_remove:
            del episode_data["obs"][key]
-    
+
    # Add episode to dataset
    dataset.add_episode_from_hdf5(
        hdf5_data=episode_data,
@@ -114,169 +123,103 @@ def convert_episode(
    )
-def convert_dataset(
+def convert_task_to_dataset(
    data_folder: str,
-    output_repo_id: str,
+    repo_id: str,
-    task_names: list = None,
+    task_id: int,
    episode_ids: list = None,
    max_episodes_per_task: int = None,
    include_videos: bool = True,
    include_segmentation: bool = True,
    fps: int = 30,
    batch_encoding_size: int = 1,
    image_writer_processes: int = 0,
    image_writer_threads: int = 4,
    push_to_hub: bool = False,
 ) -> None:
    """
-    Convert BEHAVIOR-1K dataset from HDF5 to LeRobotDataset v3.0 format.
+    Convert a single BEHAVIOR-1K task from HDF5 to LeRobotDataset v3.0 format.
-    
+
    Args:
        data_folder: Base folder containing HDF5 data
-        output_repo_id: Output repository ID (e.g., "username/dataset-name")
+        repo_id: Repository ID (e.g., "username/behavior-1k-task-name")
-        task_names: List of task names to convert (None = all tasks)
+        task_id: Task ID to convert
        episode_ids: Specific episode IDs to convert (None = all episodes)
        max_episodes_per_task: Maximum episodes per task to convert
        include_videos: Whether to include video data
        include_segmentation: Whether to include segmentation data
        fps: Frames per second
        batch_encoding_size: Number of episodes to batch before encoding
        image_writer_processes: Number of processes for image writing
        image_writer_threads: Number of threads for image writing
        push_to_hub: Whether to push to HuggingFace Hub
    """
    task_name = TASK_INDICES_TO_NAMES[task_id]
    task_folder = f"{data_folder}/2025-challenge-rawdata/task-{task_id:04d}"
    if not os.path.exists(task_folder):
        raise ValueError(f"Task folder not found: {task_folder}")
    # Create output directory
-    output_dir = Path.home() / ".cache/huggingface/lerobot" / output_repo_id
+    output_dir = Path.home() / ".cache/huggingface/lerobot" / repo_id
    output_dir.mkdir(parents=True, exist_ok=True)
-    
+
-    logging.info(f"Converting dataset to: {output_dir}")
+    logging.info(f"Converting task '{task_name}' (ID: {task_id}) to: {output_dir}")
-    
+
-    # Initialize dataset
+    # Initialize dataset for this task
    dataset = BehaviorLeRobotDatasetV3.create(
-        repo_id=output_repo_id,
+        repo_id=repo_id,
-        root=output_dir,
+        fps=FPS,
-        fps=fps,
+        features=BEHAVIOR_DATASET_FEATURES,
-        robot_type="R1Pro",
+        robot_type=ROBOT_TYPE,
        use_videos=include_videos,
        video_backend="pyav",
        batch_encoding_size=batch_encoding_size,
        image_writer_processes=image_writer_processes,
        image_writer_threads=image_writer_threads,
    )
-    
+
-    # Determine which tasks to process
+    # Find all episodes in the task folder
-    if task_names is None:
+    task_episode_ids = []
-        task_names = list(TASK_NAMES_TO_INDICES.keys())
+    for filename in os.listdir(task_folder):
-    
+        if filename.startswith("episode_") and filename.endswith(".hdf5"):
-    task_ids = [TASK_NAMES_TO_INDICES[name] for name in task_names]
+            eid = int(filename.split("_")[1].split(".")[0])
-    
+            task_episode_ids.append(eid)
-    # Process each task
+    task_episode_ids.sort()
-    total_episodes = 0
+
-    for task_id in tqdm(task_ids, desc="Processing tasks"):
+    logging.info(f"Processing {len(task_episode_ids)} episodes for task {task_name}")
-        task_name = TASK_INDICES_TO_NAMES[task_id]
+
-        task_folder = f"{data_folder}/2025-challenge-rawdata/task-{task_id:04d}"
+    # Convert each episode
-        
+    episodes_converted = 0
-        if not os.path.exists(task_folder):
+    for demo_id in tqdm(task_episode_ids, desc="Converting episodes"):
-            logging.warning(f"Task folder not found: {task_folder}")
+        convert_episode(
-            continue
+            data_folder=data_folder,
-        
+            task_id=task_id,
-        # Find all episodes for this task
+            demo_id=demo_id,
-        if episode_ids is not None:
+            dataset=dataset,
-            # Use specified episode IDs
+            include_videos=True,
-            task_episode_ids = [eid for eid in episode_ids if eid // 10000 == task_id]
+            include_segmentation=True,
-        else:
+        )
-            # Find all episodes in the task folder
+        episodes_converted += 1
-            task_episode_ids = []
+
-            for filename in os.listdir(task_folder):
+    logging.info(f"Converted {episodes_converted} episodes for task {task_name}")
-                if filename.startswith("episode_") and filename.endswith(".hdf5"):
+
                    eid = int(filename.split("_")[1].split(".")[0])
                    task_episode_ids.append(eid)
            task_episode_ids.sort()
        # Limit episodes if requested
        if max_episodes_per_task is not None:
            task_episode_ids = task_episode_ids[:max_episodes_per_task]
        logging.info(f"Processing {len(task_episode_ids)} episodes for task {task_name}")
        # Convert each episode
        for demo_id in tqdm(task_episode_ids, desc=f"Task {task_name}", leave=False):
            try:
                convert_episode(
                    data_folder=data_folder,
                    output_repo_id=output_repo_id,
                    task_id=task_id,
                    demo_id=demo_id,
                    dataset=dataset,
                    include_videos=include_videos,
                    include_segmentation=include_segmentation,
                )
                total_episodes += 1
            except Exception as e:
                logging.error(f"Failed to convert episode {demo_id}: {e}")
                continue
    logging.info(f"Converted {total_episodes} episodes total")
    # Finalize dataset
-    logging.info("Finalizing dataset...")
+    logging.info(f"Finalizing dataset for task {task_name}...")
    dataset.finalize()
-    
+
    # Push to hub if requested
    if push_to_hub:
-        logging.info("Pushing dataset to HuggingFace Hub...")
+        logging.info(f"Pushing task {task_name} dataset to HuggingFace Hub...")
-        dataset.push_to_hub(
+        dataset.push_to_hub()
-            private=True,
+
            license="apache-2.0",
        )
    logging.info("Conversion complete!")
 def main():
-    parser = argparse.ArgumentParser(description="Convert BEHAVIOR-1K data to LeRobotDataset v3.0")
+    parser = argparse.ArgumentParser(description="Convert a single BEHAVIOR-1K task to LeRobotDataset v3.0")
-    parser.add_argument("--data_folder", type=str, required=True, help="Path to the data folder")
+    parser.add_argument("--data-folder", type=str, required=True, help="Path to the data folder")
-    parser.add_argument("--output_repo_id", type=str, required=True, 
+    parser.add_argument(
-                       help="Output repository ID (e.g., 'username/behavior-dataset-v3')")
+        "--repo-id",
-    parser.add_argument("--task_names", type=str, nargs="+", default=None,
+        type=str,
-                       help="Task names to convert (default: all)")
+        required=True,
-    parser.add_argument("--episode_ids", type=int, nargs="+", default=None,
+        help="Output repository ID (e.g., 'username/behavior-1k-assembling-gift-baskets')",
-                       help="Specific episode IDs to convert")
+    )
-    parser.add_argument("--max_episodes_per_task", type=int, default=None,
+    parser.add_argument(
-                       help="Maximum episodes per task to convert")
+        "--task-id", type=int, required=True, help="Task ID to convert (e.g., 0 for assembling_gift_baskets)"
-    parser.add_argument("--no_videos", action="store_true",
+    )
-                       help="Exclude video data")
+    parser.add_argument(
-    parser.add_argument("--no_segmentation", action="store_true",
+        "--push-to-hub", action="store_true", help="Push dataset to HuggingFace Hub after conversion"
-                       help="Exclude segmentation data")
+    )
-    parser.add_argument("--fps", type=int, default=30,
+
                       help="Frames per second (default: 30)")
    parser.add_argument("--batch_encoding_size", type=int, default=1,
                       help="Number of episodes to batch before encoding videos")
    parser.add_argument("--image_writer_processes", type=int, default=0,
                       help="Number of processes for async image writing")
    parser.add_argument("--image_writer_threads", type=int, default=4,
                       help="Number of threads for image writing")
    parser.add_argument("--push_to_hub", action="store_true",
                       help="Push dataset to HuggingFace Hub")
    args = parser.parse_args()
-    
+
-    # Convert dataset
+    # Convert single task to dataset
-    convert_dataset(
+    convert_task_to_dataset(
        data_folder=args.data_folder,
-        output_repo_id=args.output_repo_id,
+        repo_id=args.repo_id,
-        task_names=args.task_names,
+        task_id=args.task_id,
        episode_ids=args.episode_ids,
        max_episodes_per_task=args.max_episodes_per_task,
        include_videos=not args.no_videos,
        include_segmentation=not args.no_segmentation,
        fps=args.fps,
        batch_encoding_size=args.batch_encoding_size,
        image_writer_processes=args.image_writer_processes,
        image_writer_threads=args.image_writer_threads,
        push_to_hub=args.push_to_hub,
    )
 if __name__ == "__main__":