diff --git a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py index 2a69945e1..5362c52f4 100644 --- a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py +++ b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py @@ -204,7 +204,6 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int): image_keys = get_image_keys(root) - ep_idx = 0 chunk_idx = 0 file_idx = 0 size_in_mb = 0 @@ -214,9 +213,24 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int): logging.info(f"Converting data files from {len(ep_paths)} episodes") - for ep_path in tqdm.tqdm(ep_paths, desc="convert data files"): + for ep_idx, ep_path in enumerate(tqdm.tqdm(ep_paths, desc="convert data files")): ep_size_in_mb = get_parquet_file_size_in_mb(ep_path) ep_num_frames = get_parquet_num_frames(ep_path) + + # Check if we need to start a new file BEFORE creating metadata + if size_in_mb + ep_size_in_mb >= data_file_size_in_mb and len(paths_to_cat) > 0: + # Write the accumulated data files + concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys) + + # Move to next file + chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE) + + # Reset for the next file + size_in_mb = 0 + num_frames += ep_num_frames # Still need to accumulate total frames + paths_to_cat = [] + + # Now create metadata with correct chunk/file indices ep_metadata = { "episode_index": ep_idx, "data/chunk_index": chunk_idx, @@ -227,20 +241,7 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int): size_in_mb += ep_size_in_mb num_frames += ep_num_frames episodes_metadata.append(ep_metadata) - ep_idx += 1 - - if size_in_mb < data_file_size_in_mb: - paths_to_cat.append(ep_path) - continue - - if paths_to_cat: - concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys) - - # Reset for the next file - size_in_mb = ep_size_in_mb - paths_to_cat = [ep_path] - - chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE) + paths_to_cat.append(ep_path) # Write remaining data if any if paths_to_cat: