From 0d1be72dc8309b8841c363cdd322174ed13a7c9f Mon Sep 17 00:00:00 2001 From: Paul Crook <37202747+skiingpacman@users.noreply.github.com> Date: Thu, 5 Mar 2026 00:53:34 +0900 Subject: [PATCH] Fixing metadata indexing when writing new Parquet file (#2941) * Fixing metadata indexing when writing new Parquet file Summary: - addressing this issue: https://github.com/huggingface/lerobot/issues/2401 - vibe-coded bugfix by Claude Sonnet 4.5 * Backing out changes to convert_videos_of_camera * Addressing Ruff pre-commit complaint Summary: - addressing "SIM113 Use `enumerate()` for index variable `ep_idx` in `for` loop" --------- Co-authored-by: Paul <238953601+pac-robotics@users.noreply.github.com> --- .../v30/convert_dataset_v21_to_v30.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py index 2a69945e1..5362c52f4 100644 --- a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py +++ b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py @@ -204,7 +204,6 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int): image_keys = get_image_keys(root) - ep_idx = 0 chunk_idx = 0 file_idx = 0 size_in_mb = 0 @@ -214,9 +213,24 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int): logging.info(f"Converting data files from {len(ep_paths)} episodes") - for ep_path in tqdm.tqdm(ep_paths, desc="convert data files"): + for ep_idx, ep_path in enumerate(tqdm.tqdm(ep_paths, desc="convert data files")): ep_size_in_mb = get_parquet_file_size_in_mb(ep_path) ep_num_frames = get_parquet_num_frames(ep_path) + + # Check if we need to start a new file BEFORE creating metadata + if size_in_mb + ep_size_in_mb >= data_file_size_in_mb and len(paths_to_cat) > 0: + # Write the accumulated data files + concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys) + + # Move to next file + chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE) + + # Reset for the next file + size_in_mb = 0 + num_frames += ep_num_frames # Still need to accumulate total frames + paths_to_cat = [] + + # Now create metadata with correct chunk/file indices ep_metadata = { "episode_index": ep_idx, "data/chunk_index": chunk_idx, @@ -227,20 +241,7 @@ def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int): size_in_mb += ep_size_in_mb num_frames += ep_num_frames episodes_metadata.append(ep_metadata) - ep_idx += 1 - - if size_in_mb < data_file_size_in_mb: - paths_to_cat.append(ep_path) - continue - - if paths_to_cat: - concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys) - - # Reset for the next file - size_in_mb = ep_size_in_mb - paths_to_cat = [ep_path] - - chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE) + paths_to_cat.append(ep_path) # Write remaining data if any if paths_to_cat: