From 723bd71cf2be32579e957c0863d82c7b3bfb7a1a Mon Sep 17 00:00:00 2001 From: FennMai Date: Thu, 30 Apr 2026 07:03:03 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fix=20v30=5Fto=5Fv21=20ArrowType?= =?UTF-8?q?Error=20on=20pandas=20extension=20dtypes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `table.slice(...).to_pandas()` produces pandas ExtensionArrays for `array[float32]` columns (e.g. `observation.states.end.orientation`) on newer pandas/pyarrow combos, which then fail in `pa.Table.from_pandas` inside `Dataset.from_pandas(...).to_parquet(...)`. Skip the pandas round-trip and wrap the `pa.Table` slice in a `Dataset` directly with `Dataset(episode_table).to_parquet(...)`. This preserves the HuggingFace dataset metadata that `Dataset.to_parquet` writes, while avoiding the ExtensionArray crash. No version pin on datasets/pyarrow needed. Closes #87 --- ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py b/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py index 11b1696..6531f17 100644 --- a/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py +++ b/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py @@ -181,7 +181,7 @@ def convert_data(root: Path, new_root: Path, episode_records: list[dict[str, Any f"episode_index={episode_index}, length={length}" ) - episode_table = table.slice(start, length).to_pandas() + episode_table = table.slice(start, length) dest_chunk = episode_index // DEFAULT_CHUNK_SIZE dest_path = new_root / LEGACY_DATA_PATH_TEMPLATE.format( @@ -189,7 +189,7 @@ def convert_data(root: Path, new_root: Path, episode_records: list[dict[str, Any episode_index=episode_index, ) dest_path.parent.mkdir(parents=True, exist_ok=True) - Dataset.from_pandas(episode_table).to_parquet(dest_path) + Dataset(episode_table).to_parquet(dest_path) def _group_episodes_by_video_file(