mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-19 01:07:18 +00:00
fix(datasets): enforce one parquet row group per episode in v3 data writes
LeRobot v3 data shards must hold exactly one row group per episode so a reader can fetch episode i with pq.ParquetFile(path).read_row_group(i) (a byte-range read) instead of loading the whole shard. The recording writer already does this (one write_table per episode); the aggregate and lerobot-annotate re-write paths instead concatenated many episodes and wrote them in one shot, collapsing the file to a single row group. - io_utils: add write_table_one_row_group_per_episode (one ParquetWriter, one write_table per episode — same pattern as the recording writer); to_parquet_with_hf_images embeds images then writes per-episode row groups; to_parquet_one_row_group_per_episode wraps it for plain frames - aggregate: route non-image data writes through the per-episode writer; leave the episodes-metadata parquet untouched (already one row/episode) - annotate: rewrite shards via the per-episode writer instead of a single bulk pq.write_table - tests: invariant coverage through the aggregate (image + video) and annotate paths No change to on-disk schema, paths, naming, rollover thresholds, or compression. Readers stay backward-compatible (old collapsed files load).
This commit is contained in:
@@ -28,6 +28,7 @@ import pytest
|
||||
pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
|
||||
pytest.importorskip("pandas", reason="pandas is required (install lerobot[dataset])")
|
||||
|
||||
import pandas as pd # noqa: E402
|
||||
import pyarrow.parquet as pq # noqa: E402
|
||||
|
||||
from lerobot.annotations.steerable_pipeline.reader import iter_episodes # noqa: E402
|
||||
@@ -344,6 +345,78 @@ def test_annotation_metadata_sync_allows_non_streaming_load(
|
||||
assert len(dataset) == 24
|
||||
|
||||
|
||||
def _build_packed_dataset(root: Path, episode_lengths: list[int], *, fps: int = 10) -> Path:
|
||||
"""Pack several episodes into a single shard (vs build_annotation_dataset's one-per-file),
|
||||
so the writer's rewrite must re-emit one row group per episode instead of collapsing them."""
|
||||
from lerobot.datasets.io_utils import write_tasks
|
||||
from lerobot.utils.io_utils import write_json
|
||||
|
||||
data_dir = root / "data" / "chunk-000"
|
||||
data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
episode_index, frame_index, timestamp, task_index, subtask_index = [], [], [], [], []
|
||||
for ep, length in enumerate(episode_lengths):
|
||||
episode_index += [ep] * length
|
||||
frame_index += list(range(length))
|
||||
timestamp += [round(i / fps, 6) for i in range(length)]
|
||||
task_index += [0] * length
|
||||
subtask_index += [0] * length # legacy column the writer must drop
|
||||
pd.DataFrame(
|
||||
{
|
||||
"episode_index": episode_index,
|
||||
"frame_index": frame_index,
|
||||
"timestamp": timestamp,
|
||||
"task_index": task_index,
|
||||
"subtask_index": subtask_index,
|
||||
}
|
||||
).to_parquet(data_dir / "file-000.parquet", index=False)
|
||||
|
||||
tasks_df = pd.DataFrame({"task_index": [0]}, index=pd.Index(["do the thing"], name="task"))
|
||||
write_tasks(tasks_df, root)
|
||||
write_json(
|
||||
{"codebase_version": "v3.1", "fps": fps, "features": {}, "total_episodes": len(episode_lengths)},
|
||||
root / "meta" / "info.json",
|
||||
)
|
||||
return root
|
||||
|
||||
|
||||
def test_writer_one_row_group_per_episode(tmp_path: Path) -> None:
|
||||
"""Rewriting a packed shard must keep one row group per episode, not collapse
|
||||
every episode into a single giant row group."""
|
||||
episode_lengths = [4, 6, 5] # unequal lengths, all in one shard
|
||||
root = _build_packed_dataset(tmp_path / "ds", episode_lengths)
|
||||
shard = root / "data" / "chunk-000" / "file-000.parquet"
|
||||
assert pq.ParquetFile(shard).metadata.num_row_groups == 1, "fixture should start collapsed"
|
||||
|
||||
staging_dir = tmp_path / "stage"
|
||||
for ep in range(len(episode_lengths)):
|
||||
_stage_episode(
|
||||
staging_dir,
|
||||
ep,
|
||||
plan=[
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": f"subtask for ep {ep}",
|
||||
"style": "subtask",
|
||||
"timestamp": 0.0,
|
||||
"tool_calls": None,
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
records = list(iter_episodes(root))
|
||||
LanguageColumnsWriter().write_all(records, staging_dir, root)
|
||||
|
||||
# One row group per episode, with row counts matching the episode lengths.
|
||||
md = pq.ParquetFile(shard).metadata
|
||||
assert md.num_row_groups == len(episode_lengths)
|
||||
assert [md.row_group(i).num_rows for i in range(md.num_row_groups)] == episode_lengths
|
||||
# Language columns are still present after the per-episode rewrite.
|
||||
table = pq.read_table(shard)
|
||||
assert "language_persistent" in table.column_names
|
||||
assert "language_events" in table.column_names
|
||||
|
||||
|
||||
def test_speech_atom_shape_matches_plan_spec() -> None:
|
||||
atom = speech_atom(2.5, "I'm cleaning up!")
|
||||
assert atom["role"] == "assistant"
|
||||
|
||||
@@ -32,6 +32,26 @@ from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
from tests.fixtures.constants import DUMMY_REPO_ID
|
||||
|
||||
|
||||
def assert_data_shards_one_row_group_per_episode(root):
|
||||
"""Every aggregated DATA shard must have exactly one parquet row group per episode."""
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
shards = sorted((root / "data").rglob("*.parquet"))
|
||||
assert shards, f"no data shards found under {root}/data"
|
||||
n_episodes = 0
|
||||
for shard in shards:
|
||||
pf = pq.ParquetFile(shard)
|
||||
episodes = pf.read(columns=["episode_index"]).column("episode_index").to_pylist()
|
||||
assert pf.metadata.num_row_groups == len(set(episodes)), shard
|
||||
for i in range(pf.metadata.num_row_groups):
|
||||
rg_episodes = set(
|
||||
pf.read_row_group(i, columns=["episode_index"]).column("episode_index").to_pylist()
|
||||
)
|
||||
assert len(rg_episodes) == 1, f"{shard} row group {i} spans episodes {rg_episodes}"
|
||||
n_episodes += len(set(episodes))
|
||||
return n_episodes
|
||||
|
||||
|
||||
def assert_episode_and_frame_counts(aggr_ds, expected_episodes, expected_frames):
|
||||
"""Test that total number of episodes and frames are correctly aggregated."""
|
||||
assert aggr_ds.num_episodes == expected_episodes, (
|
||||
@@ -566,6 +586,41 @@ def assert_image_frames_integrity(aggr_ds, ds_0, ds_1):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_videos", [True, False], ids=["video", "image"])
|
||||
def test_aggregate_one_row_group_per_episode(tmp_path, lerobot_dataset_factory, use_videos):
|
||||
"""Aggregated DATA shards keep one row group per episode (not one collapsed group).
|
||||
|
||||
Covers both the non-image (``df.to_parquet``) and image
|
||||
(``to_parquet_with_hf_images``) write branches, including the merge-into-
|
||||
existing-file branch via a low file-size threshold that forces packing.
|
||||
"""
|
||||
ds_0 = lerobot_dataset_factory(
|
||||
root=tmp_path / "rg_0",
|
||||
repo_id=f"{DUMMY_REPO_ID}_rg_0",
|
||||
total_episodes=3,
|
||||
total_frames=60,
|
||||
use_videos=use_videos,
|
||||
)
|
||||
ds_1 = lerobot_dataset_factory(
|
||||
root=tmp_path / "rg_1",
|
||||
repo_id=f"{DUMMY_REPO_ID}_rg_1",
|
||||
total_episodes=4,
|
||||
total_frames=80,
|
||||
use_videos=use_videos,
|
||||
)
|
||||
|
||||
aggr_root = tmp_path / "rg_aggr"
|
||||
aggregate_datasets(
|
||||
repo_ids=[ds_0.repo_id, ds_1.repo_id],
|
||||
roots=[ds_0.root, ds_1.root],
|
||||
aggr_repo_id=f"{DUMMY_REPO_ID}_rg_aggr",
|
||||
aggr_root=aggr_root,
|
||||
)
|
||||
|
||||
n_episodes = assert_data_shards_one_row_group_per_episode(aggr_root)
|
||||
assert n_episodes == ds_0.num_episodes + ds_1.num_episodes
|
||||
|
||||
|
||||
def test_aggregate_image_datasets(tmp_path, lerobot_dataset_factory):
|
||||
"""Test aggregation of image-based datasets preserves HuggingFace Image schema.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user