mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 03:59:42 +00:00
fix: implement actual conversion for lerobotdataset-v3 compatibility
This commit is contained in:
@@ -59,10 +59,19 @@ NEW_ROOT = Path("/fsx/jade_choghari/tmp/bb")
|
|||||||
|
|
||||||
|
|
||||||
def fix_episode_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
def fix_episode_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
"""Performs several fixes to an underlying dataframe to make it LeRobotDataset-v3 compatible"""
|
||||||
# Inject per-episode frame_index if missing (0..N-1 within each episode)
|
# Inject per-episode frame_index if missing (0..N-1 within each episode)
|
||||||
if "frame_index" not in df.columns:
|
if "frame_index" not in df.columns:
|
||||||
df["frame_index"] = range(len(df))
|
df["frame_index"] = range(len(df))
|
||||||
|
|
||||||
|
# Remove variable-length task_info feature (NOTE(fracapuano): change to padding at some point?)
|
||||||
|
if "observation.task_info" in df.columns:
|
||||||
|
df = df.drop(columns=["observation.task_info"])
|
||||||
|
|
||||||
|
# NOTE(fracapuano): tasks are ordered (and there is one task per file/dataset)
|
||||||
|
if "task_index" in df.columns:
|
||||||
|
df["task_index"] = 0
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
@@ -96,8 +105,13 @@ def convert_info(
|
|||||||
root, new_root, data_file_size_in_mb, video_file_size_in_mb, meta_path, task_id: int, task_ranges, step
|
root, new_root, data_file_size_in_mb, video_file_size_in_mb, meta_path, task_id: int, task_ranges, step
|
||||||
):
|
):
|
||||||
info = load_info(root)
|
info = load_info(root)
|
||||||
|
features = {**info["features"], **DEFAULT_FEATURES}
|
||||||
|
del features[
|
||||||
|
"observation.task_info"
|
||||||
|
] # variable-length task_info is not supported in LeRobotDataset v3.0!
|
||||||
|
|
||||||
info["codebase_version"] = "v3.0"
|
info["codebase_version"] = "v3.0"
|
||||||
info["features"] = {**info["features"], **DEFAULT_FEATURES}
|
info["features"] = features
|
||||||
del info["total_videos"]
|
del info["total_videos"]
|
||||||
info["data_files_size_in_mb"] = data_file_size_in_mb
|
info["data_files_size_in_mb"] = data_file_size_in_mb
|
||||||
info["video_files_size_in_mb"] = video_file_size_in_mb
|
info["video_files_size_in_mb"] = video_file_size_in_mb
|
||||||
@@ -135,7 +149,8 @@ def convert_tasks(root, new_root, task_id: int):
|
|||||||
if task_id not in tasks:
|
if task_id not in tasks:
|
||||||
raise ValueError(f"Task ID {task_id} not found in tasks (available: {list(tasks.keys())})")
|
raise ValueError(f"Task ID {task_id} not found in tasks (available: {list(tasks.keys())})")
|
||||||
tasks = {task_id: tasks[task_id]}
|
tasks = {task_id: tasks[task_id]}
|
||||||
task_indices = tasks.keys()
|
# Tasks are ordered with 0..ntasks-1 in the converted dataset
|
||||||
|
task_indices = range(len(tasks.keys()))
|
||||||
task_strings = tasks.values()
|
task_strings = tasks.values()
|
||||||
df_tasks = pd.DataFrame({"task_index": task_indices}, index=task_strings)
|
df_tasks = pd.DataFrame({"task_index": task_indices}, index=task_strings)
|
||||||
write_tasks(df_tasks, new_root)
|
write_tasks(df_tasks, new_root)
|
||||||
@@ -502,11 +517,16 @@ def convert_episodes_metadata(
|
|||||||
if len(num_eps_set) != 1:
|
if len(num_eps_set) != 1:
|
||||||
raise ValueError(f"Number of episodes is not the same ({num_eps_set}).")
|
raise ValueError(f"Number of episodes is not the same ({num_eps_set}).")
|
||||||
|
|
||||||
|
# Single file approach: set meta indices to 0 for all rows and write once
|
||||||
ds_episodes = Dataset.from_generator(
|
ds_episodes = Dataset.from_generator(
|
||||||
lambda: generate_episode_metadata_dict(
|
lambda: generate_episode_metadata_dict(
|
||||||
episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_video_metadata
|
episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_video_metadata
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
num_eps = len(ds_episodes)
|
||||||
|
# NOTE(fracapuano): for the size of the average dataset this is fine!
|
||||||
|
ds_episodes = ds_episodes.add_column("meta/episodes/chunk_index", [0] * num_eps)
|
||||||
|
ds_episodes = ds_episodes.add_column("meta/episodes/file_index", [0] * num_eps)
|
||||||
write_episodes(ds_episodes, new_root)
|
write_episodes(ds_episodes, new_root)
|
||||||
|
|
||||||
stats = aggregate_stats(list(episodes_stats.values()))
|
stats = aggregate_stats(list(episodes_stats.values()))
|
||||||
|
|||||||
Reference in New Issue
Block a user