mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 03:59:42 +00:00
new things
This commit is contained in:
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
import pyarrow.parquet as pq
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
# Directory tree that is scanned for converted parquet files.
root_dir = "/raid/jade/libero_converted"

# Column renames; left empty on purpose, so no column is renamed.
rename_map = {}

# Hugging Face "features" description, stored under the b"huggingface" key.
# The same value is used for every file. It is written as adjacent bytes
# literals (joined by the parser into one value) purely for readability.
HF_METADATA = {
    b"huggingface": (
        b'{"info": {"features": {'
        b'"observation.images.image": {"_type": "Image"}, '
        b'"observation.images.image2": {"_type": "Image"}, '
        b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, '
        b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, '
        b'"timestamp": {"dtype": "float32", "_type": "Value"}, '
        b'"frame_index": {"dtype": "int64", "_type": "Value"}, '
        b'"episode_index": {"dtype": "int64", "_type": "Value"}, '
        b'"index": {"dtype": "int64", "_type": "Value"}, '
        b'"task_index": {"dtype": "int64", "_type": "Value"}}}}'
    ),
}
|
||||
|
||||
def patch_parquet(parquet_path, hf_metadata):
    """Rewrite a parquet file in place with extra schema-level metadata.

    The file's existing schema metadata is kept; the entries of
    ``hf_metadata`` are merged on top and win when a key already exists.

    Args:
        parquet_path: Path of the parquet file to rewrite.
        hf_metadata: Mapping of metadata keys to values to merge into the
            table's schema metadata.

    Returns:
        True when the file was rewritten, False when any step failed. The
        failure is printed instead of raised so a batch run can continue
        with the next file.
    """
    tmp_path = None
    try:
        table = pq.read_table(parquet_path)

        # Merge: start from what the file already carries, then overlay.
        merged_meta = dict(table.schema.metadata or {})
        merged_meta.update(hf_metadata)
        table = table.replace_schema_metadata(merged_meta)

        # Stage the rewrite next to the target so the final swap is a
        # same-filesystem rename rather than a cross-device copy that could
        # leave a half-written file behind. The suffix is deliberately not
        # ".parquet" so a leftover temp file is never mistaken for data.
        target_dir = os.path.dirname(os.path.abspath(parquet_path))
        tmp_fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
        os.close(tmp_fd)
        pq.write_table(table, tmp_path)

        # mkstemp creates the file with owner-only permissions; carry the
        # original file's mode over so the rewrite does not change access.
        shutil.copymode(parquet_path, tmp_path)
        os.replace(tmp_path, parquet_path)
        tmp_path = None  # ownership handed over; nothing left to clean up

        print(f"✅ Patched: {parquet_path}")
        return True
    except Exception as e:
        print(f"❌ Failed on {parquet_path}: {e}")
        return False
    finally:
        # Remove the staged file if the swap never happened.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; the original failure was already reported.
                pass
|
||||
|
||||
# Recursively visit everything below root_dir and patch each parquet file.
for dirpath, _, filenames in os.walk(root_dir):
    for fname in filenames:
        # Only parquet files are of interest; skip everything else.
        if not fname.endswith(".parquet"):
            continue
        fpath = os.path.join(dirpath, fname)
        patch_parquet(fpath, HF_METADATA)
|
||||
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import pyarrow.parquet as pq
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
# Files to patch: episodes 21 through 25 of chunk-000 in the local cache.
_CHUNK_DIR = "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000"
FILES_TO_PATCH = [
    f"{_CHUNK_DIR}/episode_{episode:06d}.parquet" for episode in range(21, 26)
]

# Column renames applied before the metadata is written
# ("old_column_name": "new_column_name").
rename_map = {
    "image": "observation.images.image",
    "image2": "observation.images.image2",
    "actions": "action",
}

# Per-feature JSON fragments of the Hugging Face "features" description.
# NOTE(review): the fragment below still names "actions", while rename_map
# renames that column to "action" -- confirm which name is intended.
_FEATURE_SPECS = (
    b'"observation.images.image": {"_type": "Image"}',
    b'"observation.images.image2": {"_type": "Image"}',
    b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}',
    b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}',
    b'"timestamp": {"dtype": "float32", "_type": "Value"}',
    b'"frame_index": {"dtype": "int64", "_type": "Value"}',
    b'"episode_index": {"dtype": "int64", "_type": "Value"}',
    b'"index": {"dtype": "int64", "_type": "Value"}',
    b'"task_index": {"dtype": "int64", "_type": "Value"}',
)

# Hugging Face features metadata (constant across all files), assembled from
# the fragments above into a single JSON document stored as bytes.
HF_METADATA = {
    b"huggingface": b'{"info": {"features": {' + b", ".join(_FEATURE_SPECS) + b"}}}",
}
|
||||
|
||||
def patch_parquet(parquet_path, hf_metadata, rename_map=None):
    """Rewrite a parquet file in place, renaming columns and adding metadata.

    Columns listed in ``rename_map`` are renamed first; every other column
    keeps its name. The file's existing schema metadata is then kept and the
    entries of ``hf_metadata`` are merged on top, winning on key collisions.

    Args:
        parquet_path: Path of the parquet file to rewrite.
        hf_metadata: Mapping of metadata keys to values to merge into the
            table's schema metadata.
        rename_map: Optional mapping of old column name to new column name.
            ``None`` or an empty mapping skips the renaming step.

    Returns:
        True when the file was rewritten, False when any step failed. The
        failure is printed instead of raised so a batch run can continue
        with the next file.
    """
    tmp_path = None
    try:
        table = pq.read_table(parquet_path)

        if rename_map:
            # Columns without an entry in the map fall through unchanged.
            renamed = [rename_map.get(name, name) for name in table.schema.names]
            table = table.rename_columns(renamed)

        # Merge: start from what the file already carries, then overlay.
        merged_meta = dict(table.schema.metadata or {})
        merged_meta.update(hf_metadata)
        table = table.replace_schema_metadata(merged_meta)

        # Stage the rewrite next to the target so the final swap is a
        # same-filesystem rename rather than a cross-device copy that could
        # leave a half-written file behind. The suffix is deliberately not
        # ".parquet" so a leftover temp file is never mistaken for data.
        target_dir = os.path.dirname(os.path.abspath(parquet_path))
        tmp_fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
        os.close(tmp_fd)
        pq.write_table(table, tmp_path)

        # mkstemp creates the file with owner-only permissions; carry the
        # original file's mode over so the rewrite does not change access.
        shutil.copymode(parquet_path, tmp_path)
        os.replace(tmp_path, parquet_path)
        tmp_path = None  # ownership handed over; nothing left to clean up

        print(f"✅ Patched: {parquet_path}")
        return True
    except Exception as e:
        print(f"❌ Failed on {parquet_path}: {e}")
        return False
    finally:
        # Remove the staged file if the swap never happened.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; the original failure was already reported.
                pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Patch each listed file, reporting (but not failing on) missing ones.
    for fpath in FILES_TO_PATCH:
        if not os.path.exists(fpath):
            print(f"⚠️ File not found: {fpath}")
            continue
        patch_parquet(fpath, HF_METADATA, rename_map)
|
||||
Reference in New Issue
Block a user