add support for agibot2lerobot (#15)

Co-authored-by: ModiShi <modishi@buaa.edu.cn>
Co-authored-by: aopolin-lv <aopolin.ii@gmail.com>
Co-authored-by: HaomingSong <haomingsong24@gmail.com>
This commit is contained in:
Qizhi Chen
2025-04-14 20:01:09 +08:00
committed by GitHub
parent 9ca6ce773b
commit fe558f7adb
8 changed files with 1038 additions and 1 deletions
@@ -0,0 +1,94 @@
import json
from pathlib import Path
import h5py
import numpy as np
from PIL import Image
def get_task_instruction(task_json_path: str) -> str:
    """Build the language instruction of a task from its task-info JSON file.

    The file is expected to hold a JSON list; only its first entry is used.

    Args:
        task_json_path: Path of the task-info JSON file.

    Returns:
        The string ``"<task_name>.<init_scene_text>"`` built from the first entry.

    Raises:
        OSError: If the file cannot be opened.
        KeyError, IndexError: If the first entry or one of the two fields is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(task_json_path, encoding="utf-8") as f:
        task_info = json.load(f)

    first_entry = task_info[0]
    task_name = first_entry["task_name"]
    task_init_scene = first_entry["init_scene_text"]
    return f"{task_name}.{task_init_scene}"
def load_depths(root_dir: str, camera_name: str):
cam_path = Path(root_dir)
all_imgs = sorted(list(cam_path.glob(f"{camera_name}*")))
return [np.array(Image.open(f)).astype(np.float32)[:, :, None] / 1000 for f in all_imgs]
def _expand_sparse_action(h5_file, action_key: str, action_value: np.ndarray) -> np.ndarray:
    """Pad an action that was recorded for only some frames to one row per frame.

    The state dataset with the same name is used as the per-frame baseline; the recorded
    action rows then overwrite the baseline at the positions stored in the ``index``
    dataset that sits next to the action in the HDF5 file.
    """
    state_key = action_key.replace("actions", "state").replace(".", "/")
    padded = np.array(h5_file[state_key], dtype=np.float32)

    group_path = action_key.replace("actions", "action").split(".")[:-1]
    index_key = "/".join([*group_path, "index"])
    action_index = np.array(h5_file[index_key])
    # agibot lost end index, replace it with joint
    if not action_index.size:
        action_index = np.array(h5_file[index_key.replace("end", "joint")])

    padded[action_index] = action_value
    return padded


def _value_or_zeros(values: np.ndarray, frame_index: int, spec: dict) -> np.ndarray:
    """Return the row of ``values`` for one frame, or zeros shaped per ``spec`` if nothing was recorded."""
    if values.size:
        return values[frame_index]
    return np.zeros(spec["shape"], dtype=spec["dtype"])


def load_local_dataset(
    episode_id: int, src_path: str, task_id: int, task_name: str, save_depth: bool, AgiBotWorld_CONFIG: dict
) -> tuple[list, dict]:
    """Load one locally stored episode as per-frame records plus its video paths.

    Args:
        episode_id: Episode directory name below the task directory.
        src_path: Dataset root containing ``observations/`` and ``proprio_stats/``.
        task_id: Task directory name below ``observations/`` and ``proprio_stats/``.
        task_name: Value stored under the ``"task"`` key of every frame.
        save_depth: When true, head depth images are loaded and added to every frame.
        AgiBotWorld_CONFIG: Feature configuration with ``"images"``, ``"states"`` and
            ``"actions"`` sections; each state/action spec provides ``"shape"`` and ``"dtype"``.

    Returns:
        ``(frames, videos)``: ``frames`` is a list with one dict per frame holding the
        state values (``observation.states.*``), the action values (``actions.*``), the
        optional ``observation.images.head_depth`` array and ``"task"``. ``videos`` maps
        ``observation.images.<camera>`` to the video file path of every non-depth camera.

    Raises:
        AssertionError: If ``save_depth`` is set and the number of depth images differs
            from the number of frames.
    """
    ob_dir = Path(src_path) / f"observations/{task_id}/{episode_id}"
    proprio_dir = Path(src_path) / f"proprio_stats/{task_id}/{episode_id}"

    state_specs = AgiBotWorld_CONFIG["states"]
    action_specs = AgiBotWorld_CONFIG["actions"]

    with h5py.File(proprio_dir / "proprio_stats.h5", "r") as f:
        # Dotted feature names map to nested HDF5 groups: "end.position" -> "state/end/position".
        state = {
            f"observation.states.{key}": np.array(f["state/" + key.replace(".", "/")], dtype=np.float32)
            for key in state_specs
        }
        action = {
            f"actions.{key}": np.array(f["action/" + key.replace(".", "/")], dtype=np.float32)
            for key in action_specs
        }

        # Empty states are zero-filled below, so the episode length must come from the
        # first state that actually holds data, not blindly from the first one.
        num_frames = next((len(value) for value in state.values() if value.size), 0)

        # HACK: agibot team forgot to pad some of the values
        for action_key, action_value in action.items():
            if action_value.size and len(action_value) != num_frames:
                # Replacing the value of an existing key is safe while iterating.
                action[action_key] = _expand_sparse_action(f, action_key, action_value)

    depth_imgs = []
    if save_depth:
        depth_imgs = load_depths(ob_dir / "depth", "head_depth")
        assert num_frames == len(depth_imgs), "Number of images and states are not equal"

    frames = []
    for i in range(num_frames):
        frame = {}
        if save_depth:
            frame["observation.images.head_depth"] = depth_imgs[i]
        for key, value in state.items():
            frame[key] = _value_or_zeros(value, i, state_specs[key.removeprefix("observation.states.")])
        for key, value in action.items():
            frame[key] = _value_or_zeros(value, i, action_specs[key.removeprefix("actions.")])
        frame["task"] = task_name
        frames.append(frame)

    videos = {
        f"observation.images.{key}": (
            ob_dir / "tactile" / f"{key}.mp4"  # HACK: handle tactile videos
            if "sensor" in key
            else ob_dir / "videos" / f"{key}_color.mp4"
        )
        for key in AgiBotWorld_CONFIG["images"]
        if "depth" not in key
    }
    return frames, videos
+310
View File
@@ -0,0 +1,310 @@
def _rgb_video_feature(height: int, width: int) -> dict:
    """Return a fresh feature spec for an RGB camera stream stored as video."""
    return {
        "dtype": "video",
        "shape": (height, width, 3),
        "names": ["height", "width", "rgb"],
    }


def _motor_feature(shape: tuple, motors: list) -> dict:
    """Return a fresh float32 feature spec whose entries are labelled by ``motors``."""
    return {"dtype": "float32", "shape": shape, "names": {"motors": motors}}


def _arm_joint_names() -> list:
    """Return the 14 arm joint labels: ``left_arm_0..6`` followed by ``right_arm_0..6``."""
    return [f"{side}_arm_{index}" for side in ("left", "right") for index in range(7)]


# Feature layout of the gripper-equipped robot. Every builder call returns new
# objects, so no spec or name list is shared between entries.
AgiBotWorld_BETA_GRIPPER_CONFIG = {
    "images": {
        "head": _rgb_video_feature(480, 640),
        "head_center_fisheye": _rgb_video_feature(748, 960),
        # Depth is the only camera stored as single-channel images instead of video.
        "head_depth": {
            "dtype": "image",
            "shape": (480, 640, 1),
            "names": ["height", "width", "channel"],
        },
        "head_left_fisheye": _rgb_video_feature(748, 960),
        "head_right_fisheye": _rgb_video_feature(748, 960),
        "hand_left": _rgb_video_feature(480, 640),
        "hand_right": _rgb_video_feature(480, 640),
        "back_left_fisheye": _rgb_video_feature(748, 960),
        "back_right_fisheye": _rgb_video_feature(748, 960),
    },
    "states": {
        "effector.position": _motor_feature((2,), ["left_gripper", "right_gripper"]),
        "end.orientation": _motor_feature((2, 4), ["left_xyzw", "right_xyzw"]),
        "end.position": _motor_feature((2, 3), ["left_xyz", "right_xyz"]),
        # The second head axis was misspelled "patch"; it is the pitch axis.
        "head.position": _motor_feature((2,), ["yaw", "pitch"]),
        "joint.current_value": _motor_feature((14,), _arm_joint_names()),
        "joint.position": _motor_feature((14,), _arm_joint_names()),
        "robot.orientation": _motor_feature((4,), ["x", "y", "z", "w"]),
        "robot.position": _motor_feature((3,), ["x", "y", "z"]),
        "waist.position": _motor_feature((2,), ["pitch", "lift"]),
    },
    "actions": {
        "effector.position": _motor_feature((2,), ["left_gripper", "right_gripper"]),
        "end.orientation": _motor_feature((2, 4), ["left_xyzw", "right_xyzw"]),
        "end.position": _motor_feature((2, 3), ["left_xyz", "right_xyz"]),
        "head.position": _motor_feature((2,), ["yaw", "pitch"]),
        "joint.position": _motor_feature((14,), _arm_joint_names()),
        "robot.velocity": _motor_feature((2,), ["x_vel", "yaw_vel"]),
        "waist.position": _motor_feature((2,), ["pitch", "lift"]),
    },
}
# Feature layout of the dexterous-hand robot. States and actions reuse the gripper
# specs; only "effector.position" is overridden with the 12 hand joints.
AgiBotWorld_BETA_DEXHAND_CONFIG = {
    "images": {
        "head": {"dtype": "video", "shape": (480, 640, 3), "names": ["height", "width", "rgb"]},
        "head_center_fisheye": {"dtype": "video", "shape": (748, 960, 3), "names": ["height", "width", "rgb"]},
        "head_depth": {"dtype": "image", "shape": (480, 640, 1), "names": ["height", "width", "channel"]},
        "head_left_fisheye": {"dtype": "video", "shape": (748, 960, 3), "names": ["height", "width", "rgb"]},
        "head_right_fisheye": {"dtype": "video", "shape": (748, 960, 3), "names": ["height", "width", "rgb"]},
        "hand_left_fisheye": {"dtype": "video", "shape": (748, 960, 3), "names": ["height", "width", "rgb"]},
        "hand_right_fisheye": {"dtype": "video", "shape": (748, 960, 3), "names": ["height", "width", "rgb"]},
        "back_left_fisheye": {"dtype": "video", "shape": (748, 960, 3), "names": ["height", "width", "rgb"]},
        "back_right_fisheye": {"dtype": "video", "shape": (748, 960, 3), "names": ["height", "width", "rgb"]},
    },
    "states": {
        **AgiBotWorld_BETA_GRIPPER_CONFIG["states"],
        # Overrides the 2-value gripper entry: left_joint_0..5 then right_joint_0..5.
        "effector.position": {
            "dtype": "float32",
            "shape": (12,),
            "names": {"motors": [f"{side}_joint_{index}" for side in ("left", "right") for index in range(6)]},
        },
    },
    "actions": {
        **AgiBotWorld_BETA_GRIPPER_CONFIG["actions"],
        # Same override as for the states.
        "effector.position": {
            "dtype": "float32",
            "shape": (12,),
            "names": {"motors": [f"{side}_joint_{index}" for side in ("left", "right") for index in range(6)]},
        },
    },
}
# Feature layout of the tactile setup: the gripper configuration plus four tactile
# sensor video streams appended after the gripper cameras.
AgiBotWorld_BETA_TACTILE_CONFIG = dict(AgiBotWorld_BETA_GRIPPER_CONFIG)
AgiBotWorld_BETA_TACTILE_CONFIG["images"] = {
    **AgiBotWorld_BETA_GRIPPER_CONFIG["images"],
    **{
        f"{side}_sensor_{index}": {
            "dtype": "video",
            "shape": (700, 400, 3),
            "names": ["height", "width", "rgb"],
        }
        for side in ("left", "right")
        for index in (1, 2)
    },
}
# Task statistics coming from https://docs.google.com/spreadsheets/d/1GWMFHYo3UJADS7kkScoJ5ObbQfAFasPuaeC7TJUr1Cc/edit?gid=0#gid=0
# Maps each end-effector type to its feature configuration and the task ids recorded with it.
AgiBotWorld_TASK_TYPE = {
    "gripper": {
        "task_config": AgiBotWorld_BETA_GRIPPER_CONFIG,
        # Left empty on purpose: every task not listed below is a gripper task.
        "task_ids": [],
    },
    "dexhand": {
        "task_config": AgiBotWorld_BETA_DEXHAND_CONFIG,
        "task_ids": [
            f"task_{number}"
            for number in (
                475, 536, 547, 548, 549, 554, 577, 578, 591, 595, 608,
                620, 622, 660, 679, 705, 710, 727, 730, 731, 749, 753,
            )
        ],
    },
    "tactile": {
        "task_config": AgiBotWorld_BETA_TACTILE_CONFIG,
        "task_ids": [f"task_{number}" for number in (666, 675, 676, 677, 694, 737, 774)],
    },
}
@@ -0,0 +1,75 @@
import numpy as np
import torch
import torchvision
from lerobot.common.datasets.compute_stats import auto_downsample_height_width, get_feature_stats, sample_indices
# Module-level side effect: select the "pyav" video backend of torchvision on import.
torchvision.set_video_backend("pyav")
def generate_features_from_config(AgiBotWorld_CONFIG):
    """Flatten a sectioned configuration into a single feature dictionary.

    Entries of the ``"images"``, ``"states"`` and ``"actions"`` sections are emitted in
    that order under the key prefixes ``observation.images.``, ``observation.states.``
    and ``actions.``. The spec objects are reused as-is, not copied.
    """
    section_prefixes = (
        ("images", "observation.images."),
        ("states", "observation.states."),
        ("actions", "actions."),
    )
    return {
        f"{prefix}{name}": spec
        for section, prefix in section_prefixes
        for name, spec in AgiBotWorld_CONFIG[section].items()
    }
def sample_images(input):
    """Sample a subset of frames from a video file or an in-memory frame stack.

    Args:
        input: Either the path of a video file (``str`` or ``os.PathLike``) or a numpy
            array of frames. The array gets a channel axis inserted at position 1, so it
            is assumed to be ``[T, H, W]`` -- TODO confirm against the caller.

    Returns:
        An array stacking the sampled frames after ``auto_downsample_height_width``,
        with the dtype of the source frames, or ``None`` when no index was sampled.

    Raises:
        TypeError: If ``input`` is neither a path nor a numpy array.
    """
    import os  # local import: keeps this block self-contained within the module

    if isinstance(input, (str, os.PathLike)):
        # Path objects are accepted as well; the reader itself is handed a plain string.
        reader = torchvision.io.VideoReader(os.fspath(input), stream="video")
        # NOTE: the whole video is decoded into memory before sampling.
        frames_array = torch.stack([frame["data"] for frame in reader]).numpy()  # Shape: [T, C, H, W]
    elif isinstance(input, np.ndarray):
        frames_array = input[:, None, :, :]  # Shape: [T, C, H, W]
    else:
        # Previously this fell through to an UnboundLocalError on ``images``.
        raise TypeError(f"sample_images expects a video path or a numpy array, got {type(input).__name__}")

    sampled_indices = sample_indices(len(frames_array))

    images = None
    for i, idx in enumerate(sampled_indices):
        img = auto_downsample_height_width(frames_array[idx])
        if images is None:
            # Allocate lazily because the downsampled shape is only known now. Keep the
            # source dtype: forcing uint8 truncated non-uint8 inputs such as float depth maps.
            images = np.empty((len(sampled_indices), *img.shape), dtype=img.dtype)
        images[i] = img
    return images
def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], features: dict) -> dict:
ep_stats = {}
for key, data in episode_data.items():
if features[key]["dtype"] == "string":
continue # HACK: we should receive np.arrays of strings
elif features[key]["dtype"] in ["image", "video"]:
ep_ft_array = sample_images(data)
axes_to_reduce = (0, 2, 3) # keep channel dim
keepdims = True
else:
ep_ft_array = data # data is already a np.ndarray
axes_to_reduce = 0 # compute stats over the first axis
keepdims = data.ndim == 1 # keep as np.array
ep_stats[key] = get_feature_stats(ep_ft_array, axis=axes_to_reduce, keepdims=keepdims)
if features[key]["dtype"] in ["image", "video"]:
value_norm = 1.0 if "depth" in key else 255.0
ep_stats[key] = {
k: v if k == "count" else np.squeeze(v / value_norm, axis=0) for k, v in ep_stats[key].items()
}
return ep_stats