add support for agibot2lerobot (#15)

Co-authored-by: ModiShi <modishi@buaa.edu.cn> Co-authored-by: aopolin-lv <aopolin.ii@gmail.com> Co-authored-by: HaomingSong <haomingsong24@gmail.com>
2026-05-22 17:39:39 +00:00 · 2025-04-14 20:01:09 +08:00
parent 9ca6ce773b
commit fe558f7adb
8 changed files with 1038 additions and 1 deletions
@@ -0,0 +1,94 @@
+import json
+from pathlib import Path
+
+import h5py
+import numpy as np
+from PIL import Image
+
+
+def get_task_instruction(task_json_path: str) -> dict:
+    """Get task language instruction"""
+    with open(task_json_path, "r") as f:
+        task_info = json.load(f)
+    task_name = task_info[0]["task_name"]
+    task_init_scene = task_info[0]["init_scene_text"]
+    task_instruction = f"{task_name}.{task_init_scene}"
+    return task_instruction
+
+
+def load_depths(root_dir: str, camera_name: str):
+    cam_path = Path(root_dir)
+    all_imgs = sorted(list(cam_path.glob(f"{camera_name}*")))
+    return [np.array(Image.open(f)).astype(np.float32)[:, :, None] / 1000 for f in all_imgs]
+
+
+def load_local_dataset(
+    episode_id: int, src_path: str, task_id: int, task_name: str, save_depth: bool, AgiBotWorld_CONFIG: dict
+) -> tuple[list, dict]:
+    """Load local dataset and return a dict with observations and actions"""
+    ob_dir = Path(src_path) / f"observations/{task_id}/{episode_id}"
+    proprio_dir = Path(src_path) / f"proprio_stats/{task_id}/{episode_id}"
+
+    state = {}
+    action = {}
+    with h5py.File(proprio_dir / "proprio_stats.h5", "r") as f:
+        for key in AgiBotWorld_CONFIG["states"]:
+            state[f"observation.states.{key}"] = np.array(f["state/" + key.replace(".", "/")], dtype=np.float32)
+        for key in AgiBotWorld_CONFIG["actions"]:
+            action[f"actions.{key}"] = np.array(f["action/" + key.replace(".", "/")], dtype=np.float32)
+
+        # HACK: agibot team forgot to pad some of the values
+        num_frames = len(next(iter(state.values())))
+        for action_key, action_value in action.items():
+            if action_value.size and len(action_value) != num_frames:
+                state_key = action_key.replace("actions", "state").replace(".", "/")
+                new_action_value = np.array(f[state_key], dtype=np.float32).copy()
+                action_index_key = "/".join(list(action_key.replace("actions", "action").split(".")[:-1]) + ["index"])
+                action_index = np.array(f[action_index_key])
+                # agibot lost end index, replace it with joint
+                if not action_index.size:
+                    action_index_key = action_index_key.replace("end", "joint")
+                    action_index = np.array(f[action_index_key])
+                new_action_value[action_index] = action_value
+                action[action_key] = new_action_value
+
+    if save_depth:
+        depth_imgs = load_depths(ob_dir / "depth", "head_depth")
+        assert num_frames == len(depth_imgs), "Number of images and states are not equal"
+
+    state_key_prefix_len = len("observation.states.")
+    action_key_prefix_len = len("actions.")
+    frames = [
+        {
+            **({"observation.images.head_depth": depth_imgs[i]} if save_depth else {}),
+            **{
+                key: value[i]
+                if value.size
+                else np.zeros(
+                    AgiBotWorld_CONFIG["states"][key[state_key_prefix_len:]]["shape"],
+                    dtype=AgiBotWorld_CONFIG["states"][key[state_key_prefix_len:]]["dtype"],
+                )
+                for key, value in state.items()
+            },
+            **{
+                key: value[i]
+                if value.size
+                else np.zeros(
+                    AgiBotWorld_CONFIG["actions"][key[action_key_prefix_len:]]["shape"],
+                    dtype=AgiBotWorld_CONFIG["actions"][key[action_key_prefix_len:]]["dtype"],
+                )
+                for key, value in action.items()
+            },
+            "task": task_name,
+        }
+        for i in range(num_frames)
+    ]
+
+    videos = {
+        f"observation.images.{key}": ob_dir / "videos" / f"{key}_color.mp4"
+        if "sensor" not in key
+        else ob_dir / "tactile" / f"{key}.mp4"  # HACK: handle tactile videos
+        for key in AgiBotWorld_CONFIG["images"]
+        if "depth" not in key
+    }
+    return frames, videos
@@ -0,0 +1,310 @@
+AgiBotWorld_BETA_GRIPPER_CONFIG = {
+    "images": {
+        "head": {
+            "dtype": "video",
+            "shape": (480, 640, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "head_center_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "head_depth": {
+            "dtype": "image",
+            "shape": (480, 640, 1),
+            "names": ["height", "width", "channel"],
+        },
+        "head_left_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "head_right_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "hand_left": {
+            "dtype": "video",
+            "shape": (480, 640, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "hand_right": {
+            "dtype": "video",
+            "shape": (480, 640, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "back_left_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "back_right_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+    },
+    "states": {
+        "effector.position": {
+            "dtype": "float32",
+            "shape": (2,),
+            "names": {"motors": ["left_gripper", "right_gripper"]},
+        },
+        "end.orientation": {"dtype": "float32", "shape": (2, 4), "names": {"motors": ["left_xyzw", "right_xyzw"]}},
+        "end.position": {"dtype": "float32", "shape": (2, 3), "names": {"motors": ["left_xyz", "right_xyz"]}},
+        "head.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["yaw", "patch"]}},
+        "joint.current_value": {
+            "dtype": "float32",
+            "shape": (14,),
+            "names": {
+                "motors": [
+                    "left_arm_0",
+                    "left_arm_1",
+                    "left_arm_2",
+                    "left_arm_3",
+                    "left_arm_4",
+                    "left_arm_5",
+                    "left_arm_6",
+                    "right_arm_0",
+                    "right_arm_1",
+                    "right_arm_2",
+                    "right_arm_3",
+                    "right_arm_4",
+                    "right_arm_5",
+                    "right_arm_6",
+                ]
+            },
+        },
+        "joint.position": {
+            "dtype": "float32",
+            "shape": (14,),
+            "names": {
+                "motors": [
+                    "left_arm_0",
+                    "left_arm_1",
+                    "left_arm_2",
+                    "left_arm_3",
+                    "left_arm_4",
+                    "left_arm_5",
+                    "left_arm_6",
+                    "right_arm_0",
+                    "right_arm_1",
+                    "right_arm_2",
+                    "right_arm_3",
+                    "right_arm_4",
+                    "right_arm_5",
+                    "right_arm_6",
+                ]
+            },
+        },
+        "robot.orientation": {"dtype": "float32", "shape": (4,), "names": {"motors": ["x", "y", "z", "w"]}},
+        "robot.position": {"dtype": "float32", "shape": (3,), "names": {"motors": ["x", "y", "z"]}},
+        "waist.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["pitch", "lift"]}},
+    },
+    "actions": {
+        "effector.position": {
+            "dtype": "float32",
+            "shape": (2,),
+            "names": {"motors": ["left_gripper", "right_gripper"]},
+        },
+        "end.orientation": {"dtype": "float32", "shape": (2, 4), "names": {"motors": ["left_xyzw", "right_xyzw"]}},
+        "end.position": {"dtype": "float32", "shape": (2, 3), "names": {"motors": ["left_xyz", "right_xyz"]}},
+        "head.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["yaw", "patch"]}},
+        "joint.position": {
+            "dtype": "float32",
+            "shape": (14,),
+            "names": {
+                "motors": [
+                    "left_arm_0",
+                    "left_arm_1",
+                    "left_arm_2",
+                    "left_arm_3",
+                    "left_arm_4",
+                    "left_arm_5",
+                    "left_arm_6",
+                    "right_arm_0",
+                    "right_arm_1",
+                    "right_arm_2",
+                    "right_arm_3",
+                    "right_arm_4",
+                    "right_arm_5",
+                    "right_arm_6",
+                ]
+            },
+        },
+        "robot.velocity": {"dtype": "float32", "shape": (2,), "names": {"motors": ["x_vel", "yaw_vel"]}},
+        "waist.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["pitch", "lift"]}},
+    },
+}
+
+AgiBotWorld_BETA_DEXHAND_CONFIG = {
+    "images": {
+        "head": {
+            "dtype": "video",
+            "shape": (480, 640, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "head_center_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "head_depth": {
+            "dtype": "image",
+            "shape": (480, 640, 1),
+            "names": ["height", "width", "channel"],
+        },
+        "head_left_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "head_right_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "hand_left_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "hand_right_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "back_left_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "back_right_fisheye": {
+            "dtype": "video",
+            "shape": (748, 960, 3),
+            "names": ["height", "width", "rgb"],
+        },
+    },
+    "states": {
+        **AgiBotWorld_BETA_GRIPPER_CONFIG["states"],
+        "effector.position": {
+            "dtype": "float32",
+            "shape": (12,),
+            "names": {
+                "motors": [
+                    "left_joint_0",
+                    "left_joint_1",
+                    "left_joint_2",
+                    "left_joint_3",
+                    "left_joint_4",
+                    "left_joint_5",
+                    "right_joint_0",
+                    "right_joint_1",
+                    "right_joint_2",
+                    "right_joint_3",
+                    "right_joint_4",
+                    "right_joint_5",
+                ]
+            },
+        },
+    },
+    "actions": {
+        **AgiBotWorld_BETA_GRIPPER_CONFIG["actions"],
+        "effector.position": {
+            "dtype": "float32",
+            "shape": (12,),
+            "names": {
+                "motors": [
+                    "left_joint_0",
+                    "left_joint_1",
+                    "left_joint_2",
+                    "left_joint_3",
+                    "left_joint_4",
+                    "left_joint_5",
+                    "right_joint_0",
+                    "right_joint_1",
+                    "right_joint_2",
+                    "right_joint_3",
+                    "right_joint_4",
+                    "right_joint_5",
+                ]
+            },
+        },
+    },
+}
+
+AgiBotWorld_BETA_TACTILE_CONFIG = {
+    **AgiBotWorld_BETA_GRIPPER_CONFIG,
+    "images": {
+        **AgiBotWorld_BETA_GRIPPER_CONFIG["images"],
+        "left_sensor_1": {
+            "dtype": "video",
+            "shape": (700, 400, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "left_sensor_2": {
+            "dtype": "video",
+            "shape": (700, 400, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "right_sensor_1": {
+            "dtype": "video",
+            "shape": (700, 400, 3),
+            "names": ["height", "width", "rgb"],
+        },
+        "right_sensor_2": {
+            "dtype": "video",
+            "shape": (700, 400, 3),
+            "names": ["height", "width", "rgb"],
+        },
+    },
+}
+
+# Task statistics coming from https://docs.google.com/spreadsheets/d/1GWMFHYo3UJADS7kkScoJ5ObbQfAFasPuaeC7TJUr1Cc/edit?gid=0#gid=0
+AgiBotWorld_TASK_TYPE = {
+    "gripper": {
+        "task_config": AgiBotWorld_BETA_GRIPPER_CONFIG,
+        "task_ids": [],  # The remaining are all gripper
+    },
+    "dexhand": {
+        "task_config": AgiBotWorld_BETA_DEXHAND_CONFIG,
+        "task_ids": [
+            "task_475",
+            "task_536",
+            "task_547",
+            "task_548",
+            "task_549",
+            "task_554",
+            "task_577",
+            "task_578",
+            "task_591",
+            "task_595",
+            "task_608",
+            "task_620",
+            "task_622",
+            "task_660",
+            "task_679",
+            "task_705",
+            "task_710",
+            "task_727",
+            "task_730",
+            "task_731",
+            "task_749",
+            "task_753",
+        ],
+    },
+    "tactile": {
+        "task_config": AgiBotWorld_BETA_TACTILE_CONFIG,
+        "task_ids": [
+            "task_666",
+            "task_675",
+            "task_676",
+            "task_677",
+            "task_694",
+            "task_737",
+            "task_774",
+        ],
+    },
+}
@@ -0,0 +1,75 @@
+import numpy as np
+import torch
+import torchvision
+from lerobot.common.datasets.compute_stats import auto_downsample_height_width, get_feature_stats, sample_indices
+
+torchvision.set_video_backend("pyav")
+
+
+def generate_features_from_config(AgiBotWorld_CONFIG):
+    features = {}
+    for key, value in AgiBotWorld_CONFIG["images"].items():
+        features[f"observation.images.{key}"] = value
+    for key, value in AgiBotWorld_CONFIG["states"].items():
+        features[f"observation.states.{key}"] = value
+    for key, value in AgiBotWorld_CONFIG["actions"].items():
+        features[f"actions.{key}"] = value
+    return features
+
+
+def sample_images(input):
+    if type(input) is str:
+        video_path = input
+        reader = torchvision.io.VideoReader(video_path, stream="video")
+        frames = [frame["data"] for frame in reader]
+        frames_array = torch.stack(frames).numpy()  # Shape: [T, C, H, W]
+
+        sampled_indices = sample_indices(len(frames_array))
+        images = None
+        for i, idx in enumerate(sampled_indices):
+            img = frames_array[idx]
+            img = auto_downsample_height_width(img)
+
+            if images is None:
+                images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8)
+
+            images[i] = img
+    elif type(input) is np.ndarray:
+        frames_array = input[:, None, :, :]  # Shape: [T, C, H, W]
+        sampled_indices = sample_indices(len(frames_array))
+        images = None
+        for i, idx in enumerate(sampled_indices):
+            img = frames_array[idx]
+            img = auto_downsample_height_width(img)
+
+            if images is None:
+                images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8)
+
+            images[i] = img
+
+    return images
+
+
+def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], features: dict) -> dict:
+    ep_stats = {}
+    for key, data in episode_data.items():
+        if features[key]["dtype"] == "string":
+            continue  # HACK: we should receive np.arrays of strings
+        elif features[key]["dtype"] in ["image", "video"]:
+            ep_ft_array = sample_images(data)
+            axes_to_reduce = (0, 2, 3)  # keep channel dim
+            keepdims = True
+        else:
+            ep_ft_array = data  # data is already a np.ndarray
+            axes_to_reduce = 0  # compute stats over the first axis
+            keepdims = data.ndim == 1  # keep as np.array
+
+        ep_stats[key] = get_feature_stats(ep_ft_array, axis=axes_to_reduce, keepdims=keepdims)
+
+        if features[key]["dtype"] in ["image", "video"]:
+            value_norm = 1.0 if "depth" in key else 255.0
+            ep_stats[key] = {
+                k: v if k == "count" else np.squeeze(v / value_norm, axis=0) for k, v in ep_stats[key].items()
+            }
+
+    return ep_stats