mirror of
https://github.com/Tavish9/any4lerobot.git
synced 2026-05-22 17:39:39 +00:00
add support for agibot2lerobot (#15)
Co-authored-by: ModiShi <modishi@buaa.edu.cn> Co-authored-by: aopolin-lv <aopolin.ii@gmail.com> Co-authored-by: HaomingSong <haomingsong24@gmail.com>
This commit is contained in:
@@ -0,0 +1,94 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import h5py
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def get_task_instruction(task_json_path: str) -> dict:
|
||||
"""Get task language instruction"""
|
||||
with open(task_json_path, "r") as f:
|
||||
task_info = json.load(f)
|
||||
task_name = task_info[0]["task_name"]
|
||||
task_init_scene = task_info[0]["init_scene_text"]
|
||||
task_instruction = f"{task_name}.{task_init_scene}"
|
||||
return task_instruction
|
||||
|
||||
|
||||
def load_depths(root_dir: str, camera_name: str):
|
||||
cam_path = Path(root_dir)
|
||||
all_imgs = sorted(list(cam_path.glob(f"{camera_name}*")))
|
||||
return [np.array(Image.open(f)).astype(np.float32)[:, :, None] / 1000 for f in all_imgs]
|
||||
|
||||
|
||||
def load_local_dataset(
|
||||
episode_id: int, src_path: str, task_id: int, task_name: str, save_depth: bool, AgiBotWorld_CONFIG: dict
|
||||
) -> tuple[list, dict]:
|
||||
"""Load local dataset and return a dict with observations and actions"""
|
||||
ob_dir = Path(src_path) / f"observations/{task_id}/{episode_id}"
|
||||
proprio_dir = Path(src_path) / f"proprio_stats/{task_id}/{episode_id}"
|
||||
|
||||
state = {}
|
||||
action = {}
|
||||
with h5py.File(proprio_dir / "proprio_stats.h5", "r") as f:
|
||||
for key in AgiBotWorld_CONFIG["states"]:
|
||||
state[f"observation.states.{key}"] = np.array(f["state/" + key.replace(".", "/")], dtype=np.float32)
|
||||
for key in AgiBotWorld_CONFIG["actions"]:
|
||||
action[f"actions.{key}"] = np.array(f["action/" + key.replace(".", "/")], dtype=np.float32)
|
||||
|
||||
# HACK: agibot team forgot to pad some of the values
|
||||
num_frames = len(next(iter(state.values())))
|
||||
for action_key, action_value in action.items():
|
||||
if action_value.size and len(action_value) != num_frames:
|
||||
state_key = action_key.replace("actions", "state").replace(".", "/")
|
||||
new_action_value = np.array(f[state_key], dtype=np.float32).copy()
|
||||
action_index_key = "/".join(list(action_key.replace("actions", "action").split(".")[:-1]) + ["index"])
|
||||
action_index = np.array(f[action_index_key])
|
||||
# agibot lost end index, replace it with joint
|
||||
if not action_index.size:
|
||||
action_index_key = action_index_key.replace("end", "joint")
|
||||
action_index = np.array(f[action_index_key])
|
||||
new_action_value[action_index] = action_value
|
||||
action[action_key] = new_action_value
|
||||
|
||||
if save_depth:
|
||||
depth_imgs = load_depths(ob_dir / "depth", "head_depth")
|
||||
assert num_frames == len(depth_imgs), "Number of images and states are not equal"
|
||||
|
||||
state_key_prefix_len = len("observation.states.")
|
||||
action_key_prefix_len = len("actions.")
|
||||
frames = [
|
||||
{
|
||||
**({"observation.images.head_depth": depth_imgs[i]} if save_depth else {}),
|
||||
**{
|
||||
key: value[i]
|
||||
if value.size
|
||||
else np.zeros(
|
||||
AgiBotWorld_CONFIG["states"][key[state_key_prefix_len:]]["shape"],
|
||||
dtype=AgiBotWorld_CONFIG["states"][key[state_key_prefix_len:]]["dtype"],
|
||||
)
|
||||
for key, value in state.items()
|
||||
},
|
||||
**{
|
||||
key: value[i]
|
||||
if value.size
|
||||
else np.zeros(
|
||||
AgiBotWorld_CONFIG["actions"][key[action_key_prefix_len:]]["shape"],
|
||||
dtype=AgiBotWorld_CONFIG["actions"][key[action_key_prefix_len:]]["dtype"],
|
||||
)
|
||||
for key, value in action.items()
|
||||
},
|
||||
"task": task_name,
|
||||
}
|
||||
for i in range(num_frames)
|
||||
]
|
||||
|
||||
videos = {
|
||||
f"observation.images.{key}": ob_dir / "videos" / f"{key}_color.mp4"
|
||||
if "sensor" not in key
|
||||
else ob_dir / "tactile" / f"{key}.mp4" # HACK: handle tactile videos
|
||||
for key in AgiBotWorld_CONFIG["images"]
|
||||
if "depth" not in key
|
||||
}
|
||||
return frames, videos
|
||||
@@ -0,0 +1,310 @@
|
||||
AgiBotWorld_BETA_GRIPPER_CONFIG = {
|
||||
"images": {
|
||||
"head": {
|
||||
"dtype": "video",
|
||||
"shape": (480, 640, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"head_center_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"head_depth": {
|
||||
"dtype": "image",
|
||||
"shape": (480, 640, 1),
|
||||
"names": ["height", "width", "channel"],
|
||||
},
|
||||
"head_left_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"head_right_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"hand_left": {
|
||||
"dtype": "video",
|
||||
"shape": (480, 640, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"hand_right": {
|
||||
"dtype": "video",
|
||||
"shape": (480, 640, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"back_left_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"back_right_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
},
|
||||
"states": {
|
||||
"effector.position": {
|
||||
"dtype": "float32",
|
||||
"shape": (2,),
|
||||
"names": {"motors": ["left_gripper", "right_gripper"]},
|
||||
},
|
||||
"end.orientation": {"dtype": "float32", "shape": (2, 4), "names": {"motors": ["left_xyzw", "right_xyzw"]}},
|
||||
"end.position": {"dtype": "float32", "shape": (2, 3), "names": {"motors": ["left_xyz", "right_xyz"]}},
|
||||
"head.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["yaw", "patch"]}},
|
||||
"joint.current_value": {
|
||||
"dtype": "float32",
|
||||
"shape": (14,),
|
||||
"names": {
|
||||
"motors": [
|
||||
"left_arm_0",
|
||||
"left_arm_1",
|
||||
"left_arm_2",
|
||||
"left_arm_3",
|
||||
"left_arm_4",
|
||||
"left_arm_5",
|
||||
"left_arm_6",
|
||||
"right_arm_0",
|
||||
"right_arm_1",
|
||||
"right_arm_2",
|
||||
"right_arm_3",
|
||||
"right_arm_4",
|
||||
"right_arm_5",
|
||||
"right_arm_6",
|
||||
]
|
||||
},
|
||||
},
|
||||
"joint.position": {
|
||||
"dtype": "float32",
|
||||
"shape": (14,),
|
||||
"names": {
|
||||
"motors": [
|
||||
"left_arm_0",
|
||||
"left_arm_1",
|
||||
"left_arm_2",
|
||||
"left_arm_3",
|
||||
"left_arm_4",
|
||||
"left_arm_5",
|
||||
"left_arm_6",
|
||||
"right_arm_0",
|
||||
"right_arm_1",
|
||||
"right_arm_2",
|
||||
"right_arm_3",
|
||||
"right_arm_4",
|
||||
"right_arm_5",
|
||||
"right_arm_6",
|
||||
]
|
||||
},
|
||||
},
|
||||
"robot.orientation": {"dtype": "float32", "shape": (4,), "names": {"motors": ["x", "y", "z", "w"]}},
|
||||
"robot.position": {"dtype": "float32", "shape": (3,), "names": {"motors": ["x", "y", "z"]}},
|
||||
"waist.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["pitch", "lift"]}},
|
||||
},
|
||||
"actions": {
|
||||
"effector.position": {
|
||||
"dtype": "float32",
|
||||
"shape": (2,),
|
||||
"names": {"motors": ["left_gripper", "right_gripper"]},
|
||||
},
|
||||
"end.orientation": {"dtype": "float32", "shape": (2, 4), "names": {"motors": ["left_xyzw", "right_xyzw"]}},
|
||||
"end.position": {"dtype": "float32", "shape": (2, 3), "names": {"motors": ["left_xyz", "right_xyz"]}},
|
||||
"head.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["yaw", "patch"]}},
|
||||
"joint.position": {
|
||||
"dtype": "float32",
|
||||
"shape": (14,),
|
||||
"names": {
|
||||
"motors": [
|
||||
"left_arm_0",
|
||||
"left_arm_1",
|
||||
"left_arm_2",
|
||||
"left_arm_3",
|
||||
"left_arm_4",
|
||||
"left_arm_5",
|
||||
"left_arm_6",
|
||||
"right_arm_0",
|
||||
"right_arm_1",
|
||||
"right_arm_2",
|
||||
"right_arm_3",
|
||||
"right_arm_4",
|
||||
"right_arm_5",
|
||||
"right_arm_6",
|
||||
]
|
||||
},
|
||||
},
|
||||
"robot.velocity": {"dtype": "float32", "shape": (2,), "names": {"motors": ["x_vel", "yaw_vel"]}},
|
||||
"waist.position": {"dtype": "float32", "shape": (2,), "names": {"motors": ["pitch", "lift"]}},
|
||||
},
|
||||
}
|
||||
|
||||
AgiBotWorld_BETA_DEXHAND_CONFIG = {
|
||||
"images": {
|
||||
"head": {
|
||||
"dtype": "video",
|
||||
"shape": (480, 640, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"head_center_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"head_depth": {
|
||||
"dtype": "image",
|
||||
"shape": (480, 640, 1),
|
||||
"names": ["height", "width", "channel"],
|
||||
},
|
||||
"head_left_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"head_right_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"hand_left_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"hand_right_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"back_left_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"back_right_fisheye": {
|
||||
"dtype": "video",
|
||||
"shape": (748, 960, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
},
|
||||
"states": {
|
||||
**AgiBotWorld_BETA_GRIPPER_CONFIG["states"],
|
||||
"effector.position": {
|
||||
"dtype": "float32",
|
||||
"shape": (12,),
|
||||
"names": {
|
||||
"motors": [
|
||||
"left_joint_0",
|
||||
"left_joint_1",
|
||||
"left_joint_2",
|
||||
"left_joint_3",
|
||||
"left_joint_4",
|
||||
"left_joint_5",
|
||||
"right_joint_0",
|
||||
"right_joint_1",
|
||||
"right_joint_2",
|
||||
"right_joint_3",
|
||||
"right_joint_4",
|
||||
"right_joint_5",
|
||||
]
|
||||
},
|
||||
},
|
||||
},
|
||||
"actions": {
|
||||
**AgiBotWorld_BETA_GRIPPER_CONFIG["actions"],
|
||||
"effector.position": {
|
||||
"dtype": "float32",
|
||||
"shape": (12,),
|
||||
"names": {
|
||||
"motors": [
|
||||
"left_joint_0",
|
||||
"left_joint_1",
|
||||
"left_joint_2",
|
||||
"left_joint_3",
|
||||
"left_joint_4",
|
||||
"left_joint_5",
|
||||
"right_joint_0",
|
||||
"right_joint_1",
|
||||
"right_joint_2",
|
||||
"right_joint_3",
|
||||
"right_joint_4",
|
||||
"right_joint_5",
|
||||
]
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
AgiBotWorld_BETA_TACTILE_CONFIG = {
|
||||
**AgiBotWorld_BETA_GRIPPER_CONFIG,
|
||||
"images": {
|
||||
**AgiBotWorld_BETA_GRIPPER_CONFIG["images"],
|
||||
"left_sensor_1": {
|
||||
"dtype": "video",
|
||||
"shape": (700, 400, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"left_sensor_2": {
|
||||
"dtype": "video",
|
||||
"shape": (700, 400, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"right_sensor_1": {
|
||||
"dtype": "video",
|
||||
"shape": (700, 400, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
"right_sensor_2": {
|
||||
"dtype": "video",
|
||||
"shape": (700, 400, 3),
|
||||
"names": ["height", "width", "rgb"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Task statistics coming from https://docs.google.com/spreadsheets/d/1GWMFHYo3UJADS7kkScoJ5ObbQfAFasPuaeC7TJUr1Cc/edit?gid=0#gid=0
|
||||
AgiBotWorld_TASK_TYPE = {
|
||||
"gripper": {
|
||||
"task_config": AgiBotWorld_BETA_GRIPPER_CONFIG,
|
||||
"task_ids": [], # The remaining are all gripper
|
||||
},
|
||||
"dexhand": {
|
||||
"task_config": AgiBotWorld_BETA_DEXHAND_CONFIG,
|
||||
"task_ids": [
|
||||
"task_475",
|
||||
"task_536",
|
||||
"task_547",
|
||||
"task_548",
|
||||
"task_549",
|
||||
"task_554",
|
||||
"task_577",
|
||||
"task_578",
|
||||
"task_591",
|
||||
"task_595",
|
||||
"task_608",
|
||||
"task_620",
|
||||
"task_622",
|
||||
"task_660",
|
||||
"task_679",
|
||||
"task_705",
|
||||
"task_710",
|
||||
"task_727",
|
||||
"task_730",
|
||||
"task_731",
|
||||
"task_749",
|
||||
"task_753",
|
||||
],
|
||||
},
|
||||
"tactile": {
|
||||
"task_config": AgiBotWorld_BETA_TACTILE_CONFIG,
|
||||
"task_ids": [
|
||||
"task_666",
|
||||
"task_675",
|
||||
"task_676",
|
||||
"task_677",
|
||||
"task_694",
|
||||
"task_737",
|
||||
"task_774",
|
||||
],
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchvision
|
||||
from lerobot.common.datasets.compute_stats import auto_downsample_height_width, get_feature_stats, sample_indices
|
||||
|
||||
torchvision.set_video_backend("pyav")
|
||||
|
||||
|
||||
def generate_features_from_config(AgiBotWorld_CONFIG):
|
||||
features = {}
|
||||
for key, value in AgiBotWorld_CONFIG["images"].items():
|
||||
features[f"observation.images.{key}"] = value
|
||||
for key, value in AgiBotWorld_CONFIG["states"].items():
|
||||
features[f"observation.states.{key}"] = value
|
||||
for key, value in AgiBotWorld_CONFIG["actions"].items():
|
||||
features[f"actions.{key}"] = value
|
||||
return features
|
||||
|
||||
|
||||
def sample_images(input):
|
||||
if type(input) is str:
|
||||
video_path = input
|
||||
reader = torchvision.io.VideoReader(video_path, stream="video")
|
||||
frames = [frame["data"] for frame in reader]
|
||||
frames_array = torch.stack(frames).numpy() # Shape: [T, C, H, W]
|
||||
|
||||
sampled_indices = sample_indices(len(frames_array))
|
||||
images = None
|
||||
for i, idx in enumerate(sampled_indices):
|
||||
img = frames_array[idx]
|
||||
img = auto_downsample_height_width(img)
|
||||
|
||||
if images is None:
|
||||
images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8)
|
||||
|
||||
images[i] = img
|
||||
elif type(input) is np.ndarray:
|
||||
frames_array = input[:, None, :, :] # Shape: [T, C, H, W]
|
||||
sampled_indices = sample_indices(len(frames_array))
|
||||
images = None
|
||||
for i, idx in enumerate(sampled_indices):
|
||||
img = frames_array[idx]
|
||||
img = auto_downsample_height_width(img)
|
||||
|
||||
if images is None:
|
||||
images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8)
|
||||
|
||||
images[i] = img
|
||||
|
||||
return images
|
||||
|
||||
|
||||
def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], features: dict) -> dict:
|
||||
ep_stats = {}
|
||||
for key, data in episode_data.items():
|
||||
if features[key]["dtype"] == "string":
|
||||
continue # HACK: we should receive np.arrays of strings
|
||||
elif features[key]["dtype"] in ["image", "video"]:
|
||||
ep_ft_array = sample_images(data)
|
||||
axes_to_reduce = (0, 2, 3) # keep channel dim
|
||||
keepdims = True
|
||||
else:
|
||||
ep_ft_array = data # data is already a np.ndarray
|
||||
axes_to_reduce = 0 # compute stats over the first axis
|
||||
keepdims = data.ndim == 1 # keep as np.array
|
||||
|
||||
ep_stats[key] = get_feature_stats(ep_ft_array, axis=axes_to_reduce, keepdims=keepdims)
|
||||
|
||||
if features[key]["dtype"] in ["image", "video"]:
|
||||
value_norm = 1.0 if "depth" in key else 255.0
|
||||
ep_stats[key] = {
|
||||
k: v if k == "count" else np.squeeze(v / value_norm, axis=0) for k, v in ep_stats[key].items()
|
||||
}
|
||||
|
||||
return ep_stats
|
||||
Reference in New Issue
Block a user