add support for robocasa2lerobot (#86)

* Support robocasa2lerobot * Support robocasa2lerobot * NIT: formatting * update to latest lerobot * update readme * Apply suggestion from @gemini-code-assist[bot] Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * fix h5py open --------- Co-authored-by: Tavish <tavish9.chen@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-07-25 04:55:58 +00:00 · 2026-03-21 14:55:33 +07:00
parent e97ca6531e
commit ef184e44be
7 changed files with 683 additions and 1 deletions
@@ -0,0 +1,78 @@
+import numpy as np
+import robosuite.utils.transform_utils as T
+
+
+def get_camera_intrinsic_matrix(sim, camera_name, camera_height, camera_width):
+    """
+    Obtains camera intrinsic matrix.
+
+    Args:
+        sim (MjSim): simulator instance
+        camera_name (str): name of camera
+        camera_height (int): height of camera images in pixels
+        camera_width (int): width of camera images in pixels
+    Return:
+        K (np.array): 3x3 camera matrix
+    """
+    cam_id = sim.model.camera_name2id(camera_name)
+    fovy = sim.model.cam_fovy[cam_id]
+    f = 0.5 * camera_height / np.tan(fovy * np.pi / 360)
+    K = np.array([[f, 0, camera_width / 2], [0, f, camera_height / 2], [0, 0, 1]])
+    return K
+
+
+def get_camera_extrinsic_matrix(sim, camera_name):
+    """
+    Returns a 4x4 homogenous matrix corresponding to the camera pose in the
+    world frame. MuJoCo has a weird convention for how it sets up the
+    camera body axis, so we also apply a correction so that the x and y
+    axis are along the camera view and the z axis points along the
+    viewpoint.
+    Normal camera convention: https://docs.opencv.org/2.4/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.html
+
+    Args:
+        sim (MjSim): simulator instance
+        camera_name (str): name of camera
+    Return:
+        R (np.array): 4x4 camera extrinsic matrix
+    """
+    cam_id = sim.model.camera_name2id(camera_name)
+    camera_pos = sim.data.cam_xpos[cam_id]
+    camera_rot = sim.data.cam_xmat[cam_id].reshape(3, 3)
+    R = T.make_pose(camera_pos, camera_rot)
+
+    # IMPORTANT! This is a correction so that the camera axis is set up along the viewpoint correctly.
+    camera_axis_correction = np.array(
+        [[1.0, 0.0, 0.0, 0.0], [0.0, -1.0, 0.0, 0.0], [0.0, 0.0, -1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+    )
+    R = R @ camera_axis_correction
+    return R
+
+def get_camera_extrinsic_matrix_rel(sim, camera_name):
+    """
+    Returns a 4x4 homogenous matrix corresponding to the camera pose in the
+    world frame. MuJoCo has a weird convention for how it sets up the
+    camera body axis, so we also apply a correction so that the x and y
+    axis are along the camera view and the z axis points along the
+    viewpoint.
+    Normal camera convention: https://docs.opencv.org/2.4/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.html
+
+    Args:
+        sim (MjSim): simulator instance
+        camera_name (str): name of camera
+    Return:
+        R (np.array): 4x4 camera extrinsic matrix
+    """
+    cam_id = sim.model.camera_name2id(camera_name)
+    camera_pos = sim.model.cam_pos[cam_id]
+    camera_quat = sim.model.cam_quat[cam_id]
+    camera_rot = T.quat2mat(camera_quat)
+    R = T.make_pose(camera_pos, camera_rot)
+
+    # IMPORTANT! This is a correction so that the camera axis is set up along the viewpoint correctly.
+    camera_axis_correction = np.array(
+        [[1.0, 0.0, 0.0, 0.0], [0.0, -1.0, 0.0, 0.0], [0.0, 0.0, -1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+    )
+    R = R @ camera_axis_correction
+    return R
+ 
@@ -0,0 +1,70 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "44b6da09",
+   "metadata": {},
+   "source": [
+    "# Extract subset data \n",
+    "\n",
+    "Original hdf5 file contains about 3000 episodes. However, it contains a key \"masks\", which contain list of subset demo_ids. For example: 30_demos : [demo123, demo234, demo 345, etc.]\n",
+    "\n",
+    "Run the code bellow to extract only chosen subset demos, which is much smaller and easier for later process."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ac64550",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import h5py\n",
+    "\n",
+    "DATA_DIR=\"direction/to/your/hdf5/files/\"\n",
+    "# E.x: DATA_DIR=\"/projects/extern/kisski/kisski-spath/dir.project/VLA_3D/binh/robocasa/test\"\n",
+    "\n",
+    "# file_name = \"PnPCabToCounter.hdf5\"\n",
+    "# file_name = \"PnPCounterToCab.hdf5\"\n",
+    "# file_name = \"CoffeeSetupMug.hdf5\"\n",
+    "# file_name = \"TurnOnMicrowave.hdf5\"\n",
+    "file_name = \"TurnOffStove.hdf5\"\n",
+    "\n",
+    "file_path = DATA_DIR + \"/\" + file_name\n",
+    "\n",
+    "f = h5py.File(file_path, 'r')\n",
+    "chosen_demo_list = []\n",
+    "for i in f['mask']['100_demos'][:]: # or \"30_demos\"\n",
+    "    chosen_demo_list.append(i.decode('utf-8'))\n",
+    "    \n",
+    "chosen_data = []\n",
+    "for k in f['data'].keys():\n",
+    "    if k in chosen_demo_list:\n",
+    "        chosen_data.append(f['data'][k])\n",
+    "        \n",
+    "with h5py.File(f\"direction_to_your_new_extracted_subset/{file_name}\", \"w\") as out:\n",
+    "    out_data = out.create_group(\"data\")\n",
+    "    \n",
+    "    for key, val in f['data'].attrs.items():\n",
+    "            out_data.attrs[key] = val # IMPORTANT: set attributes for new hdf5 files (need for reset env and later re-render)\n",
+    "\n",
+    "    for grp in chosen_data:\n",
+    "        name = grp.name.split(\"/\")[-1]   # demo_xxx\n",
+    "        grp.file.copy(grp, out_data, name=name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "robocasa",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,277 @@
+import json
+import os
+
+import h5py
+import numpy as np
+import robosuite
+from robocasa.scripts.playback_dataset import reset_to
+from robosuite.utils.camera_utils import (
+    get_camera_extrinsic_matrix,
+    get_camera_extrinsic_matrix_rel,
+    get_camera_intrinsic_matrix,
+)
+from tqdm import tqdm
+
+ROBOCASA_DUMMY_ACTION = [0.0] * 6 + [-1.0] + [0.0] * 4 + [-1.0]
+
+
+def get_camera_info(sim, camera_name, camera_height, camera_width):
+    camera_intrinsics = get_camera_intrinsic_matrix(sim, camera_name, camera_height, camera_width)
+    camera_extrinsics = get_camera_extrinsic_matrix(sim, camera_name)
+
+    return camera_intrinsics, camera_extrinsics
+
+
+def creat_env_from_hdf5(f):
+    env_meta = json.loads(f["data"].attrs["env_args"])
+    env_meta["env_kwargs"]["camera_depths"] = True
+    env_meta["env_kwargs"]["camera_heights"] = 256
+    env_meta["env_kwargs"]["camera_widths"] = 256
+    env_meta["env_kwargs"]["camera_segmentations"] = "element"  # element' #'instance'
+    # f.close()
+
+    env_kwargs = env_meta["env_kwargs"]
+    env_kwargs["env_name"] = env_meta["env_name"]
+    env_kwargs["has_renderer"] = False
+    env_kwargs["renderer"] = "mjviewer"
+    env_kwargs["has_offscreen_renderer"] = True  # write_video
+    env_kwargs["use_camera_obs"] = True
+    env_kwargs["ignore_done"] = False
+
+    env = robosuite.make(**env_kwargs)
+
+    return env, env_meta
+
+
+def reset_each_demo(env, demo):
+    # demo = f["data"]["demo_<idx>"]
+    model_xml = demo.attrs["model_file"]
+    init_state = demo["states"][()][0]
+    ep_meta = demo.attrs["ep_meta"]
+
+    state = {"states": init_state, "model": model_xml, "ep_meta": ep_meta}
+    reset_to(env, state)
+
+
+def process_1_demo(env, f, demo_id, grp):
+    demo = f["data"][demo_id]
+    reset_each_demo(env, demo)
+
+    ep_meta = env.get_ep_meta()
+    model_file = env.model.get_xml()
+
+    for _ in range(10):
+        obs, reward, done, info = env.step(ROBOCASA_DUMMY_ACTION)
+
+    obs_keys = list(obs.keys())
+    obs_keys += [
+        "robot0_agentview_left_intrinsics",
+        "robot0_agentview_right_intrinsics",
+        "robot0_eye_in_hand_intrinsics",
+    ]
+    obs_keys += [
+        "robot0_agentview_left_extrinsics",
+        "robot0_agentview_right_extrinsics",
+        "robot0_eye_in_hand_extrinsics",
+    ]
+    obs_keys += [
+        "robot0_agentview_left_extrinsicsR",
+        "robot0_agentview_right_extrinsicsR",
+        "robot0_eye_in_hand_extrinsicsR",
+    ]
+    obs_keys += ["robot0_agentview_left_depthW", "robot0_agentview_right_depthW", "robot0_eye_in_hand_depthW"]
+
+    obs_dict = {key: [] for key in obs_keys}
+    # action_dict = {key: [] for key in act_keys}
+    actions = []
+    actions_abs = []
+    rewards = []
+    dones = []
+    states = []  # env state, not robot. The state for robot is included in obs
+
+    # for key in obs_keys:
+    #     obs_dict[key] = obs[key]
+    orig_actions = demo["actions"][()]
+    orig_actions_abs = demo["actions_abs"][()]
+    # orig_action_dict = demo['action_dict']
+
+    for i, action in enumerate(orig_actions):
+        # for i, action in enumerate(orig_actions_abs):
+        extent = env.sim.model.stat.extent
+        far = env.sim.model.vis.map.zfar * extent
+        near = env.sim.model.vis.map.znear * extent
+        left_depth = obs["robot0_agentview_left_depth"].copy()
+        right_depth = obs["robot0_agentview_right_depth"].copy()
+        wrist_depth = obs["robot0_eye_in_hand_depth"].copy()
+        left_depth = near / (1.0 - left_depth * (1.0 - near / far))[::-1]
+        right_depth = near / (1.0 - right_depth * (1.0 - near / far))[::-1]
+        wrist_depth = near / (1.0 - wrist_depth * (1.0 - near / far))[::-1]
+
+        obs["robot0_agentview_left_depthW"] = left_depth
+        obs["robot0_agentview_right_depthW"] = right_depth
+        obs["robot0_eye_in_hand_depthW"] = wrist_depth
+
+        left_intrinsics, left_extrinsics = get_camera_info(env.sim, "robot0_agentview_left", 256, 256)
+        right_intrinsics, right_extrinsics = get_camera_info(env.sim, "robot0_agentview_right", 256, 256)
+        wrist_intrinsics, wrist_extrinsics = get_camera_info(env.sim, "robot0_eye_in_hand", 256, 256)
+
+        obs["robot0_agentview_left_intrinsics"] = left_intrinsics
+        obs["robot0_agentview_right_intrinsics"] = right_intrinsics
+        obs["robot0_eye_in_hand_intrinsics"] = wrist_intrinsics
+        obs["robot0_agentview_left_extrinsics"] = left_extrinsics
+        obs["robot0_agentview_right_extrinsics"] = right_extrinsics
+        obs["robot0_eye_in_hand_extrinsics"] = wrist_extrinsics
+
+        left_intrinsics_rel = get_camera_extrinsic_matrix_rel(env.sim, "robot0_agentview_left")
+        right_intrinsics_rel = get_camera_extrinsic_matrix_rel(env.sim, "robot0_agentview_right")
+        wrist_intrinsics_rel = get_camera_extrinsic_matrix_rel(env.sim, "robot0_eye_in_hand")
+
+        obs["robot0_agentview_left_extrinsicsR"] = left_intrinsics_rel
+        obs["robot0_agentview_right_extrinsicsR"] = right_intrinsics_rel
+        obs["robot0_eye_in_hand_extrinsicsR"] = wrist_intrinsics_rel
+
+        # append all keys
+        for key in obs_keys:
+            if (
+                ("eye_in_hand" in key or "agentview" in key)
+                and "depthW" not in key
+                and "intrinsics" not in key
+                and "extrinsics" not in key
+            ):
+                obs_dict[key].append(obs[key][::-1, :, :])
+            else:
+                obs_dict[key].append(obs[key])
+
+        # for key in act_keys:
+        #     action_dict[key].append(orig_action_dict[key][i])
+
+        actions.append(action)
+        actions_abs.append(orig_actions_abs[i])
+
+        rewards.append(reward)
+        dones.append(done)
+
+        current_state = env.sim.get_state().flatten()
+        states.append(current_state)
+
+        # step env
+        obs, reward, done, info = env.step(action.tolist())
+
+        done = done or env._check_success()
+        # if done:
+        # print(f" Step {i} done: {done}")
+        # print(f" Step {i} info: {info}")
+        # print(f" Step {i} is_success: {env._check_success()}" )
+
+    # save successful episode only
+    if done:
+        print(f"Demo {demo_id} is done after {i} actions! -> SAVE!!!")
+
+        # save to new hdf5 file here
+        ep_data = grp.create_group(demo_id)
+        # set attribute for ep_data here ...
+        ep_data.attrs["model_file"] = model_file
+        ep_data.attrs["ep_meta"] = json.dumps(ep_meta, indent=4)
+
+        # obs group
+        obs_grp = ep_data.create_group("obs")
+        for key in obs_keys:
+            obs_grp.create_dataset(key, data=np.stack(obs_dict[key], axis=0))
+
+        # actions dataset
+        ep_data.create_dataset("actions", data=np.stack(actions, axis=0))
+
+        ep_data.create_dataset("actions_abs", data=np.stack(actions_abs, axis=0))
+
+        ep_data.create_dataset("dones", data=np.stack(dones, axis=0))
+
+        ep_data.create_dataset("rewards", data=np.stack(rewards, axis=0))
+
+        # state dataset
+        ep_data.create_dataset("states", data=np.stack(states, axis=0))
+
+    elif not done:
+        print(f"Demo {demo_id} not done after all actions executed! -> does not SAVE!")
+
+
+def regenerate_hdf5_dataset(input_path, output_path, debug=False):
+    f = h5py.File(input_path, "r")
+    env, env_meta = creat_env_from_hdf5(f)
+
+    out_f = h5py.File(output_path, "w")
+    out_f.attrs["env_args"] = json.dumps(env_meta)
+
+    grp = out_f.create_group("data")
+
+    all_demo_ids = list(f["data"].keys())
+    if debug:
+        all_demo_ids = all_demo_ids[: min(2, len(all_demo_ids))]
+    for demo_id in tqdm(all_demo_ids):
+        print(f"Processing {demo_id} ...")
+        process_1_demo(env, f, demo_id, grp)
+
+    f.close()
+    if len(out_f["data"].keys()) == 0:
+        print("No demos were processed successfully. Deleting output file.")
+        out_f.close()
+        os.remove(output_path)
+    else:
+        print(f"Processed data saved {len(out_f['data'].keys())} demos to {output_path}")
+        out_f.close()
+
+
+def process_task_wrapper(args):
+    """Wrapper function for multiprocessing to process a single task."""
+    task, origin_dir, regenerate_dir, debug = args
+    input_path = os.path.join(origin_dir, f"{task}.hdf5")
+    output_path = os.path.join(regenerate_dir, f"{task}.hdf5")
+
+    print(f"Regenerating dataset for task {task} ...")
+    regenerate_hdf5_dataset(input_path, output_path, debug=debug)
+    print(f"Completed task {task}")
+
+
+if __name__ == "__main__":
+    n_demo = 100  # 100
+    origin_dir = f"<directory/contain/original/hdf5/files/>"
+    regenerate_dir = f"<directory/contain/regenerated/hdf5/files/>"
+    os.makedirs(regenerate_dir, exist_ok=True)
+
+    task_list = [
+        "PnPCabToCounter",
+        "PnPCounterToCab",
+        "CoffeeSetupMug",
+        "TurnOffStove",
+        "TurnOnMicrowave",
+        # ... add other tasks as needed
+        # "CoffeePressButton",
+        # "CoffeeServeMug",
+        # "TurnOffMicrowave",
+        # "TurnOffSinkFaucet",
+        # "TurnOnSinkFaucet",
+        # "TurnOnStove",
+        # "TurnSinkSpout"
+        # "CloseDoubleDoor",
+        # "CloseDrawer",
+        # "CloseSingleDoor",
+        # "OpenDoubleDoor",
+        # "OpenDrawer",
+        # "OpenSingleDoor"
+        # "PnPCounterToMicrowave",
+        # "PnPCounterToSink",
+        # "PnPCounterToStove",
+        # "PnPMicrowaveToCounter",
+        # "PnPSinkToCounter",
+        # "PnPStoveToCounter"
+    ]  # 24 tasks in robocasa kitchen dataset
+
+    debug = False
+    if debug:
+        task_list = task_list[:2]
+
+    for task in task_list:
+        input_path = os.path.join(origin_dir, f"{task}.hdf5")
+        output_path = os.path.join(regenerate_dir, f"{task}.hdf5")
+
+        print(f"Regenerating dataset for task {task} ...")
+        regenerate_hdf5_dataset(input_path, output_path, debug=False)