#!/usr/bin/env python

# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Simple script to download OpenX datasets using TensorFlow Datasets.

Usage:
    python examples/port_datasets/download_openx_datasets.py
    python examples/port_datasets/download_openx_datasets.py --download-dir /path/to/datasets
    python examples/port_datasets/download_openx_datasets.py --datasets fractal20220817_data kuka bridge
    python examples/port_datasets/download_openx_datasets.py --list-datasets
"""

import argparse
from pathlib import Path
from typing import Optional, Sequence, Union

# Full list of OpenX dataset names
# Optionally replace with filtered datasets from the Google Sheet
DATASET_NAMES = [
    "fractal20220817_data",
    "kuka",
    "bridge",
    "taco_play",
    "jaco_play",
    "berkeley_cable_routing",
    "roboturk",
    "nyu_door_opening_surprising_effectiveness",
    "viola",
    "berkeley_autolab_ur5",
    "toto",
    "language_table",
    "columbia_cairlab_pusht_real",
    "stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
    "nyu_rot_dataset_converted_externally_to_rlds",
    "stanford_hydra_dataset_converted_externally_to_rlds",
    "austin_buds_dataset_converted_externally_to_rlds",
    "nyu_franka_play_dataset_converted_externally_to_rlds",
    "maniskill_dataset_converted_externally_to_rlds",
    "furniture_bench_dataset_converted_externally_to_rlds",
    "cmu_franka_exploration_dataset_converted_externally_to_rlds",
    "ucsd_kitchen_dataset_converted_externally_to_rlds",
    "ucsd_pick_and_place_dataset_converted_externally_to_rlds",
    "austin_sailor_dataset_converted_externally_to_rlds",
    "austin_sirius_dataset_converted_externally_to_rlds",
    "bc_z",
    "usc_cloth_sim_converted_externally_to_rlds",
    "utokyo_pr2_opening_fridge_converted_externally_to_rlds",
    "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
    "utokyo_saytap_converted_externally_to_rlds",
    "utokyo_xarm_pick_and_place_converted_externally_to_rlds",
    "utokyo_xarm_bimanual_converted_externally_to_rlds",
    "robo_net",
    "berkeley_mvp_converted_externally_to_rlds",
    "berkeley_rpt_converted_externally_to_rlds",
    "kaist_nonprehensile_converted_externally_to_rlds",
    "stanford_mask_vit_converted_externally_to_rlds",
    "tokyo_u_lsmo_converted_externally_to_rlds",
    "dlr_sara_pour_converted_externally_to_rlds",
    "dlr_sara_grid_clamp_converted_externally_to_rlds",
    "dlr_edan_shared_control_converted_externally_to_rlds",
    "asu_table_top_converted_externally_to_rlds",
    "stanford_robocook_converted_externally_to_rlds",
    "eth_agent_affordances",
    "imperialcollege_sawyer_wrist_cam",
    "iamlab_cmu_pickup_insert_converted_externally_to_rlds",
    "uiuc_d3field",
    "utaustin_mutex",
    "berkeley_fanuc_manipulation",
    "cmu_food_manipulation",
    "cmu_play_fusion",
    "cmu_stretch",
    "berkeley_gnm_recon",
    "berkeley_gnm_cory_hall",
    "berkeley_gnm_sac_son",
]

DEFAULT_DOWNLOAD_DIR = "~/tensorflow_datasets"


def download_datasets(
    datasets: Sequence[str], download_dir: Union[str, Path]
) -> list[tuple[str, str]]:
    """Download the specified datasets to the given directory.

    Each dataset is fetched with ``tfds.load(..., download=True)``. A failure
    on one dataset is recorded and reported in the final summary; it does not
    stop the remaining downloads.

    Args:
        datasets: Names of the datasets to download.
        download_dir: Target directory. ``~`` is expanded and the directory is
            created (including parents) when it does not exist.

    Returns:
        A list of ``(dataset_name, error_message)`` pairs, one per dataset
        that failed to download. Empty when everything succeeded.
    """
    # Imported lazily: these third-party packages are only needed when a
    # download actually runs, so `--help` and `--list-datasets` work without
    # them being installed or imported.
    import tensorflow_datasets as tfds
    import tqdm

    download_dir = Path(download_dir).expanduser().resolve()
    print(f"Downloading {len(datasets)} datasets to {download_dir}")

    # Create directory if it doesn't exist
    download_dir.mkdir(parents=True, exist_ok=True)

    failed_downloads: list[tuple[str, str]] = []

    for dataset_name in tqdm.tqdm(datasets, desc="Downloading datasets"):
        try:
            print(f"\nDownloading {dataset_name}...")
            _ = tfds.load(dataset_name, data_dir=str(download_dir), download=True)
        except Exception as e:  # best effort: keep going with the other datasets
            print(f"✗ Failed to download {dataset_name}: {e}")
            failed_downloads.append((dataset_name, str(e)))
        else:
            print(f"✓ Successfully downloaded {dataset_name}")

    # Summary
    print(f"\n{'=' * 60}")
    print("Download Summary:")
    print(f"  Total datasets: {len(datasets)}")
    print(f"  Successfully downloaded: {len(datasets) - len(failed_downloads)}")
    print(f"  Failed downloads: {len(failed_downloads)}")

    if failed_downloads:
        print("\nFailed downloads:")
        for dataset_name, error in failed_downloads:
            print(f"  - {dataset_name}: {error}")

    print(f"\nDatasets saved to: {download_dir}")

    return failed_downloads


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Parse command-line arguments and run the requested action.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior of a plain ``main()``.

    Returns:
        Process exit status: ``0`` on success (or after ``--list-datasets``),
        ``1`` when at least one dataset failed to download.
    """
    parser = argparse.ArgumentParser(
        description="Download OpenX datasets using TensorFlow Datasets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download all OpenX datasets to default directory
  python download_openx_datasets.py

  # Download to specific directory
  python download_openx_datasets.py --download-dir /path/to/datasets

  # Download only specific datasets
  python download_openx_datasets.py --datasets fractal20220817_data kuka bridge

  # Download RT-1 dataset only
  python download_openx_datasets.py --datasets fractal20220817_data
        """,
    )

    parser.add_argument(
        "--download-dir",
        type=str,
        default=DEFAULT_DOWNLOAD_DIR,
        help=f"Directory to download datasets to (default: {DEFAULT_DOWNLOAD_DIR})",
    )
    parser.add_argument(
        "--datasets",
        # "+" rather than "*": a bare `--datasets` (e.g. from an empty shell
        # variable) must be rejected instead of silently falling back to
        # downloading every OpenX dataset.
        nargs="+",
        default=None,
        help="Specific datasets to download. If not provided, downloads all OpenX datasets.",
    )
    parser.add_argument(
        "--list-datasets",
        action="store_true",
        help="List all available dataset names and exit",
    )

    args = parser.parse_args(argv)

    if args.list_datasets:
        print("Available OpenX datasets:")
        for i, dataset in enumerate(DATASET_NAMES, 1):
            print(f"  {i:2d}. {dataset}")
        print(f"\nTotal: {len(DATASET_NAMES)} datasets")
        return 0

    # Determine which datasets to download
    if args.datasets is not None:
        datasets_to_download = args.datasets
        # Validate dataset names. Unknown names only trigger a warning and are
        # still passed on to the download step.
        known_names = set(DATASET_NAMES)
        invalid_datasets = [d for d in datasets_to_download if d not in known_names]
        if invalid_datasets:
            print(f"Warning: Unknown datasets: {invalid_datasets}")
            print("Use --list-datasets to see available datasets")
    else:
        datasets_to_download = DATASET_NAMES

    failed_downloads = download_datasets(datasets_to_download, args.download_dir)
    # Report failures through the exit status so shell pipelines can react.
    return 1 if failed_downloads else 0


if __name__ == "__main__":
    raise SystemExit(main())