add: utils for stabler, large scale upload (ds.push_to_hub may fail)

This commit is contained in:
fracapuano
2025-11-16 10:38:36 +00:00
parent 0846b5704c
commit f6755dbf20
2 changed files with 136 additions and 0 deletions
+35
View File
@@ -0,0 +1,35 @@
#!/bin/bash
#SBATCH -J b1k-upload
#SBATCH -p hopper-cpu
#SBATCH --qos=high
#SBATCH -c 8
#SBATCH -t 48:00:00
#SBATCH --mem=8G
#SBATCH --array=0-49%8
#SBATCH -D /admin/home/francesco_capuano/lerobot
#SBATCH -o /admin/home/francesco_capuano/lerobot/examples/behavior_1k/logs/%x-%A_%a.out
#SBATCH -e /admin/home/francesco_capuano/lerobot/examples/behavior_1k/logs/%x-%A_%a.err
set -euo pipefail
set -x
export PYTHONUNBUFFERED=1
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
source "$HOME/.bashrc" 2>/dev/null || true
if ! command -v conda >/dev/null 2>&1; then
source "$HOME/miniconda3/etc/profile.d/conda.sh" 2>/dev/null || true
source "$HOME/anaconda3/etc/profile.d/conda.sh" 2>/dev/null || true
fi
conda activate lerobot
# The SLURM_ARRAY_TASK_ID will be used as the task-id
TASK_ID=${SLURM_ARRAY_TASK_ID}
# Configuration
ROOT_PATH="/fsx/francesco_capuano/behavior1k-v3"
HF_USER="fracapuano"
python examples/behavior_1k/upload_folders.py \
--task-id ${TASK_ID} \
--root-path ${ROOT_PATH} \
--hf-user ${HF_USER}
+101
View File
@@ -0,0 +1,101 @@
import argparse
from pathlib import Path
from huggingface_hub import HfApi, upload_large_folder
def main():
parser = argparse.ArgumentParser(
description="Upload a folder to Hugging Face Hub using upload_large_folder"
)
parser.add_argument(
"--folder-path",
type=str,
required=False,
help="Path to the folder to upload (used if task-id is not provided)",
)
parser.add_argument(
"--repo-id",
type=str,
required=False,
help="Repository ID on Hugging Face Hub (e.g., 'username/repo-name'). If task-id is provided, will be constructed as '{hf-user}/behavior1k-task{task_id:04d}'",
)
parser.add_argument(
"--task-id",
type=int,
required=False,
help="Task index to upload (e.g., 0, 1, 2, ...). When provided, folder-path is constructed from root-path.",
)
parser.add_argument(
"--root-path",
type=str,
required=False,
help="Root path containing task folders (e.g., /fsx/user/behavior1k-v3). Used with --task-id to construct folder path.",
)
parser.add_argument(
"--hf-user",
type=str,
default=None,
help="Hugging Face username for constructing repo-id with task-id (default: from HF_USER env var or 'fracapuano')",
)
parser.add_argument(
"--create-repo", action="store_true", help="Create the repository if it doesn't exist"
)
args = parser.parse_args()
# Construct folder path and repo ID based on task-id or use provided values
if args.task_id is not None:
if not args.root_path:
raise ValueError("--root-path is required when --task-id is provided")
task_folder_name = f"behavior1k-task{args.task_id:04d}"
folder_path = Path(args.root_path) / task_folder_name
repo_id = f"{args.hf_user}/{task_folder_name}"
print(f"Task mode: uploading task {args.task_id}")
else:
if not args.folder_path or not args.repo_id:
raise ValueError(
"Either --task-id with --root-path, or both --folder-path and --repo-id must be provided"
)
folder_path = Path(args.folder_path)
repo_id = args.repo_id
# Validate folder path
if not folder_path.exists():
raise ValueError(f"Folder path does not exist: {folder_path}")
if not folder_path.is_dir():
raise ValueError(f"Path is not a directory: {folder_path}")
print(f"Uploading folder: {folder_path}")
print(f"Repository: {repo_id}")
# Create repository if requested
if args.create_repo:
api = HfApi()
print(f"Creating repository {repo_id}...")
try:
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
print("Repository created or already exists. Updating its contents")
except Exception as e:
print(f"Warning: Could not create repository: {e}")
# Upload the folder
print("Starting upload...")
try:
result = upload_large_folder(
folder_path=str(folder_path),
repo_id=repo_id,
repo_type="dataset",
)
print("✓ Upload completed successfully!")
print(f"Commit URL: {result}")
except Exception as e:
print(f"✗ Upload failed: {e}")
raise
if __name__ == "__main__":
main()