update subtask annotate

This commit is contained in:
Jade Choghari
2026-01-21 13:59:16 +00:00
parent dc85e9b742
commit d0b6a66f34
5 changed files with 43 additions and 25 deletions
@@ -16,13 +16,14 @@ TEMPERATURE=0.9
SAMPLE_INTERVAL=5.0 # generate dialogue every 5 seconds (all episodes processed)
# Run subtask annotation
# python /admin/home/jade_choghari/lerobot/src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
# --repo-id "$REPO_ID" \
# --video-key observation.images.image \
# --output-dir "$OUTPUT_DIR" \
# --skip-existing \
# --output-repo-id "jadechoghari/libero10-annotate" \
# --batch-size "$BATCH_SIZE" \
python /admin/home/jade_choghari/lerobot/src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
--repo-id "$REPO_ID" \
--video-key observation.images.image \
--output-dir "$OUTPUT_DIR" \
--skip-existing \
--output-repo-id "jadechoghari/libero10-annotate" \
--batch-size "$BATCH_SIZE" \
# run synthetic data generation (all episodes processed)
# python examples/dataset/annotate_pgen.py \
# --repo-id "$REPO_ID" \
@@ -41,10 +42,10 @@ SAMPLE_INTERVAL=5.0 # generate dialogue every 1 second (all episodes processed)
# add --push-to-hub flag
# efficient batch processing: 4 episodes at once
python /admin/home/jade_choghari/lerobot/src/lerobot/policies/pi05_full/annotate/high_level_annotate.py \
--data-dir "/fsx/jade_choghari/outputs/libero-10-annotate" \
--output-dir "$OUTPUT_DIR" \
--video-mode \
--video-key observation.images.image \
--video-batch-size "$BATCH_SIZE" \
--sample-interval 5.0
# python /admin/home/jade_choghari/lerobot/src/lerobot/policies/pi05_full/annotate/high_level_annotate.py \
# --data-dir "/fsx/jade_choghari/outputs/libero-10-annotate" \
# --output-dir "$OUTPUT_DIR" \
# --video-mode \
# --video-key observation.images.image \
# --video-batch-size "$BATCH_SIZE" \
# --sample-interval 5.0
@@ -4,7 +4,7 @@ from huggingface_hub import HfApi
import lerobot
from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/.cache/huggingface/lerobot/lerobot/libero_10/")
dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/outputs/libero-10-annotate")
dataloader = torch.utils.data.DataLoader(
dataset,
@@ -15,12 +15,18 @@ dataloader = torch.utils.data.DataLoader(
batch = next(iter(dataloader))
print(batch.keys())
print(batch['task_index_high_level'].shape)
print(batch['task_index_high_level'])
print(batch['user_prompt'][0])
print(batch['robot_utterance'][0])
print(batch['task'][0])
# print(batch['task_index_high_level'].shape)
# print(batch['task_index_high_level'])
# print(batch['user_prompt'][0])
# print(batch['robot_utterance'][0])
# print(batch['task'][0])
valid_episode_list = []
for episode_idx in range(len(dataset.meta.episodes)):
subtask_index = dataset[episode_idx]["subtask_index"]
valid_episode_list.append(episode_idx)
print(len(valid_episode_list))
# read this parquet /fsx/jade_choghari/outputs/pgen_annotations1/meta/tasks.parquet
# import pandas as pd
@@ -1012,8 +1012,9 @@ def create_subtask_index_array(
console = Console()
# Array to store subtask index for each frame
# Initialize with -1 to indicate unannotated frames
full_dataset_length = len(dataset)
subtask_indices = np.zeros(full_dataset_length, dtype=np.int64)
subtask_indices = np.full(full_dataset_length, -1, dtype=np.int64)
console.print(f"[cyan]Creating subtask_index array for {full_dataset_length} frames...[/cyan]")
@@ -54,8 +54,8 @@ class Pi05FullPrepareStateTokenizerProcessorStep(ProcessorStep):
"""
max_state_dim: int = 32
user_prompt_key: str = "user_prompt"
command_key: str = "task"
user_prompt_key: str = "task"
command_key: str = "subtask"
def __call__(self, transition: EnvTransition) -> EnvTransition:
transition = transition.copy()
@@ -85,7 +85,7 @@ class Pi05FullPrepareStateTokenizerProcessorStep(ProcessorStep):
for i, user_prompt in enumerate(user_prompts):
cleaned_text = user_prompt.strip().replace("_", " ").replace("\n", " ")
state_str = " ".join(map(str, discretized_states[i]))
full_prompt = f"User prompt: {cleaned_text}, State: {state_str};\n"
full_prompt = f"Task: {cleaned_text}, State: {state_str};\n"
full_prompts.append(full_prompt)
transition[TransitionKey.COMPLEMENTARY_DATA][self.user_prompt_key] = full_prompts
+11 -1
View File
@@ -340,11 +340,21 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
# create dataloader for offline training
if hasattr(cfg.policy, "drop_n_last_frames"):
# loop over the dataset's subtask annotations to find episode indices whose subtask_index != -1 (i.e. annotated episodes)
# valid_episode_list is passed to episode_indices_to_use
valid_episode_list = []
for episode_idx in range(len(dataset.meta.episodes)):
subtask_index = dataset[episode_idx]["subtask_index"]
if subtask_index != -1:
valid_episode_list.append(episode_idx)
episode_indices_to_use = valid_episode_list
shuffle = False
sampler = EpisodeAwareSampler(
dataset.meta.episodes["dataset_from_index"],
dataset.meta.episodes["dataset_to_index"],
episode_indices_to_use=dataset.episodes,
episode_indices_to_use=episode_indices_to_use,
drop_n_last_frames=cfg.policy.drop_n_last_frames,
shuffle=True,
)