diff --git a/examples/dataset/inference_pi05.py b/examples/dataset/inference_pi05.py index 73522694d..1590c11dc 100644 --- a/examples/dataset/inference_pi05.py +++ b/examples/dataset/inference_pi05.py @@ -19,8 +19,10 @@ pre_processor, post_processor = make_pre_post_processors( pretrained_path="/fsx/jade_choghari/outputs/pi0_training_new/checkpoints/last/pretrained_model", ) +delta_timestamps = {'action': [0.0, 0.03333333333333333, 0.06666666666666667, 0.1, 0.13333333333333333, 0.16666666666666666, 0.2, 0.23333333333333334, 0.26666666666666666, 0.3, 0.3333333333333333, 0.36666666666666664, 0.4, 0.43333333333333335, 0.4666666666666667, 0.5, 0.5333333333333333, 0.5666666666666667, 0.6, 0.6333333333333333, 0.6666666666666666, 0.7, 0.7333333333333333, 0.7666666666666667, 0.8, 0.8333333333333334, 0.8666666666666667, 0.9, 0.9333333333333333, 0.9666666666666667, 1.0, 1.0333333333333334, 1.0666666666666667, 1.1, 1.1333333333333333, 1.1666666666666667, 1.2, 1.2333333333333334, 1.2666666666666666, 1.3, 1.3333333333333333, 1.3666666666666667, 1.4, 1.4333333333333333, 1.4666666666666666, 1.5, 1.5333333333333334, 1.5666666666666667, 1.6, 1.6333333333333333]} + +dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/outputs/pgen_annotations1", delta_timestamps=delta_timestamps) -dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/outputs/pgen_annotations1") # rename map --rename_map='{ # "observation.images.side": "observation.images.base_0_rgb", # "observation.images.up": "observation.images.left_wrist_0_rgb" @@ -45,6 +47,7 @@ dataloader = torch.utils.data.DataLoader( batch = next(iter(dataloader)) batch = pre_processor(batch) +# NOTE(review): removed stray breakpoint() — debug leftover must not be committed policy.train() # run inference # action = policy.select_action(batch) diff --git a/examples/dataset/run.sh b/examples/dataset/run.sh index 164cda5ae..087d9e956 100644 --- a/examples/dataset/run.sh +++ b/examples/dataset/run.sh @@ -7,4 +7,4 @@ python examples/dataset/annotate.py \ --repo-id
lerobot/svla_so101_pickplace \ --video-key observation.images.side \ --model Qwen/Qwen3-VL-30B-A3B-Instruct \ - --episodes 3 5 7 44 \ No newline at end of file + --episodes 5 \ No newline at end of file diff --git a/src/lerobot/policies/pi05/compare.txt b/src/lerobot/policies/pi05/compare.txt new file mode 100644 index 000000000..aaebecec3 --- /dev/null +++ b/src/lerobot/policies/pi05/compare.txt @@ -0,0 +1,64 @@ + +Fine tune output +(Pdb) images[2].mean() +tensor(-1., device='cuda:0') +(Pdb) images[1].mean() +tensor(-0.5780, device='cuda:0') +(Pdb) images[0].mean() +tensor(-0.7716, device='cuda:0') +(Pdb) (Pdb) high_level_task[0] +tensor([ 2, 7978, 2403, 6911, 235292, 5651, 3124, 573, 18571, + 7762, 6643, 573, 9010, 72993, 21810, 4894, 3040, 235292, + 235248, 235274, 235274, 235274, 728, 235274, 235248, 235284, 235308, + 235308, 235248, 235274, 235318, 235315, 235248, 235274, 235310, 235318, + 235248, 235284, 235318, 235248, 235274, 235284, 235321, 235248, 235274, + 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, + 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, + 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, + 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, + 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, + 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, + 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, + 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, + 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, + 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, + 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235289, + 4284, 8277, 235292, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0], 
device='cuda:0') +(Pdb) subtask_tokens[0] +tensor([ 2, 28040, 7762, 14574, 6643, 9010, 37901, 21810, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + device='cuda:0') +(Pdb) actions.shape +torch.Size([4, 50, 32]) +(Pdb) actions.mean() +tensor(0.0143, device='cuda:0') +(Pdb) + + + + +Inference: diff --git a/src/lerobot/policies/pi05/modeling_pi05.py b/src/lerobot/policies/pi05/modeling_pi05.py index 62a79c95c..c9e22c0a1 100644 --- a/src/lerobot/policies/pi05/modeling_pi05.py +++ b/src/lerobot/policies/pi05/modeling_pi05.py @@ -756,7 +756,7 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch` time_expanded = time[:, None, None] x_t = time_expanded * noise + (1 - time_expanded) * actions u_t = noise - actions - + # Embed prefix (images + high_level_task + subtask_tokens) # Use high_level_task (prompt WITHOUT subtask) + subtask_tokens to predict prefix_embs, prefix_pad_masks, prefix_att_masks, total_T_images = self.embed_prefix( @@ -809,8 +809,7 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch` # Apply mask and compute mean loss over valid tokens masked_loss = loss_per_token * subtask_masks.float() subtask_loss = masked_loss.sum() / subtask_masks.sum().clamp(min=1) - - breakpoint() + # Convert embeddings to bfloat16 if needed for the model if ( self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype @@ -912,7 +911,6 @@ class PI05Pytorch(nn.Module): # see openpi 
`PI0Pytorch` # Embed the generated token and append to prefix next_token_unsqueezed = next_token.unsqueeze(1) # (B, 1) - breakpoint() def next_token_embed_func(next_token_unsqueezed): next_emb = self.paligemma_with_expert.embed_language_tokens(next_token_unsqueezed) @@ -1419,7 +1417,7 @@ class PI05Policy(PreTrainedPolicy): # Use high_level_task tokens (WITHOUT subtask) for inference - we'll generate the subtask high_level_task = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_TOKENS}"] high_level_task_masks = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_ATTENTION_MASK}"] - breakpoint() + # Sample actions using the model (pass through RTC kwargs, no separate state needed for PI05) actions = self.model.sample_actions( images, img_masks, high_level_task, high_level_task_masks, @@ -1451,7 +1449,7 @@ class PI05Policy(PreTrainedPolicy): valid_tokens = subtask_tokens[i][subtask_masks[i].bool()] if len(valid_tokens) > 0: decoded_text = self.tokenizer.decode(valid_tokens, skip_special_tokens=True) - print(f"[Training] Ground truth subtask {i}: {decoded_text}") + # print(f"[Training] Ground truth subtask {i}: {decoded_text}") # Compute loss (no separate state needed for PI05) # high_level_task = instruction tokens WITHOUT subtask (e.g., "High level task: X; State: Y; Subtask:") @@ -1461,7 +1459,6 @@ class PI05Policy(PreTrainedPolicy): # Extract the total loss loss = loss_dict["loss"] - breakpoint() # Prepare detailed loss dictionary for logging detailed_loss_dict = { "loss": loss.item(), diff --git a/src/lerobot/processor/tokenizer_processor.py b/src/lerobot/processor/tokenizer_processor.py index abdaf41fc..18b51a9ce 100644 --- a/src/lerobot/processor/tokenizer_processor.py +++ b/src/lerobot/processor/tokenizer_processor.py @@ -302,15 +302,17 @@ class TokenizerProcessorStep(ObservationProcessorStep): def _tokenize_text(self, text: str | list[str]) -> dict[str, torch.Tensor]: """ - A wrapper around the tokenizer call. 
+ A wrapper around the tokenizer call that appends an EOS token to each sequence. Args: text: A string or list of strings to tokenize. Returns: - A dictionary containing tokenized 'input_ids' and 'attention_mask' as PyTorch tensors. + A dictionary containing tokenized 'input_ids' and 'attention_mask' as PyTorch tensors, + with EOS token appended at the end of each sequence. """ - return self.input_tokenizer( + # Tokenize normally + tokenized = self.input_tokenizer( text, max_length=self.max_length, truncation=self.truncation, @@ -318,6 +320,34 @@ class TokenizerProcessorStep(ObservationProcessorStep): padding_side=self.padding_side, return_tensors="pt", ) + + # Get EOS token ID + eos_token_id = self.input_tokenizer.eos_token_id + if eos_token_id is None: + # Some tokenizers don't have an EOS token, skip modification + return tokenized + + # Append EOS token to each sequence (before padding) + input_ids = tokenized["input_ids"] + attention_mask = tokenized["attention_mask"] + + for i in range(input_ids.shape[0]): + # Find the position of the last non-padding token + non_pad_positions = (attention_mask[i] == 1).nonzero(as_tuple=True)[0] + + if len(non_pad_positions) > 0: + last_token_pos = non_pad_positions[-1].item() + + # Check if there's room to add EOS token + if last_token_pos + 1 < self.max_length: + # Insert EOS token after the last real token + input_ids[i, last_token_pos + 1] = eos_token_id + attention_mask[i, last_token_pos + 1] = 1 + else: + # If at max length, replace the last token with EOS + input_ids[i, last_token_pos] = eos_token_id + + return {"input_ids": input_ids, "attention_mask": attention_mask} def get_config(self) -> dict[str, Any]: """ diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index 1ebdee600..02401068c 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -90,6 +90,7 @@ def update_policy( # Let accelerator handle mixed precision with 
accelerator.autocast(): loss, output_dict = policy.forward(batch) + # NOTE(review): removed stray breakpoint() — it would halt every training step # TODO(rcadene): policy.unnormalize_outputs(out_dict) # Use accelerator's backward method