Add EOS token in tokenizer (working)

2026-07-23 09:46:00 +00:00 · 2025-12-14 14:54:07 +00:00
parent 522396a15a
commit fddd044306
6 changed files with 107 additions and 12 deletions
@@ -19,8 +19,10 @@ pre_processor, post_processor = make_pre_post_processors(
    pretrained_path="/fsx/jade_choghari/outputs/pi0_training_new/checkpoints/last/pretrained_model",
 )
 delta_timestamps = {'action': [0.0, 0.03333333333333333, 0.06666666666666667, 0.1, 0.13333333333333333, 0.16666666666666666, 0.2, 0.23333333333333334, 0.26666666666666666, 0.3, 0.3333333333333333, 0.36666666666666664, 0.4, 0.43333333333333335, 0.4666666666666667, 0.5, 0.5333333333333333, 0.5666666666666667, 0.6, 0.6333333333333333, 0.6666666666666666, 0.7, 0.7333333333333333, 0.7666666666666667, 0.8, 0.8333333333333334, 0.8666666666666667, 0.9, 0.9333333333333333, 0.9666666666666667, 1.0, 1.0333333333333334, 1.0666666666666667, 1.1, 1.1333333333333333, 1.1666666666666667, 1.2, 1.2333333333333334, 1.2666666666666666, 1.3, 1.3333333333333333, 1.3666666666666667, 1.4, 1.4333333333333333, 1.4666666666666666, 1.5, 1.5333333333333334, 1.5666666666666667, 1.6, 1.6333333333333333]}
 dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/outputs/pgen_annotations1", delta_timestamps=delta_timestamps)
 dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/outputs/pgen_annotations1")
 # rename map --rename_map='{
 #         "observation.images.side": "observation.images.base_0_rgb",
 #         "observation.images.up": "observation.images.left_wrist_0_rgb"
@@ -45,6 +47,7 @@ dataloader = torch.utils.data.DataLoader(
 batch = next(iter(dataloader))
 batch = pre_processor(batch)
 breakpoint()
 policy.train()
 # run inference
 # action = policy.select_action(batch)
@@ -7,4 +7,4 @@ python examples/dataset/annotate.py \
    --repo-id lerobot/svla_so101_pickplace \
    --video-key observation.images.side \
    --model Qwen/Qwen3-VL-30B-A3B-Instruct \
-    --episodes 3 5 7 44
+    --episodes 5
@@ -0,0 +1,64 @@
 Fine tune output
 (Pdb) images[2].mean()
 tensor(-1., device='cuda:0')
 (Pdb) images[1].mean()
 tensor(-0.5780, device='cuda:0')
 (Pdb) images[0].mean()
 tensor(-0.7716, device='cuda:0')
 (Pdb) (Pdb) high_level_task[0]
 tensor([     2,   7978,   2403,   6911, 235292,   5651,   3124,    573,  18571,
          7762,   6643,    573,   9010,  72993,  21810,   4894,   3040, 235292,
        235248, 235274, 235274, 235274,    728, 235274, 235248, 235284, 235308,
        235308, 235248, 235274, 235318, 235315, 235248, 235274, 235310, 235318,
        235248, 235284, 235318, 235248, 235274, 235284, 235321, 235248, 235274,
        235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284,
        235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321,
        235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248,
        235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274,
        235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284,
        235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321,
        235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248,
        235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274,
        235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284,
        235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321,
        235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235289,
          4284,   8277, 235292,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0], device='cuda:0')
 (Pdb) subtask_tokens[0]
 tensor([    2, 28040,  7762, 14574,  6643,  9010, 37901, 21810,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
       device='cuda:0')
 (Pdb) actions.shape
 torch.Size([4, 50, 32])
 (Pdb) actions.mean()
 tensor(0.0143, device='cuda:0')
 (Pdb) 
 Inference:
@@ -756,7 +756,7 @@ class PI05Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        time_expanded = time[:, None, None]
        x_t = time_expanded * noise + (1 - time_expanded) * actions
        u_t = noise - actions
-        
+
        # Embed prefix (images + high_level_task + subtask_tokens)
        # Use high_level_task (prompt WITHOUT subtask) + subtask_tokens to predict
        prefix_embs, prefix_pad_masks, prefix_att_masks, total_T_images = self.embed_prefix(
@@ -809,8 +809,7 @@ class PI05Pytorch(nn.Module):  # see openpi `PI0Pytorch`
        # Apply mask and compute mean loss over valid tokens
        masked_loss = loss_per_token * subtask_masks.float()
        subtask_loss = masked_loss.sum() / subtask_masks.sum().clamp(min=1)
-        
+
        breakpoint()
        # Convert embeddings to bfloat16 if needed for the model
        if (
            self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
@@ -912,7 +911,6 @@ class PI05Pytorch(nn.Module):  # see openpi `PI0Pytorch`
            # Embed the generated token and append to prefix
            next_token_unsqueezed = next_token.unsqueeze(1)  # (B, 1)
            breakpoint()
            def next_token_embed_func(next_token_unsqueezed):
                next_emb = self.paligemma_with_expert.embed_language_tokens(next_token_unsqueezed)
@@ -1419,7 +1417,7 @@ class PI05Policy(PreTrainedPolicy):
        # Use high_level_task tokens (WITHOUT subtask) for inference - we'll generate the subtask
        high_level_task = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_TOKENS}"]
        high_level_task_masks = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_ATTENTION_MASK}"]
-        breakpoint()
+        
        # Sample actions using the model (pass through RTC kwargs, no separate state needed for PI05)
        actions = self.model.sample_actions(
            images, img_masks, high_level_task, high_level_task_masks, 
@@ -1451,7 +1449,7 @@ class PI05Policy(PreTrainedPolicy):
                valid_tokens = subtask_tokens[i][subtask_masks[i].bool()]
                if len(valid_tokens) > 0:
                    decoded_text = self.tokenizer.decode(valid_tokens, skip_special_tokens=True)
-                    print(f"[Training] Ground truth subtask {i}: {decoded_text}")
+                    # print(f"[Training] Ground truth subtask {i}: {decoded_text}")
        # Compute loss (no separate state needed for PI05)
        # high_level_task = instruction tokens WITHOUT subtask (e.g., "High level task: X; State: Y; Subtask:")
@@ -1461,7 +1459,6 @@ class PI05Policy(PreTrainedPolicy):
        # Extract the total loss
        loss = loss_dict["loss"]
        breakpoint()
        # Prepare detailed loss dictionary for logging
        detailed_loss_dict = {
            "loss": loss.item(),
@@ -302,15 +302,17 @@ class TokenizerProcessorStep(ObservationProcessorStep):
    def _tokenize_text(self, text: str | list[str]) -> dict[str, torch.Tensor]:
        """
-        A wrapper around the tokenizer call.
+        A wrapper around the tokenizer call that appends an EOS token to each sequence.
        Args:
            text: A string or list of strings to tokenize.
        Returns:
-            A dictionary containing tokenized 'input_ids' and 'attention_mask' as PyTorch tensors.
+            A dictionary containing tokenized 'input_ids' and 'attention_mask' as PyTorch tensors,
            with EOS token appended at the end of each sequence.
        """
-        return self.input_tokenizer(
+        # Tokenize normally
        tokenized = self.input_tokenizer(
            text,
            max_length=self.max_length,
            truncation=self.truncation,
@@ -318,6 +320,34 @@ class TokenizerProcessorStep(ObservationProcessorStep):
            padding_side=self.padding_side,
            return_tensors="pt",
        )
        # Get EOS token ID
        eos_token_id = self.input_tokenizer.eos_token_id
        if eos_token_id is None:
            # Some tokenizers don't have an EOS token, skip modification
            return tokenized
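        # Write EOS right after the last attended token of each already-padded sequence
        # (NOTE(review): this presumes right-side padding — confirm against padding_side)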
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        for i in range(input_ids.shape[0]):
            # Find the position of the last non-padding token
            non_pad_positions = (attention_mask[i] == 1).nonzero(as_tuple=True)[0]
            if len(non_pad_positions) > 0:
                last_token_pos = non_pad_positions[-1].item()
                # Check if there's room to add EOS token
                if last_token_pos + 1 < self.max_length:
                    # Insert EOS token after the last real token
                    input_ids[i, last_token_pos + 1] = eos_token_id
                    attention_mask[i, last_token_pos + 1] = 1
                else:
                    # If at max length, replace the last token with EOS
                    input_ids[i, last_token_pos] = eos_token_id
        return {"input_ids": input_ids, "attention_mask": attention_mask}
    def get_config(self) -> dict[str, Any]:
        """
@@ -90,6 +90,7 @@ def update_policy(
    # Let accelerator handle mixed precision
    with accelerator.autocast():
        loss, output_dict = policy.forward(batch)
        breakpoint()
        # TODO(rcadene): policy.unnormalize_outputs(out_dict)
    # Use accelerator's backward method