mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-22 12:09:42 +00:00
add eos token in tokenizer, working
This commit is contained in:
@@ -19,8 +19,10 @@ pre_processor, post_processor = make_pre_post_processors(
|
|||||||
pretrained_path="/fsx/jade_choghari/outputs/pi0_training_new/checkpoints/last/pretrained_model",
|
pretrained_path="/fsx/jade_choghari/outputs/pi0_training_new/checkpoints/last/pretrained_model",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
delta_timestamps = {'action': [0.0, 0.03333333333333333, 0.06666666666666667, 0.1, 0.13333333333333333, 0.16666666666666666, 0.2, 0.23333333333333334, 0.26666666666666666, 0.3, 0.3333333333333333, 0.36666666666666664, 0.4, 0.43333333333333335, 0.4666666666666667, 0.5, 0.5333333333333333, 0.5666666666666667, 0.6, 0.6333333333333333, 0.6666666666666666, 0.7, 0.7333333333333333, 0.7666666666666667, 0.8, 0.8333333333333334, 0.8666666666666667, 0.9, 0.9333333333333333, 0.9666666666666667, 1.0, 1.0333333333333334, 1.0666666666666667, 1.1, 1.1333333333333333, 1.1666666666666667, 1.2, 1.2333333333333334, 1.2666666666666666, 1.3, 1.3333333333333333, 1.3666666666666667, 1.4, 1.4333333333333333, 1.4666666666666666, 1.5, 1.5333333333333334, 1.5666666666666667, 1.6, 1.6333333333333333]}
|
||||||
|
|
||||||
|
dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/outputs/pgen_annotations1", delta_timestamps=delta_timestamps)
|
||||||
|
|
||||||
dataset = LeRobotDataset(repo_id="local", root="/fsx/jade_choghari/outputs/pgen_annotations1")
|
|
||||||
# rename map --rename_map='{
|
# rename map --rename_map='{
|
||||||
# "observation.images.side": "observation.images.base_0_rgb",
|
# "observation.images.side": "observation.images.base_0_rgb",
|
||||||
# "observation.images.up": "observation.images.left_wrist_0_rgb"
|
# "observation.images.up": "observation.images.left_wrist_0_rgb"
|
||||||
@@ -45,6 +47,7 @@ dataloader = torch.utils.data.DataLoader(
|
|||||||
batch = next(iter(dataloader))
|
batch = next(iter(dataloader))
|
||||||
|
|
||||||
batch = pre_processor(batch)
|
batch = pre_processor(batch)
|
||||||
|
breakpoint()
|
||||||
policy.train()
|
policy.train()
|
||||||
# run inference
|
# run inference
|
||||||
# action = policy.select_action(batch)
|
# action = policy.select_action(batch)
|
||||||
|
|||||||
@@ -7,4 +7,4 @@ python examples/dataset/annotate.py \
|
|||||||
--repo-id lerobot/svla_so101_pickplace \
|
--repo-id lerobot/svla_so101_pickplace \
|
||||||
--video-key observation.images.side \
|
--video-key observation.images.side \
|
||||||
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
|
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
|
||||||
--episodes 3 5 7 44
|
--episodes 5
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
|
||||||
|
Fine tune output
|
||||||
|
(Pdb) images[2].mean()
|
||||||
|
tensor(-1., device='cuda:0')
|
||||||
|
(Pdb) images[1].mean()
|
||||||
|
tensor(-0.5780, device='cuda:0')
|
||||||
|
(Pdb) images[0].mean()
|
||||||
|
tensor(-0.7716, device='cuda:0')
|
||||||
|
(Pdb) (Pdb) high_level_task[0]
|
||||||
|
tensor([ 2, 7978, 2403, 6911, 235292, 5651, 3124, 573, 18571,
|
||||||
|
7762, 6643, 573, 9010, 72993, 21810, 4894, 3040, 235292,
|
||||||
|
235248, 235274, 235274, 235274, 728, 235274, 235248, 235284, 235308,
|
||||||
|
235308, 235248, 235274, 235318, 235315, 235248, 235274, 235310, 235318,
|
||||||
|
235248, 235284, 235318, 235248, 235274, 235284, 235321, 235248, 235274,
|
||||||
|
235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284,
|
||||||
|
235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321,
|
||||||
|
235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248,
|
||||||
|
235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274,
|
||||||
|
235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284,
|
||||||
|
235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321,
|
||||||
|
235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248,
|
||||||
|
235274, 235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274,
|
||||||
|
235284, 235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284,
|
||||||
|
235321, 235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321,
|
||||||
|
235248, 235274, 235284, 235321, 235248, 235274, 235284, 235321, 235289,
|
||||||
|
4284, 8277, 235292, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0], device='cuda:0')
|
||||||
|
(Pdb) subtask_tokens[0]
|
||||||
|
tensor([ 2, 28040, 7762, 14574, 6643, 9010, 37901, 21810, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||||
|
device='cuda:0')
|
||||||
|
(Pdb) actions.shape
|
||||||
|
torch.Size([4, 50, 32])
|
||||||
|
(Pdb) actions.mean()
|
||||||
|
tensor(0.0143, device='cuda:0')
|
||||||
|
(Pdb)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Inference:
|
||||||
@@ -756,7 +756,7 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
|
|||||||
time_expanded = time[:, None, None]
|
time_expanded = time[:, None, None]
|
||||||
x_t = time_expanded * noise + (1 - time_expanded) * actions
|
x_t = time_expanded * noise + (1 - time_expanded) * actions
|
||||||
u_t = noise - actions
|
u_t = noise - actions
|
||||||
|
|
||||||
# Embed prefix (images + high_level_task + subtask_tokens)
|
# Embed prefix (images + high_level_task + subtask_tokens)
|
||||||
# Use high_level_task (prompt WITHOUT subtask) + subtask_tokens to predict
|
# Use high_level_task (prompt WITHOUT subtask) + subtask_tokens to predict
|
||||||
prefix_embs, prefix_pad_masks, prefix_att_masks, total_T_images = self.embed_prefix(
|
prefix_embs, prefix_pad_masks, prefix_att_masks, total_T_images = self.embed_prefix(
|
||||||
@@ -809,8 +809,7 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
|
|||||||
# Apply mask and compute mean loss over valid tokens
|
# Apply mask and compute mean loss over valid tokens
|
||||||
masked_loss = loss_per_token * subtask_masks.float()
|
masked_loss = loss_per_token * subtask_masks.float()
|
||||||
subtask_loss = masked_loss.sum() / subtask_masks.sum().clamp(min=1)
|
subtask_loss = masked_loss.sum() / subtask_masks.sum().clamp(min=1)
|
||||||
|
|
||||||
breakpoint()
|
|
||||||
# Convert embeddings to bfloat16 if needed for the model
|
# Convert embeddings to bfloat16 if needed for the model
|
||||||
if (
|
if (
|
||||||
self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
|
self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
|
||||||
@@ -912,7 +911,6 @@ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
|
|||||||
|
|
||||||
# Embed the generated token and append to prefix
|
# Embed the generated token and append to prefix
|
||||||
next_token_unsqueezed = next_token.unsqueeze(1) # (B, 1)
|
next_token_unsqueezed = next_token.unsqueeze(1) # (B, 1)
|
||||||
breakpoint()
|
|
||||||
|
|
||||||
def next_token_embed_func(next_token_unsqueezed):
|
def next_token_embed_func(next_token_unsqueezed):
|
||||||
next_emb = self.paligemma_with_expert.embed_language_tokens(next_token_unsqueezed)
|
next_emb = self.paligemma_with_expert.embed_language_tokens(next_token_unsqueezed)
|
||||||
@@ -1419,7 +1417,7 @@ class PI05Policy(PreTrainedPolicy):
|
|||||||
# Use high_level_task tokens (WITHOUT subtask) for inference - we'll generate the subtask
|
# Use high_level_task tokens (WITHOUT subtask) for inference - we'll generate the subtask
|
||||||
high_level_task = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_TOKENS}"]
|
high_level_task = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_TOKENS}"]
|
||||||
high_level_task_masks = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_ATTENTION_MASK}"]
|
high_level_task_masks = batch[f"{OBS_LANGUAGE_HIGH_LEVEL_TASK_ATTENTION_MASK}"]
|
||||||
breakpoint()
|
|
||||||
# Sample actions using the model (pass through RTC kwargs, no separate state needed for PI05)
|
# Sample actions using the model (pass through RTC kwargs, no separate state needed for PI05)
|
||||||
actions = self.model.sample_actions(
|
actions = self.model.sample_actions(
|
||||||
images, img_masks, high_level_task, high_level_task_masks,
|
images, img_masks, high_level_task, high_level_task_masks,
|
||||||
@@ -1451,7 +1449,7 @@ class PI05Policy(PreTrainedPolicy):
|
|||||||
valid_tokens = subtask_tokens[i][subtask_masks[i].bool()]
|
valid_tokens = subtask_tokens[i][subtask_masks[i].bool()]
|
||||||
if len(valid_tokens) > 0:
|
if len(valid_tokens) > 0:
|
||||||
decoded_text = self.tokenizer.decode(valid_tokens, skip_special_tokens=True)
|
decoded_text = self.tokenizer.decode(valid_tokens, skip_special_tokens=True)
|
||||||
print(f"[Training] Ground truth subtask {i}: {decoded_text}")
|
# print(f"[Training] Ground truth subtask {i}: {decoded_text}")
|
||||||
|
|
||||||
# Compute loss (no separate state needed for PI05)
|
# Compute loss (no separate state needed for PI05)
|
||||||
# high_level_task = instruction tokens WITHOUT subtask (e.g., "High level task: X; State: Y; Subtask:")
|
# high_level_task = instruction tokens WITHOUT subtask (e.g., "High level task: X; State: Y; Subtask:")
|
||||||
@@ -1461,7 +1459,6 @@ class PI05Policy(PreTrainedPolicy):
|
|||||||
# Extract the total loss
|
# Extract the total loss
|
||||||
loss = loss_dict["loss"]
|
loss = loss_dict["loss"]
|
||||||
|
|
||||||
breakpoint()
|
|
||||||
# Prepare detailed loss dictionary for logging
|
# Prepare detailed loss dictionary for logging
|
||||||
detailed_loss_dict = {
|
detailed_loss_dict = {
|
||||||
"loss": loss.item(),
|
"loss": loss.item(),
|
||||||
|
|||||||
@@ -302,15 +302,17 @@ class TokenizerProcessorStep(ObservationProcessorStep):
|
|||||||
|
|
||||||
def _tokenize_text(self, text: str | list[str]) -> dict[str, torch.Tensor]:
|
def _tokenize_text(self, text: str | list[str]) -> dict[str, torch.Tensor]:
|
||||||
"""
|
"""
|
||||||
A wrapper around the tokenizer call.
|
A wrapper around the tokenizer call that appends an EOS token to each sequence.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: A string or list of strings to tokenize.
|
text: A string or list of strings to tokenize.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A dictionary containing tokenized 'input_ids' and 'attention_mask' as PyTorch tensors.
|
A dictionary containing tokenized 'input_ids' and 'attention_mask' as PyTorch tensors,
|
||||||
|
with EOS token appended at the end of each sequence.
|
||||||
"""
|
"""
|
||||||
return self.input_tokenizer(
|
# Tokenize normally
|
||||||
|
tokenized = self.input_tokenizer(
|
||||||
text,
|
text,
|
||||||
max_length=self.max_length,
|
max_length=self.max_length,
|
||||||
truncation=self.truncation,
|
truncation=self.truncation,
|
||||||
@@ -318,6 +320,34 @@ class TokenizerProcessorStep(ObservationProcessorStep):
|
|||||||
padding_side=self.padding_side,
|
padding_side=self.padding_side,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Get EOS token ID
|
||||||
|
eos_token_id = self.input_tokenizer.eos_token_id
|
||||||
|
if eos_token_id is None:
|
||||||
|
# Some tokenizers don't have an EOS token, skip modification
|
||||||
|
return tokenized
|
||||||
|
|
||||||
|
# Append EOS token to each sequence (before padding)
|
||||||
|
input_ids = tokenized["input_ids"]
|
||||||
|
attention_mask = tokenized["attention_mask"]
|
||||||
|
|
||||||
|
for i in range(input_ids.shape[0]):
|
||||||
|
# Find the position of the last non-padding token
|
||||||
|
non_pad_positions = (attention_mask[i] == 1).nonzero(as_tuple=True)[0]
|
||||||
|
|
||||||
|
if len(non_pad_positions) > 0:
|
||||||
|
last_token_pos = non_pad_positions[-1].item()
|
||||||
|
|
||||||
|
# Check if there's room to add EOS token
|
||||||
|
if last_token_pos + 1 < self.max_length:
|
||||||
|
# Insert EOS token after the last real token
|
||||||
|
input_ids[i, last_token_pos + 1] = eos_token_id
|
||||||
|
attention_mask[i, last_token_pos + 1] = 1
|
||||||
|
else:
|
||||||
|
# If at max length, replace the last token with EOS
|
||||||
|
input_ids[i, last_token_pos] = eos_token_id
|
||||||
|
|
||||||
|
return {"input_ids": input_ids, "attention_mask": attention_mask}
|
||||||
|
|
||||||
def get_config(self) -> dict[str, Any]:
|
def get_config(self) -> dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ def update_policy(
|
|||||||
# Let accelerator handle mixed precision
|
# Let accelerator handle mixed precision
|
||||||
with accelerator.autocast():
|
with accelerator.autocast():
|
||||||
loss, output_dict = policy.forward(batch)
|
loss, output_dict = policy.forward(batch)
|
||||||
|
breakpoint()
|
||||||
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
|
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
|
||||||
|
|
||||||
# Use accelerator's backward method
|
# Use accelerator's backward method
|
||||||
|
|||||||
Reference in New Issue
Block a user