mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 19:19:56 +00:00
Remove the misleading future_action_window_size and just use chunk_size
This commit is contained in:
@@ -210,7 +210,7 @@ class VLAJEPAActionHead(nn.Module):
|
|||||||
inner_dim = num_heads * head_dim # e.g. DiT-B: 12 × 64 = 768
|
inner_dim = num_heads * head_dim # e.g. DiT-B: 12 × 64 = 768
|
||||||
|
|
||||||
self.input_embedding_dim = inner_dim
|
self.input_embedding_dim = inner_dim
|
||||||
self.action_horizon = config.future_action_window_size + 1
|
self.action_horizon = config.chunk_size
|
||||||
self.num_inference_timesteps = config.num_inference_timesteps
|
self.num_inference_timesteps = config.num_inference_timesteps
|
||||||
|
|
||||||
self.model = DiT(
|
self.model = DiT(
|
||||||
|
|||||||
@@ -36,8 +36,7 @@ class VLAJEPAConfig(PreTrainedConfig):
|
|||||||
|
|
||||||
action_dim: int = 7
|
action_dim: int = 7
|
||||||
state_dim: int = 8
|
state_dim: int = 8
|
||||||
future_action_window_size: int = 6
|
|
||||||
past_action_window_size: int = 0
|
|
||||||
num_action_tokens_per_timestep: int = 8
|
num_action_tokens_per_timestep: int = 8
|
||||||
num_embodied_action_tokens_per_instruction: int = 32
|
num_embodied_action_tokens_per_instruction: int = 32
|
||||||
num_inference_timesteps: int = 4
|
num_inference_timesteps: int = 4
|
||||||
@@ -82,8 +81,6 @@ class VLAJEPAConfig(PreTrainedConfig):
|
|||||||
self.enable_world_model = False
|
self.enable_world_model = False
|
||||||
if self.n_action_steps > self.chunk_size:
|
if self.n_action_steps > self.chunk_size:
|
||||||
raise ValueError("`n_action_steps` must be <= `chunk_size`.")
|
raise ValueError("`n_action_steps` must be <= `chunk_size`.")
|
||||||
if self.future_action_window_size + 1 > self.chunk_size:
|
|
||||||
raise ValueError("`chunk_size` must cover the predicted action horizon.")
|
|
||||||
if self.num_video_frames < 2 * self.jepa_tubelet_size:
|
if self.num_video_frames < 2 * self.jepa_tubelet_size:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"`video_horizon` ({self.num_video_frames}) must be >= 2 * `jepa_tubelet_size` "
|
f"`video_horizon` ({self.num_video_frames}) must be >= 2 * `jepa_tubelet_size` "
|
||||||
|
|||||||
@@ -62,7 +62,6 @@ _ARCH = {
|
|||||||
"qwen_model_name": "Qwen/Qwen3-VL-2B-Instruct", # 2B, NOT the default 4B
|
"qwen_model_name": "Qwen/Qwen3-VL-2B-Instruct", # 2B, NOT the default 4B
|
||||||
"chunk_size": 7,
|
"chunk_size": 7,
|
||||||
"n_action_steps": 7,
|
"n_action_steps": 7,
|
||||||
"future_action_window_size": 6,
|
|
||||||
"num_video_frames": 8,
|
"num_video_frames": 8,
|
||||||
"jepa_tubelet_size": 2,
|
"jepa_tubelet_size": 2,
|
||||||
"num_action_tokens_per_timestep": 8,
|
"num_action_tokens_per_timestep": 8,
|
||||||
|
|||||||
@@ -236,7 +236,7 @@ class VLAJEPAModel(nn.Module):
|
|||||||
actions_tensor = torch.tensor(
|
actions_tensor = torch.tensor(
|
||||||
np.array(actions), device=last_hidden.device, dtype=torch.float32
|
np.array(actions), device=last_hidden.device, dtype=torch.float32
|
||||||
) # [B, T_full, action_dim]
|
) # [B, T_full, action_dim]
|
||||||
action_horizon = self.config.future_action_window_size + 1
|
action_horizon = self.config.chunk_size
|
||||||
actions_target = actions_tensor[:, -action_horizon:, :]
|
actions_target = actions_tensor[:, -action_horizon:, :]
|
||||||
|
|
||||||
state_tensor = None
|
state_tensor = None
|
||||||
|
|||||||
@@ -62,7 +62,6 @@ def make_config(
|
|||||||
device="cpu",
|
device="cpu",
|
||||||
chunk_size=action_horizon,
|
chunk_size=action_horizon,
|
||||||
n_action_steps=min(N_ACTION_STEPS, action_horizon),
|
n_action_steps=min(N_ACTION_STEPS, action_horizon),
|
||||||
future_action_window_size=action_horizon - 1,
|
|
||||||
action_dim=action_dim,
|
action_dim=action_dim,
|
||||||
state_dim=state_dim,
|
state_dim=state_dim,
|
||||||
num_video_frames=num_video_frames,
|
num_video_frames=num_video_frames,
|
||||||
|
|||||||
@@ -18,12 +18,7 @@ def test_delta_indices() -> None:
|
|||||||
|
|
||||||
def test_n_action_steps_exceeds_chunk_size_raises() -> None:
|
def test_n_action_steps_exceeds_chunk_size_raises() -> None:
|
||||||
with pytest.raises(ValueError, match="n_action_steps"):
|
with pytest.raises(ValueError, match="n_action_steps"):
|
||||||
VLAJEPAConfig(chunk_size=4, n_action_steps=8, future_action_window_size=3)
|
VLAJEPAConfig(chunk_size=4, n_action_steps=8)
|
||||||
|
|
||||||
|
|
||||||
def test_future_window_exceeds_chunk_size_raises() -> None:
|
|
||||||
with pytest.raises(ValueError, match="predicted action horizon"):
|
|
||||||
VLAJEPAConfig(chunk_size=4, n_action_steps=4, future_action_window_size=4)
|
|
||||||
|
|
||||||
|
|
||||||
def test_too_few_video_frames_raises() -> None:
|
def test_too_few_video_frames_raises() -> None:
|
||||||
@@ -31,7 +26,6 @@ def test_too_few_video_frames_raises() -> None:
|
|||||||
VLAJEPAConfig(
|
VLAJEPAConfig(
|
||||||
chunk_size=16,
|
chunk_size=16,
|
||||||
n_action_steps=16,
|
n_action_steps=16,
|
||||||
future_action_window_size=15,
|
|
||||||
num_video_frames=2,
|
num_video_frames=2,
|
||||||
jepa_tubelet_size=2, # needs >= 4 frames (2 for current, 2 for future) to have a window of size > 0
|
jepa_tubelet_size=2, # needs >= 4 frames (2 for current, 2 for future) to have a window of size > 0
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user