mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-15 16:49:55 +00:00
smaller model
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -13,7 +13,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import logging
|
||||
from pprint import pformat
|
||||
|
||||
import torch
|
||||
@@ -87,7 +86,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
|
||||
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
|
||||
)
|
||||
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
|
||||
|
||||
|
||||
# Handle percentage parameter
|
||||
episodes = cfg.dataset.episodes
|
||||
if cfg.dataset.percentage is not None:
|
||||
@@ -96,8 +95,11 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
|
||||
num_episodes_to_use = max(1, int(total_episodes * cfg.dataset.percentage / 100))
|
||||
episodes = list(range(num_episodes_to_use))
|
||||
import logging
|
||||
logging.info(f"Using {cfg.dataset.percentage}% of dataset: {num_episodes_to_use}/{total_episodes} episodes")
|
||||
|
||||
|
||||
logging.info(
|
||||
f"Using {cfg.dataset.percentage}% of dataset: {num_episodes_to_use}/{total_episodes} episodes"
|
||||
)
|
||||
|
||||
dataset = LeRobotDataset(
|
||||
cfg.dataset.repo_id,
|
||||
root=cfg.dataset.root,
|
||||
|
||||
@@ -16,11 +16,11 @@ from .act.configuration_act import ACTConfig as ACTConfig
|
||||
from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig
|
||||
from .pi0.configuration_pi0 import PI0Config as PI0Config
|
||||
from .pi0.processor_pi0 import Pi0NewLineProcessor
|
||||
from .rlearn.configuration_rlearn import RLearNConfig as RLearNConfig
|
||||
from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig
|
||||
from .smolvla.processor_smolvla import SmolVLANewLineProcessor
|
||||
from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig
|
||||
from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig
|
||||
from .rlearn.configuration_rlearn import RLearNConfig as RLearNConfig
|
||||
|
||||
__all__ = [
|
||||
"ACTConfig",
|
||||
|
||||
@@ -301,7 +301,7 @@ def make_policy(
|
||||
cfg.output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
|
||||
cfg.input_features = {key: ft for key, ft in features.items() if key not in cfg.output_features}
|
||||
kwargs["config"] = cfg
|
||||
|
||||
|
||||
# Pass episode_data_index for RLearN policy to calculate proper progress
|
||||
if cfg.type == "rlearn" and episode_data_index is not None:
|
||||
kwargs["episode_data_index"] = episode_data_index
|
||||
|
||||
@@ -38,7 +38,7 @@ class RLearNConfig(PreTrainedConfig):
|
||||
"""
|
||||
|
||||
# Encoders
|
||||
model_name: str = "google/siglip2-large-patch16-256"
|
||||
model_name: str = "google/siglip2-base-patch16-256"
|
||||
freeze_backbones: bool = True
|
||||
|
||||
# Temporal aggregator
|
||||
@@ -61,12 +61,12 @@ class RLearNConfig(PreTrainedConfig):
|
||||
# Training
|
||||
learning_rate: float = 1e-4
|
||||
weight_decay: float = 0.01
|
||||
|
||||
|
||||
# ReWiND-specific parameters
|
||||
use_video_rewind: bool = True # Enable video rewinding augmentation
|
||||
rewind_prob: float = 0.5 # Probability of applying rewind to each batch
|
||||
use_mismatch_loss: bool = True # Enable mismatched language-video loss
|
||||
|
||||
|
||||
# Loss hyperparameters (simplified for ReWiND)
|
||||
# The main loss is just MSE between predicted and target progress
|
||||
|
||||
|
||||
@@ -242,6 +242,9 @@ class RLearnEvaluator:
|
||||
def predict_episode_rewards(self, frames: Tensor, language: str, batch_size: int = 16) -> np.ndarray:
|
||||
"""
|
||||
Predict rewards for a single episode using proper temporal sequences.
|
||||
|
||||
Note: With ReWiND loss, the model predicts progress values (0-1) across episodes,
|
||||
which serve as dense reward signals for policy learning.
|
||||
|
||||
Args:
|
||||
frames: Video frames tensor of shape (T, C, H, W)
|
||||
@@ -249,7 +252,7 @@ class RLearnEvaluator:
|
||||
batch_size: Maximum number of temporal sequences to process at once
|
||||
|
||||
Returns:
|
||||
Predicted rewards array of shape (T,)
|
||||
Predicted progress/rewards array of shape (T,) with values typically in [0, 1]
|
||||
"""
|
||||
T = frames.shape[0]
|
||||
max_seq_len = self.model.config.max_seq_len
|
||||
@@ -260,7 +263,7 @@ class RLearnEvaluator:
|
||||
# Create temporal sequences for each frame
|
||||
# For frame i, we want frames [i-max_seq_len+1, ..., i-1, i]
|
||||
temporal_sequences = []
|
||||
|
||||
|
||||
for i in range(T):
|
||||
# Create sequence ending at frame i
|
||||
seq_frames = []
|
||||
@@ -268,14 +271,14 @@ class RLearnEvaluator:
|
||||
# Use frame j if available, otherwise repeat the first available frame
|
||||
frame_idx = max(0, min(j, T - 1))
|
||||
seq_frames.append(processed_frames[frame_idx])
|
||||
|
||||
|
||||
# Pad sequence to max_seq_len by repeating the first frame if needed
|
||||
while len(seq_frames) < max_seq_len:
|
||||
seq_frames.insert(0, seq_frames[0]) # Prepend first frame
|
||||
|
||||
|
||||
# Take only the last max_seq_len frames if we have too many
|
||||
seq_frames = seq_frames[-max_seq_len:]
|
||||
|
||||
|
||||
temporal_sequences.append(torch.stack(seq_frames)) # (max_seq_len, C, H, W)
|
||||
|
||||
# Stack all temporal sequences: (T, max_seq_len, C, H, W)
|
||||
@@ -286,7 +289,7 @@ class RLearnEvaluator:
|
||||
for i in range(0, T, batch_size):
|
||||
end_idx = min(i + batch_size, T)
|
||||
batch_sequences = all_sequences[i:end_idx].to(self.device) # (B, max_seq_len, C, H, W)
|
||||
|
||||
|
||||
# Create batch for model
|
||||
batch = {
|
||||
OBS_IMAGES: batch_sequences, # (B, T, C, H, W) format expected by model
|
||||
@@ -295,13 +298,13 @@ class RLearnEvaluator:
|
||||
|
||||
# Predict rewards - model returns (B, T') but we want the last timestep for each sequence
|
||||
values = self.model.predict_rewards(batch) # (B, T')
|
||||
|
||||
|
||||
# Take the last timestep prediction for each sequence (represents current frame reward)
|
||||
if values.dim() == 2:
|
||||
batch_rewards = values[:, -1].cpu().numpy() # (B,) - last timestep
|
||||
else:
|
||||
batch_rewards = values.cpu().numpy() # (B,) - already single timestep
|
||||
|
||||
|
||||
rewards.extend(batch_rewards)
|
||||
|
||||
return np.array(rewards[:T]) # Ensure exact length
|
||||
@@ -570,6 +573,83 @@ class RLearnEvaluator:
|
||||
|
||||
return detection_results
|
||||
|
||||
def evaluate_rewind_progress(self, dataset, num_episodes: int = 100) -> dict[str, Any]:
    """Evaluate ReWiND-specific properties of the predicted progress signal.

    A random subset of episodes is drawn (without replacement), up to 20
    evenly spaced frames are sampled from each one, and the frames are fed
    to ``self.predict_episode_rewards``. Three properties are checked:

    1. Progress values are in the [0, 1] range.
    2. Progress increases monotonically (or mostly).
    3. First frames have low progress, last frames have high progress.

    Args:
        dataset: Object exposing ``meta.episodes``, an ``episode_data_index``
            mapping with ``"from"`` / ``"to"`` entries whose elements support
            ``.item()``, and integer indexing that returns a dict per frame.
        num_episodes: Maximum number of episodes to sample.

    Returns:
        A dict with the raw per-episode lists (``monotonicity_scores``,
        ``start_progress_values``, ``end_progress_values``), the counters
        ``progress_range_violations`` and ``episodes_evaluated`` and, when at
        least one episode was evaluated, the aggregates ``mean_monotonicity``,
        ``mean_start_progress``, ``mean_end_progress`` and
        ``progress_increase``. All numbers are plain Python ``int`` / ``float``
        so the dict can be serialised directly.
    """
    total_episodes = len(dataset.meta.episodes)
    episodes = np.random.choice(total_episodes, min(num_episodes, total_episodes), replace=False)

    results: dict[str, Any] = {
        "progress_range_violations": 0,
        "monotonicity_scores": [],
        "start_progress_values": [],
        "end_progress_values": [],
        "episodes_evaluated": 0,
    }

    for ep_idx in episodes:
        # np.random.choice yields numpy integers; use a plain int for indexing and reporting.
        ep_idx = int(ep_idx)
        try:
            # Episode boundaries in the flat dataset ("to" is exclusive).
            ep_start = dataset.episode_data_index["from"][ep_idx].item()
            ep_end = dataset.episode_data_index["to"][ep_idx].item()

            # Sample at most 20 evenly spaced frames from the episode.
            sample_indices = np.linspace(ep_start, ep_end - 1, min(20, ep_end - ep_start), dtype=int)

            frames = []
            language = ""
            for position, idx in enumerate(sample_indices):
                item = dataset[int(idx)]
                if position == 0:
                    # The first sampled index is ep_start, so the task string can be
                    # read here instead of loading dataset[ep_start] a second time.
                    language = item.get("task", "")

                if OBS_IMAGES in item:
                    frames.append(item[OBS_IMAGES])
                elif OBS_IMAGE in item:
                    frames.append(item[OBS_IMAGE])

            # Start/end comparison and monotonicity need at least two frames.
            if len(frames) < 2:
                continue

            frames = torch.stack(frames)

            # Predict rewards/progress
            progress = self.predict_episode_rewards(frames, language)

            # Count predictions that fall outside [0, 1].
            results["progress_range_violations"] += int(np.sum((progress < 0) | (progress > 1)))

            # Monotonicity: fraction of non-decreasing consecutive steps.
            if len(progress) > 1:
                diffs = np.diff(progress)
                results["monotonicity_scores"].append(float(np.mean(diffs >= 0)))

            # Record start/end values
            results["start_progress_values"].append(float(progress[0]))
            results["end_progress_values"].append(float(progress[-1]))
            results["episodes_evaluated"] += 1

        except Exception as e:
            # Best-effort evaluation: report the failing episode and keep going.
            print(f"Error evaluating episode {ep_idx}: {e}")
            continue

    # Summarize results
    if results["episodes_evaluated"] > 0:
        results["mean_monotonicity"] = float(np.mean(results["monotonicity_scores"]))
        results["mean_start_progress"] = float(np.mean(results["start_progress_values"]))
        results["mean_end_progress"] = float(np.mean(results["end_progress_values"]))
        results["progress_increase"] = results["mean_end_progress"] - results["mean_start_progress"]

    return results
|
||||
|
||||
def comprehensive_evaluation(
|
||||
self,
|
||||
dataset,
|
||||
|
||||
@@ -35,7 +35,7 @@ High-level Architecture
|
||||
+------------------------------+
|
||||
| Vision Encoder (frozen) | e.g. SigLIP2 vision tower
|
||||
+------------------------------+
|
||||
|
|
||||
|
|
||||
| pooled per-frame embeddings (BT, H_v)
|
||||
v
|
||||
reshape -> (B, T, H_v) -- Linear proj --> (B, T, D)
|
||||
@@ -214,7 +214,7 @@ class RLearNPolicy(PreTrainedPolicy):
|
||||
)
|
||||
lang_emb = self.text_proj(lang_emb) # (B, D)
|
||||
|
||||
# ---- NEW: use the HF processor to standardize size & normalization ----
|
||||
# Use the HF processor to standardize size & normalization
|
||||
# Flatten (B, T_eff, C, H, W) -> (BT, C, H, W)
|
||||
BT = B * T_eff
|
||||
flat = frames.reshape(BT, C, H, W).detach().cpu()
|
||||
@@ -230,7 +230,6 @@ class RLearNPolicy(PreTrainedPolicy):
|
||||
|
||||
proc_out = self.processor(images=images, return_tensors="pt")
|
||||
pixel_values = proc_out["pixel_values"].to(next(self.vision_encoder.parameters()).device)
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
# Encode frames through visual tower per frame
|
||||
vision_outputs = self.vision_encoder(pixel_values=pixel_values)
|
||||
@@ -276,7 +275,7 @@ class RLearNPolicy(PreTrainedPolicy):
|
||||
Expected batch keys:
|
||||
- OBS_IMAGES: list[Tensor] of shape [(B, C, H, W), ...] per time step or stacked (B, T, C, H, W)
|
||||
- OBS_LANGUAGE: optional string tokens already tokenized externally or raw strings
|
||||
|
||||
|
||||
Note: Progress labels (0 to 1) are generated automatically for each episode.
|
||||
No REWARD key is needed in the batch.
|
||||
"""
|
||||
@@ -362,42 +361,42 @@ class RLearNPolicy(PreTrainedPolicy):
|
||||
# Generate progress labels on-the-fly (ReWiND approach)
|
||||
# IMPORTANT: Progress should be 0-1 across the ENTIRE EPISODE, not just the temporal window
|
||||
loss_dict: dict[str, float] = {}
|
||||
|
||||
|
||||
# Check if video rewinding already set the target
|
||||
if self.training and self.config.use_video_rewind and 'augmented_target' in locals():
|
||||
if self.training and self.config.use_video_rewind and "augmented_target" in locals():
|
||||
# Use the augmented target from video rewinding
|
||||
target = augmented_target
|
||||
else:
|
||||
# Calculate true episode progress using episode_index and frame_index from batch
|
||||
if "episode_index" in batch and "frame_index" in batch and hasattr(self, 'episode_data_index'):
|
||||
if "episode_index" in batch and "frame_index" in batch and hasattr(self, "episode_data_index"):
|
||||
# Get episode indices and frame indices from batch
|
||||
episode_indices = batch["episode_index"] # Shape: (B,)
|
||||
frame_indices = batch["frame_index"] # Shape: (B,)
|
||||
|
||||
|
||||
# Calculate progress for the current frame in each sample
|
||||
progress_values = []
|
||||
|
||||
|
||||
for b_idx in range(B):
|
||||
ep_idx = episode_indices[b_idx].item()
|
||||
frame_idx = frame_indices[b_idx].item()
|
||||
|
||||
|
||||
# Get episode boundaries
|
||||
ep_start = self.episode_data_index["from"][ep_idx].item()
|
||||
ep_end = self.episode_data_index["to"][ep_idx].item()
|
||||
ep_length = ep_end - ep_start
|
||||
|
||||
|
||||
# Progress from 0 to 1 within the episode
|
||||
# frame_index is relative to the episode (0-based within episode)
|
||||
progress = frame_idx / max(1, ep_length - 1)
|
||||
progress_values.append(progress)
|
||||
|
||||
|
||||
# Create progress tensor for the current frame (last in temporal sequence)
|
||||
current_progress = torch.tensor(progress_values, device=values.device, dtype=values.dtype)
|
||||
|
||||
|
||||
# Now calculate progress for ALL frames in the temporal window
|
||||
# The observation_delta_indices tell us which frames we're looking at
|
||||
delta_indices = self.config.observation_delta_indices # e.g., [-15, -14, ..., 0]
|
||||
|
||||
|
||||
# Calculate progress for each frame in the temporal window
|
||||
all_progress = []
|
||||
for delta in delta_indices:
|
||||
@@ -406,42 +405,44 @@ class RLearNPolicy(PreTrainedPolicy):
|
||||
for b_idx in range(B):
|
||||
ep_idx = episode_indices[b_idx].item()
|
||||
frame_idx = frame_indices[b_idx].item()
|
||||
|
||||
|
||||
# Calculate the actual frame index with delta
|
||||
target_frame_idx = frame_idx + delta
|
||||
|
||||
|
||||
# Get episode boundaries
|
||||
ep_start = self.episode_data_index["from"][ep_idx].item()
|
||||
ep_end = self.episode_data_index["to"][ep_idx].item()
|
||||
ep_length = ep_end - ep_start
|
||||
|
||||
|
||||
# Clamp to episode boundaries (frame_index is relative to episode)
|
||||
target_frame_idx = max(0, min(ep_length - 1, target_frame_idx))
|
||||
|
||||
|
||||
# Calculate progress for this frame
|
||||
prog = target_frame_idx / max(1, ep_length - 1)
|
||||
frame_progress.append(prog)
|
||||
|
||||
all_progress.append(torch.tensor(frame_progress, device=values.device, dtype=values.dtype))
|
||||
|
||||
|
||||
all_progress.append(
|
||||
torch.tensor(frame_progress, device=values.device, dtype=values.dtype)
|
||||
)
|
||||
|
||||
# Stack to get (B, T) tensor where T is the temporal sequence length
|
||||
target = torch.stack(all_progress, dim=1) # (B, max_seq_len)
|
||||
|
||||
|
||||
# Apply stride/dropout indexing to match the processed frames
|
||||
target = target[:, idx]
|
||||
|
||||
elif "index" in batch and hasattr(self, 'episode_data_index'):
|
||||
|
||||
elif "index" in batch and hasattr(self, "episode_data_index"):
|
||||
# Fallback: Use global index if available
|
||||
global_indices = batch["index"] # Shape: (B,)
|
||||
|
||||
|
||||
# For each index, find which episode it belongs to and its position
|
||||
progress_values = []
|
||||
|
||||
|
||||
for global_idx in global_indices:
|
||||
# Find which episode this index belongs to
|
||||
episode_starts = self.episode_data_index["from"]
|
||||
episode_ends = self.episode_data_index["to"]
|
||||
|
||||
|
||||
# Find the episode by checking which range the index falls into
|
||||
episode_idx = None
|
||||
frame_in_episode = None
|
||||
@@ -450,30 +451,32 @@ class RLearNPolicy(PreTrainedPolicy):
|
||||
episode_idx = ep_idx
|
||||
frame_in_episode = global_idx.item() - episode_starts[ep_idx].item()
|
||||
break
|
||||
|
||||
|
||||
if episode_idx is not None:
|
||||
# Calculate position within episode
|
||||
ep_start = episode_starts[episode_idx].item()
|
||||
ep_end = episode_ends[episode_idx].item()
|
||||
ep_length = ep_end - ep_start
|
||||
|
||||
|
||||
# Progress from 0 to 1 within the episode
|
||||
progress = frame_in_episode / max(1, ep_length - 1)
|
||||
else:
|
||||
# Fallback if we can't find the episode (shouldn't happen)
|
||||
progress = 0.5
|
||||
|
||||
|
||||
progress_values.append(progress)
|
||||
|
||||
|
||||
# For temporal window, use simplified linear progress
|
||||
# (proper calculation would need all frame indices in the window)
|
||||
T_effective = len(idx)
|
||||
target = torch.tensor(progress_values, device=values.device, dtype=values.dtype)
|
||||
target = target.unsqueeze(1).expand(B, T_effective) # Simple expansion
|
||||
|
||||
|
||||
else:
|
||||
raise ValueError("No episode information found in batch. Please ensure 'episode_index' and 'frame_index' keys are present.")
|
||||
|
||||
raise ValueError(
|
||||
"No episode information found in batch. Please ensure 'episode_index' and 'frame_index' keys are present."
|
||||
)
|
||||
|
||||
# During inference, we might not want to compute loss
|
||||
if not self.training and target is None:
|
||||
loss = values.mean() * 0.0
|
||||
@@ -482,25 +485,25 @@ class RLearNPolicy(PreTrainedPolicy):
|
||||
|
||||
# ReWiND Loss (following the paper exactly)
|
||||
# The core loss is progress regression with video rewinding augmentation
|
||||
|
||||
|
||||
# 1) Main progress regression loss for matched sequences
|
||||
# Target should be normalized progress from 0 to 1 (t/T)
|
||||
L_progress = F.mse_loss(values, target)
|
||||
|
||||
|
||||
# 2) Mismatched video-language pairs should predict zero progress
|
||||
L_mismatch = torch.zeros((), device=values.device)
|
||||
if self.training and self.config.use_mismatch_loss and values.size(0) > 1:
|
||||
# Randomly shuffle language instructions within the batch
|
||||
shuffled_indices = torch.randperm(B, device=values.device)
|
||||
lang_mismatch = lang_emb[shuffled_indices]
|
||||
|
||||
|
||||
# Forward pass with mismatched language
|
||||
mismatch_feat = self.temporal(visual_seq, lang_mismatch, return_features=True)
|
||||
mismatch_values = self.head(mismatch_feat).squeeze(-1)
|
||||
|
||||
|
||||
# Mismatched pairs should predict zero progress
|
||||
L_mismatch = F.mse_loss(mismatch_values, torch.zeros_like(target))
|
||||
|
||||
|
||||
# Total loss is just progress regression (rewinding is handled via data augmentation)
|
||||
loss = L_progress + L_mismatch
|
||||
|
||||
@@ -720,7 +723,7 @@ def encode_language(
|
||||
|
||||
def apply_video_rewind(frames: Tensor, rewind_prob: float = 0.5) -> tuple[Tensor, Tensor]:
|
||||
"""Apply video rewinding augmentation as described in ReWiND paper.
|
||||
|
||||
|
||||
Each video in the batch has an independent chance of being rewound.
|
||||
|
||||
Args:
|
||||
@@ -732,61 +735,61 @@ def apply_video_rewind(frames: Tensor, rewind_prob: float = 0.5) -> tuple[Tensor
|
||||
"""
|
||||
B, T, C, H, W = frames.shape
|
||||
device = frames.device
|
||||
|
||||
|
||||
# Create default progress labels (linearly increasing from 0 to 1)
|
||||
default_progress = torch.linspace(0, 1, T, device=device).unsqueeze(0).expand(B, -1)
|
||||
|
||||
|
||||
# Apply rewind augmentation to each sample in batch independently
|
||||
augmented_frames = []
|
||||
augmented_progress = []
|
||||
|
||||
|
||||
for b in range(B):
|
||||
# Each video has independent chance of being rewound
|
||||
should_rewind = torch.rand(1).item() < rewind_prob
|
||||
|
||||
|
||||
if not should_rewind or T < 3:
|
||||
# Keep original sequence
|
||||
augmented_frames.append(frames[b])
|
||||
augmented_progress.append(default_progress[b])
|
||||
continue
|
||||
|
||||
|
||||
# Apply rewinding to this video
|
||||
# Split point i: between frame 2 and T-1
|
||||
i = torch.randint(2, T, (1,)).item()
|
||||
|
||||
|
||||
# Rewind length k: between 1 and i-1 frames
|
||||
k = torch.randint(1, min(i, T - i + 1), (1,)).item()
|
||||
|
||||
|
||||
# Create rewound sequence: o1...oi, oi-1, ..., oi-k
|
||||
forward_frames = frames[b, :i] # Frames up to split point
|
||||
reverse_frames = frames[b, max(0, i-k):i].flip(dims=[0]) # Reversed frames
|
||||
|
||||
reverse_frames = frames[b, max(0, i - k) : i].flip(dims=[0]) # Reversed frames
|
||||
|
||||
# Concatenate forward and reverse parts
|
||||
rewound_seq = torch.cat([forward_frames, reverse_frames], dim=0)
|
||||
|
||||
|
||||
# Pad with zeros if needed to maintain shape
|
||||
if rewound_seq.shape[0] < T:
|
||||
padding = torch.zeros(T - rewound_seq.shape[0], C, H, W, device=device)
|
||||
rewound_seq = torch.cat([rewound_seq, padding], dim=0)
|
||||
elif rewound_seq.shape[0] > T:
|
||||
rewound_seq = rewound_seq[:T]
|
||||
|
||||
|
||||
# Create corresponding progress labels
|
||||
# Forward part: increasing progress
|
||||
forward_progress = torch.linspace(0, i/T, i, device=device)
|
||||
# Reverse part: decreasing progress
|
||||
reverse_progress = torch.linspace(i/T, max(0, (i-k)/T), k, device=device)
|
||||
|
||||
forward_progress = torch.linspace(0, i / T, i, device=device)
|
||||
# Reverse part: decreasing progress
|
||||
reverse_progress = torch.linspace(i / T, max(0, (i - k) / T), k, device=device)
|
||||
|
||||
rewound_progress = torch.cat([forward_progress, reverse_progress])
|
||||
|
||||
|
||||
# Pad progress if needed
|
||||
if rewound_progress.shape[0] < T:
|
||||
padding = torch.zeros(T - rewound_progress.shape[0], device=device)
|
||||
rewound_progress = torch.cat([rewound_progress, padding])
|
||||
elif rewound_progress.shape[0] > T:
|
||||
rewound_progress = rewound_progress[:T]
|
||||
|
||||
|
||||
augmented_frames.append(rewound_seq)
|
||||
augmented_progress.append(rewound_progress)
|
||||
|
||||
|
||||
return torch.stack(augmented_frames), torch.stack(augmented_progress)
|
||||
|
||||
@@ -125,18 +125,18 @@ Default weights: $\lambda_{\text{prog}}=1.0$, $\lambda_{\text{spatial-nce}}=0.5$
|
||||
- Do first training [x]
|
||||
- Implement on-the-fly progress label generation (no need for pre-annotated rewards) [x]
|
||||
- Try different losses
|
||||
- Only rewind loss [x]
|
||||
- Convert python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=IPEC-COMMUNITY/bc_z_lerobot
|
||||
- Only rewind loss [x]
|
||||
- Test only rewind loss (evaluate) []
|
||||
- Check rewind implementation by hand []
|
||||
- Only vlc loss then eval []
|
||||
- Vlc + rewind loss then eval []
|
||||
- Cleanup code []
|
||||
- Convert python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=IPEC-COMMUNITY/bc_z_lerobot and train on 1 percent
|
||||
- Then on 10 percent
|
||||
- Try DINO v3 as encoder Base 86 M: https://huggingface.co/facebook/dinov3-vitb16-pretrain-lvd1689m with HuggingFaceTB/SmolLM2-135M-Instruct ? []
|
||||
- Add more artificial text to dataset generated by vlm (google gemini) []
|
||||
- See google gemini vlm caption [] https://gemini.google.com/app/7e332ffaf32580f2
|
||||
- Multiple captions per video, create a method to generate as much data as possible, etc. [] https://arxiv.org/abs/2508.13446, https://arxiv.org/pdf/2412.04453
|
||||
- How can we improve spatial aware learning? co generating captions for each frame with language decoder?
|
||||
- Add droid []
|
||||
- Extend evaluation []
|
||||
- Add other dataset mentioned above []
|
||||
- Add other datasets mentioned above []
|
||||
|
||||
@@ -137,7 +137,7 @@ def train(cfg: TrainPipelineConfig):
|
||||
|
||||
logging.info("Creating policy")
|
||||
# Pass episode_data_index for RLearN to calculate proper progress
|
||||
episode_data_index = dataset.episode_data_index if hasattr(dataset, 'episode_data_index') else None
|
||||
episode_data_index = dataset.episode_data_index if hasattr(dataset, "episode_data_index") else None
|
||||
policy = make_policy(
|
||||
cfg=cfg.policy,
|
||||
ds_meta=dataset.meta,
|
||||
|
||||
@@ -1,57 +1,57 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from lerobot.policies.rlearn.configuration_rlearn import RLearNConfig
|
||||
from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy
|
||||
from lerobot.policies.rlearn.evaluation import RLearnEvaluator
|
||||
from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy
|
||||
|
||||
|
||||
def test_temporal_evaluation():
    """Test that evaluation creates proper temporal sequences with past frames.

    Builds a small, randomly initialised RLearN model, runs
    ``RLearnEvaluator.predict_episode_rewards`` on an episode longer than
    ``max_seq_len`` and on one shorter than it, and checks that exactly one
    reward is returned per input frame in both cases.
    """
    # Create a simple config (sizes kept small for testing)
    config = RLearNConfig(
        max_seq_len=4,  # Small for testing
        dim_model=64,  # Small for testing
        n_heads=2,
        n_layers=2,
    )

    # Create model (will be randomly initialized)
    model = RLearNPolicy(config)
    model.eval()

    # Create evaluator
    evaluator = RLearnEvaluator(model, device="cpu")

    # Create test episode: 8 frames of 3x64x64 images
    T, C, H, W = 8, 3, 64, 64
    frames = torch.randn(T, C, H, W)
    language = "test instruction"

    print(f"Input episode shape: {frames.shape}")
    print(f"Model expects sequences of length: {config.max_seq_len}")

    # Test the evaluation
    rewards = evaluator.predict_episode_rewards(frames, language, batch_size=4)

    print(f"Output rewards shape: {rewards.shape}")
    print(f"Rewards: {rewards}")

    # Verify we get one reward per frame
    assert len(rewards) == T, f"Expected {T} rewards, got {len(rewards)}"

    print("✅ Test passed! Evaluation correctly processes temporal sequences.")

    # Test with very short episode (shorter than max_seq_len)
    short_frames = torch.randn(2, C, H, W)  # Only 2 frames
    short_rewards = evaluator.predict_episode_rewards(short_frames, language)

    print(f"\nShort episode shape: {short_frames.shape}")
    print(f"Short rewards shape: {short_rewards.shape}")
    assert len(short_rewards) == 2, f"Expected 2 rewards, got {len(short_rewards)}"

    print("✅ Short episode test passed!")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user