smaller model

2026-07-25 10:46:01 +00:00 · 2025-08-28 17:43:03 +02:00
parent c877e98658
commit bead25a58a
10 changed files with 235 additions and 167 deletions
@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import logging
 from pprint import pformat

 import torch
@@ -96,7 +95,10 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
            num_episodes_to_use = max(1, int(total_episodes * cfg.dataset.percentage / 100))
            episodes = list(range(num_episodes_to_use))
            import logging
-            logging.info(f"Using {cfg.dataset.percentage}% of dataset: {num_episodes_to_use}/{total_episodes} episodes")
+
+            logging.info(
+                f"Using {cfg.dataset.percentage}% of dataset: {num_episodes_to_use}/{total_episodes} episodes"
+            )

        dataset = LeRobotDataset(
            cfg.dataset.repo_id,
@@ -16,11 +16,11 @@ from .act.configuration_act import ACTConfig as ACTConfig
 from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig
 from .pi0.configuration_pi0 import PI0Config as PI0Config
 from .pi0.processor_pi0 import Pi0NewLineProcessor
+from .rlearn.configuration_rlearn import RLearNConfig as RLearNConfig
 from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig
 from .smolvla.processor_smolvla import SmolVLANewLineProcessor
 from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig
 from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig
-from .rlearn.configuration_rlearn import RLearNConfig as RLearNConfig

 __all__ = [
    "ACTConfig",
@@ -38,7 +38,7 @@ class RLearNConfig(PreTrainedConfig):
    """

    # Encoders
-    model_name: str = "google/siglip2-large-patch16-256"
+    model_name: str = "google/siglip2-base-patch16-256"
    freeze_backbones: bool = True

    # Temporal aggregator
@@ -243,13 +243,16 @@ class RLearnEvaluator:
        """
        Predict rewards for a single episode using proper temporal sequences.
        
+        Note: With ReWiND loss, the model predicts progress values (0-1) across episodes,
+        which serve as dense reward signals for policy learning.
+
        Args:
            frames: Video frames tensor of shape (T, C, H, W)
            language: Language instruction string
            batch_size: Maximum number of temporal sequences to process at once

        Returns:
-            Predicted rewards array of shape (T,)
+            Predicted progress/rewards array of shape (T,) with values typically in [0, 1]
        """
        T = frames.shape[0]
        max_seq_len = self.model.config.max_seq_len
@@ -570,6 +573,83 @@ class RLearnEvaluator:

        return detection_results

+    def evaluate_rewind_progress(
+        self, dataset, num_episodes: int = 100
+    ) -> dict[str, Any]:
+        """
+        Evaluate ReWiND-specific progress properties.
+        
+        Checks:
+        1. Progress values are in [0, 1] range
+        2. Progress increases monotonically (or mostly)
+        3. First frames have low progress, last frames have high progress
+
+        Episodes are drawn at random without replacement, and at most 20 evenly
+        spaced frames per episode are passed to `predict_episode_rewards`.
+        An episode that raises any exception is reported with `print` and skipped.
+
+        Args:
+            dataset: Dataset to evaluate on. This method reads `dataset.meta.episodes`,
+                `dataset.episode_data_index["from"]` / `["to"]`, and indexes
+                `dataset[idx]` expecting a dict-like item.
+            num_episodes: Maximum number of episodes to sample.
+
+        Returns:
+            Dict with keys:
+            - "progress_range_violations": number of predicted values outside [0, 1],
+              summed over all evaluated episodes
+            - "monotonicity_scores": per-episode fraction of non-decreasing steps
+            - "start_progress_values" / "end_progress_values": first / last
+              predicted value of each evaluated episode
+            - "episodes_evaluated": number of episodes that were fully processed
+            - "mean_monotonicity", "mean_start_progress", "mean_end_progress",
+              "progress_increase": only present when at least one episode was evaluated
+        """
+        # Draw distinct episode indices from NumPy's global RNG, capped at the number of episodes
+        episodes = np.random.choice(len(dataset.meta.episodes), min(num_episodes, len(dataset.meta.episodes)), replace=False)
+        
+        results = {
+            "progress_range_violations": 0,
+            "monotonicity_scores": [],
+            "start_progress_values": [],
+            "end_progress_values": [],
+            "episodes_evaluated": 0
+        }
+        
+        for ep_idx in episodes:
+            try:
+                # Get episode data
+                ep_start = dataset.episode_data_index["from"][ep_idx].item()
+                ep_end = dataset.episode_data_index["to"][ep_idx].item()
+                
+                # Sample some frames from episode
+                # (up to 20 evenly spaced indices; the last index used is ep_end-1)
+                sample_indices = np.linspace(ep_start, ep_end-1, min(20, ep_end-ep_start), dtype=int)
+                
+                frames = []
+                for idx in sample_indices:
+                    item = dataset[idx]
+                    # Prefer the OBS_IMAGES key, fall back to OBS_IMAGE; items with neither are dropped
+                    if OBS_IMAGES in item:
+                        frames.append(item[OBS_IMAGES])
+                    elif OBS_IMAGE in item:
+                        frames.append(item[OBS_IMAGE])
+                    else:
+                        continue
+                
+                # Episodes with fewer than two usable frames are skipped and not counted
+                if len(frames) < 2:
+                    continue
+                    
+                frames = torch.stack(frames)
+                # Instruction is read from the episode's first item; empty string when "task" is absent
+                language = dataset[ep_start].get("task", "")
+                
+                # Predict rewards/progress
+                progress = self.predict_episode_rewards(frames, language)
+                
+                # Check range violations
+                range_violations = np.sum((progress < 0) | (progress > 1))
+                results["progress_range_violations"] += range_violations
+                
+                # Check monotonicity (should generally increase)
+                if len(progress) > 1:
+                    diffs = np.diff(progress)
+                    monotonicity = np.mean(diffs >= 0)  # Fraction of non-decreasing steps
+                    results["monotonicity_scores"].append(monotonicity)
+                
+                # Record start/end values
+                results["start_progress_values"].append(progress[0])
+                results["end_progress_values"].append(progress[-1])
+                results["episodes_evaluated"] += 1
+                
+            except Exception as e:
+                # Best-effort evaluation: report the failure and move on to the next episode
+                print(f"Error evaluating episode {ep_idx}: {e}")
+                continue
+        
+        # Summarize results
+        # (summary keys are only added when at least one episode was evaluated)
+        if results["episodes_evaluated"] > 0:
+            results["mean_monotonicity"] = np.mean(results["monotonicity_scores"])
+            results["mean_start_progress"] = np.mean(results["start_progress_values"])
+            results["mean_end_progress"] = np.mean(results["end_progress_values"])
+            results["progress_increase"] = results["mean_end_progress"] - results["mean_start_progress"]
+        
+        return results
+
    def comprehensive_evaluation(
        self,
        dataset,
@@ -35,7 +35,7 @@ High-level Architecture
  +------------------------------+
  |  Vision Encoder (frozen)     |  e.g. SigLIP2 vision tower
  +------------------------------+
-        |
+        |
        |  pooled per-frame embeddings (BT, H_v)
        v
  reshape -> (B, T, H_v) -- Linear proj --> (B, T, D)
@@ -214,7 +214,7 @@ class RLearNPolicy(PreTrainedPolicy):
        )
        lang_emb = self.text_proj(lang_emb)  # (B, D)

-        # ---- NEW: use the HF processor to standardize size & normalization ----
+        # Use the HF processor to standardize size & normalization
        # Flatten (B, T_eff, C, H, W) -> (BT, C, H, W)
        BT = B * T_eff
        flat = frames.reshape(BT, C, H, W).detach().cpu()
@@ -230,7 +230,6 @@ class RLearNPolicy(PreTrainedPolicy):

        proc_out = self.processor(images=images, return_tensors="pt")
        pixel_values = proc_out["pixel_values"].to(next(self.vision_encoder.parameters()).device)
-        # ----------------------------------------------------------------------

        # Encode frames through visual tower per frame
        vision_outputs = self.vision_encoder(pixel_values=pixel_values)
@@ -364,12 +363,12 @@ class RLearNPolicy(PreTrainedPolicy):
        loss_dict: dict[str, float] = {}

        # Check if video rewinding already set the target
-        if self.training and self.config.use_video_rewind and 'augmented_target' in locals():
+        if self.training and self.config.use_video_rewind and "augmented_target" in locals():
            # Use the augmented target from video rewinding
            target = augmented_target
        else:
            # Calculate true episode progress using episode_index and frame_index from batch
-            if "episode_index" in batch and "frame_index" in batch and hasattr(self, 'episode_data_index'):
+            if "episode_index" in batch and "frame_index" in batch and hasattr(self, "episode_data_index"):
                # Get episode indices and frame indices from batch
                episode_indices = batch["episode_index"]  # Shape: (B,)
                frame_indices = batch["frame_index"]  # Shape: (B,)
@@ -422,7 +421,9 @@ class RLearNPolicy(PreTrainedPolicy):
                        prog = target_frame_idx / max(1, ep_length - 1)
                        frame_progress.append(prog)

-                    all_progress.append(torch.tensor(frame_progress, device=values.device, dtype=values.dtype))
+                    all_progress.append(
+                        torch.tensor(frame_progress, device=values.device, dtype=values.dtype)
+                    )

                # Stack to get (B, T) tensor where T is the temporal sequence length
                target = torch.stack(all_progress, dim=1)  # (B, max_seq_len)
@@ -430,7 +431,7 @@ class RLearNPolicy(PreTrainedPolicy):
                # Apply stride/dropout indexing to match the processed frames
                target = target[:, idx]

-            elif "index" in batch and hasattr(self, 'episode_data_index'):
+            elif "index" in batch and hasattr(self, "episode_data_index"):
                # Fallback: Use global index if available
                global_indices = batch["index"]  # Shape: (B,)

@@ -472,7 +473,9 @@ class RLearNPolicy(PreTrainedPolicy):
                target = target.unsqueeze(1).expand(B, T_effective)  # Simple expansion

            else:
-                raise ValueError("No episode information found in batch. Please ensure 'episode_index' and 'frame_index' keys are present.")
+                raise ValueError(
+                    "No episode information found in batch. Please ensure 'episode_index' and 'frame_index' keys are present."
+                )

        # During inference, we might not want to compute loss
        if not self.training and target is None:
@@ -759,7 +762,7 @@ def apply_video_rewind(frames: Tensor, rewind_prob: float = 0.5) -> tuple[Tensor

        # Create rewound sequence: o1...oi, oi-1, ..., oi-k
        forward_frames = frames[b, :i]  # Frames up to split point
-        reverse_frames = frames[b, max(0, i-k):i].flip(dims=[0])  # Reversed frames
+        reverse_frames = frames[b, max(0, i - k) : i].flip(dims=[0])  # Reversed frames

        # Concatenate forward and reverse parts
        rewound_seq = torch.cat([forward_frames, reverse_frames], dim=0)
@@ -773,9 +776,9 @@ def apply_video_rewind(frames: Tensor, rewind_prob: float = 0.5) -> tuple[Tensor

        # Create corresponding progress labels
        # Forward part: increasing progress
-        forward_progress = torch.linspace(0, i/T, i, device=device)
+        forward_progress = torch.linspace(0, i / T, i, device=device)
        # Reverse part: decreasing progress
-        reverse_progress = torch.linspace(i/T, max(0, (i-k)/T), k, device=device)
+        reverse_progress = torch.linspace(i / T, max(0, (i - k) / T), k, device=device)

        rewound_progress = torch.cat([forward_progress, reverse_progress])

@@ -126,17 +126,17 @@ Default weights: $\lambda_{\text{prog}}=1.0$, $\lambda_{\text{spatial-nce}}=0.5$
 - Implement on-the-fly progress label generation (no need for pre-annotated rewards) [x]
 - Try different losses
  - Only rewind loss [x]
-  - Convert python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=IPEC-COMMUNITY/bc_z_lerobot
  - Test only rewind loss (evaluate) []
  - Check rewind implementation by hand []
  - Only vlc loss then eval []
  - Vlc + rewind loss then eval []
 - Cleanup code []
+- Convert the dataset with `python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=IPEC-COMMUNITY/bc_z_lerobot` and train on 1 percent
+- Then on 10 percent
 - Try DINO v3 as encoder Base 86 M: https://huggingface.co/facebook/dinov3-vitb16-pretrain-lvd1689m with HuggingFaceTB/SmolLM2-135M-Instruct ? []
 - Add more artificial text to dataset generated by vlm (google gemini) []
  - See google gemini vlm caption [] https://gemini.google.com/app/7e332ffaf32580f2
  - Multiple captions per video, create a method to generate as much data as possible etc [] https://arxiv.org/abs/2508.13446, https://arxiv.org/pdf/2412.04453
 - How can we improve spatial aware learning? co generating captions for each frame with language decoder?
- Add droid []
 - Extend evaluation []
- Add other dataset mentioned above []
+- Add other datasets mentioned above []
@@ -137,7 +137,7 @@ def train(cfg: TrainPipelineConfig):

    logging.info("Creating policy")
    # Pass episode_data_index for RLearN to calculate proper progress
-    episode_data_index = dataset.episode_data_index if hasattr(dataset, 'episode_data_index') else None
+    episode_data_index = dataset.episode_data_index if hasattr(dataset, "episode_data_index") else None
    policy = make_policy(
        cfg=cfg.policy,
        ds_meta=dataset.meta,
@@ -1,10 +1,10 @@
 #!/usr/bin/env python

 import torch
-import numpy as np
+
 from lerobot.policies.rlearn.configuration_rlearn import RLearNConfig
-from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy
 from lerobot.policies.rlearn.evaluation import RLearnEvaluator
+from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy


 def test_temporal_evaluation():