add multiple timesteps

2026-07-25 18:56:09 +00:00 · 2025-08-27 16:33:53 +02:00
parent 681be962ae
commit 450be9d7d1
4 changed files with 38 additions and 28 deletions
@@ -21,7 +21,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
@@ -75,7 +75,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
@@ -93,7 +93,7 @@
   "source": [
    "# Configuration\n",
    "DATASET_REPO = \"pepijn223/rewards_bc_z3\"  # Change to your dataset\n",
-    "MODEL_PATH = \"path/to/your/trained/model\"  # Change to your model checkpoint\n",
+    "MODEL_PATH = \"pepijn223/rlearn_rewards_bc_z\"  # Change to your model checkpoint\n",
    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "NUM_EVAL_EPISODES = 50  # Number of episodes for evaluation\n",
    "\n",
@@ -104,8 +104,9 @@ class RLearNConfig(PreTrainedConfig):

    @property
    def observation_delta_indices(self) -> list | None:
-        # Not using delta sampling from the dataset by default.
-        return None
+        # Use temporal sequences: past frames from -(max_seq_len-1) to current (0)
+        # This gives us max_seq_len frames total, e.g. [-15, -14, ..., -1, 0] for max_seq_len=16
+        return list(range(1 - self.max_seq_len, 1))

    @property
    def action_delta_indices(self) -> list | None:
@@ -604,7 +604,7 @@ def generate_causal_mask(T: int, device=None) -> Tensor:

 def extract_visual_sequence(batch: dict[str, Tensor]) -> Tensor:
    # Accept various image key formats from datasets
-    # Try multiple common key patterns
+    # With delta_indices, the dataset provides temporal sequences automatically
    
    # List of possible image keys to check, in order of preference
    possible_keys = [
@@ -619,14 +619,15 @@ def extract_visual_sequence(batch: dict[str, Tensor]) -> Tensor:

            if isinstance(image_val, list) and len(image_val) > 0:
                # List of (B, C, H, W) -> stack over time
+                # This happens when dataset provides temporal sequence as list
                return torch.stack(image_val, dim=1)
            elif torch.is_tensor(image_val):
                # Tensor of shape (B, T, C, H, W) or (B, C, H, W)
                if image_val.dim() == 5:
-                    # Already has time dimension
+                    # Already has time dimension - this is what we expect with delta_indices
                    return image_val
                elif image_val.dim() == 4:
-                    # Add time dimension (single frame)
+                    # Add time dimension (single frame) - fallback for datasets without temporal sequences
                    return image_val.unsqueeze(1)
                else:
                    raise ValueError(
@@ -690,21 +691,28 @@ def pairwise_ranking_loss(logits: Tensor, target: Tensor, margin: float = 0.1, n


 def zscore(x: Tensor, eps: float = 1e-3) -> Tensor:
-    """Z-score normalization with numerical stability."""
+    """Z-score normalization with numerical stability.
+    
+    Args:
+        x: Tensor of shape (B, T) or (B,), where B is batch size and T is sequence length; 1-D input is treated as (B, 1)
+        eps: Small epsilon for numerical stability
+    
+    Returns:
+        Z-scored tensor of same shape as input
+    """
    # Handle both (B,) and (B, T) shapes
    if x.dim() == 1:
        x = x.unsqueeze(1)  # Make it (B, 1)

    B, T = x.shape

-    # If only one timestep, can't compute meaningful std across time
    if T == 1:
-        # Just use tanh to bound values instead of z-score
-        return torch.tanh(x * 0.1)  # Scale and bound
+        # Single timestep: use tanh to bound values instead of z-score
+        return torch.tanh(x * 0.1)
    
-    # Compute mean and std across time dimension
-    mean = x.mean(dim=1, keepdim=True)
-    std = x.std(dim=1, keepdim=True, unbiased=False)
+    # Multiple timesteps: compute z-score across time dimension for each batch
+    mean = x.mean(dim=1, keepdim=True)  # (B, 1)
+    std = x.std(dim=1, keepdim=True, unbiased=False)  # (B, 1)

    # Check if std is valid (not zero or NaN)
    std_is_valid = (std > eps) & (~torch.isnan(std))
@@ -715,7 +723,7 @@ def zscore(x: Tensor, eps: float = 1e-3) -> Tensor:
    # Compute z-score where valid
    z = (x - mean) / std_safe

-    # For invalid cases, use tanh of centered values
+    # For invalid cases (constant values across time), use tanh of centered values
    z_fallback = torch.tanh((x - mean) * 0.1)
    z = torch.where(std_is_valid.expand_as(z), z, z_fallback)

@@ -35,7 +35,7 @@ _ google/siglip2-large-patch16-256: https://huggingface.co/google/siglip2-large-

 Loss: See this chatgpt thread: https://chatgpt.com/s/t_68999a50a0b081919abc365cdd205e01

-Past images: (for example a reward methoid go to 3rd floor, has to know what floor it was on and what pas actions it did, can we attend or encorperate images of decision from history in one way?) Maybe via this paper: Learning Long-Context Diffusion Policies via Past-Token Prediction
+Past images: (for example, a reward method for "go to 3rd floor" has to know what floor it was on and what past actions it took; can we attend to or incorporate images of decisions from history in some way?) Maybe via this paper: Learning Long-Context Diffusion Policies via Past-Token Prediction

 Amount of frames needed for test/generalization: 1M frames? or ~20% of IPEC-COMMUNITY/bc_z_lerobot

@@ -43,7 +43,6 @@ Eval:
 Implement something like voc score , or ROC rank order correlation between reward leanredna and ev reward from sim, or use something else to do additional evaluation

 Ideas:
-
 - Incorporate training on multiple horizons: as in label same dataset for longer horizons: make a sandwich (long), put cheese on bread (medium) and even smaller horizons: go down or close gripper (small)
 - Incorporate navigation goals “walk towards the kitchen”, make sure we fix CLIP contrastive learning issue of positional text misunderstanding where model doesnnt learn difference between "horse right of cow" and "horse left of cow" “Move right” potentially train with more other data or even actionable world models such as Genie 3 (https://deepmind.google/discover/blog/genie-3-a-new-frontier-for-world-models/)

@@ -59,11 +58,6 @@ _ GTEA+ Gaze: https://cbs.ic.gatech.edu/fpv/
 _ YouCook2 dataset
 _ HOWTO100M: https://www.di.ens.fr/willow/research/howto100m/

-Also in plan include investigating/incorperating these two things:
-
- Curriculum: Start training on easier distinctions (e.g. very early vs very late frames which are easy to tell apart) then gradually ask the model to distinguish more subtle differences (frames that are closer in time). This curriculum can be implemented by initially using pairs far apart in the trajectory for ranking, then moving to closer pairs as accuracy improves.
- Augmentations: As mentioned, heavy image augmentation (random crops, slight noise) is often used so that the reward model focuses on high-level task progress features rather than pixel-level cues. For video models, temporal augmentation like random frame skipping can also make the model robust to different speeds.
-
 ### Implemented Loss (Spatial-Aware Composite Loss)

 Our implementation uses a **composite loss with spatial awareness** to address the limitations of standard contrastive learning (e.g., CLIP's inability to distinguish "move left" vs "move right"). The loss has three components:
@@ -127,10 +121,17 @@ Default weights: $\lambda_{\text{prog}}=1.0$, $\lambda_{\text{spatial-nce}}=0.5$
  - Annotate with ReWiND-style 0→1 progress rewards [x]
  - Visualize to check [x]
 - Implement eval score or metric that is robust and can deal with generalization/is a good metric to try different architectures. And use it in an eval jupyter notebook with visalization of the live reward next to the video for part of the dataset: VOC score and score with correct and incorrect language captions [x]
- Do first training []
+- Do first training [x]
 - Try different losses []
+    - Only vlc loss then eval []
+    - Only rewind loss then eval []
+    - Vlc + rewind loss then eval []
+- Clean up code
 - Switch to DINO v3 as encoder Base 86 M: https://huggingface.co/facebook/dinov3-vitb16-pretrain-lvd1689m with HuggingFaceTB/SmolLM2-135M-Instruct ?
 - Add more artificial text to dataset generated by vlm (google gemini) []
-  - See google gemini vlm caption from Leandro []
+  - See google gemini vlm caption from Leandro [] https://gemini.google.com/app/7e332ffaf32580f2
  - Multiple captions per video, creat method to generate as much data as possible etc [] https://arxiv.org/abs/2508.13446
+- How can we improve spatially aware learning? Co-generate captions for each frame with a language decoder?
 - Add droid []
+- Extend evaluation []
+- Add the other datasets mentioned above []