diff --git a/notebooks/rlearn_evaluation.ipynb b/notebooks/rlearn_evaluation.ipynb
index 3f5103b2a..f05103500 100644
--- a/notebooks/rlearn_evaluation.ipynb
+++ b/notebooks/rlearn_evaluation.ipynb
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [
     {
@@ -75,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -93,7 +93,7 @@
    "source": [
     "# Configuration\n",
     "DATASET_REPO = \"pepijn223/rewards_bc_z3\"  # Change to your dataset\n",
-    "MODEL_PATH = \"path/to/your/trained/model\"  # Change to your model checkpoint\n",
+    "MODEL_PATH = \"pepijn223/rlearn_rewards_bc_z\"  # Change to your model checkpoint\n",
     "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
     "NUM_EVAL_EPISODES = 50  # Number of episodes for evaluation\n",
     "\n",
diff --git a/src/lerobot/policies/rlearn/configuration_rlearn.py b/src/lerobot/policies/rlearn/configuration_rlearn.py
index c267cbf12..723121675 100644
--- a/src/lerobot/policies/rlearn/configuration_rlearn.py
+++ b/src/lerobot/policies/rlearn/configuration_rlearn.py
@@ -104,8 +104,9 @@ class RLearNConfig(PreTrainedConfig):
 
     @property
     def observation_delta_indices(self) -> list | None:
-        # Not using delta sampling from the dataset by default.
-        return None
+        # Use temporal sequences: past frames from -(max_seq_len-1) to current (0)
+        # This gives us max_seq_len frames total, e.g. [-15, -14, ..., -1, 0] for max_seq_len=16
+        return list(range(1 - self.max_seq_len, 1))
 
     @property
     def action_delta_indices(self) -> list | None:
diff --git a/src/lerobot/policies/rlearn/modeling_rlearn.py b/src/lerobot/policies/rlearn/modeling_rlearn.py
index 88ca8d58b..8e440b997 100644
--- a/src/lerobot/policies/rlearn/modeling_rlearn.py
+++ b/src/lerobot/policies/rlearn/modeling_rlearn.py
@@ -604,12 +604,12 @@ def generate_causal_mask(T: int, device=None) -> Tensor:
 
 def extract_visual_sequence(batch: dict[str, Tensor]) -> Tensor:
     # Accept various image key formats from datasets
-    # Try multiple common key patterns
-
+    # With delta_indices, the dataset provides temporal sequences automatically
+    
     # List of possible image keys to check, in order of preference
     possible_keys = [
         OBS_IMAGES,  # 'observation.images'
-        OBS_IMAGE,  # 'observation.image'
+        OBS_IMAGE,   # 'observation.image'  
         "observation.images.image",  # nested format from some datasets
     ]
 
@@ -619,14 +619,15 @@ def extract_visual_sequence(batch: dict[str, Tensor]) -> Tensor:
 
             if isinstance(image_val, list) and len(image_val) > 0:
                 # List of (B, C, H, W) -> stack over time
+                # This happens when dataset provides temporal sequence as list
                 return torch.stack(image_val, dim=1)
             elif torch.is_tensor(image_val):
                 # Tensor of shape (B, T, C, H, W) or (B, C, H, W)
                 if image_val.dim() == 5:
-                    # Already has time dimension
+                    # Already has time dimension - this is what we expect with delta_indices
                     return image_val
                 elif image_val.dim() == 4:
-                    # Add time dimension (single frame)
+                    # Add time dimension (single frame) - fallback for datasets without temporal sequences
                     return image_val.unsqueeze(1)
                 else:
                     raise ValueError(
@@ -690,21 +691,28 @@ def pairwise_ranking_loss(logits: Tensor, target: Tensor, margin: float = 0.1, n
 
 
 def zscore(x: Tensor, eps: float = 1e-3) -> Tensor:
-    """Z-score normalization with numerical stability."""
+    """Z-score normalization with numerical stability.
+    
+    Args:
+        x: Tensor of shape (B, T), or (B,) which is treated as (B, 1)
+        eps: Std threshold; rows whose std is <= eps use a tanh fallback instead
+
+    Returns:
+        Tensor of shape (B, T); a 1-D input of shape (B,) comes back as (B, 1)
+    """
     # Handle both (B,) and (B, T) shapes
     if x.dim() == 1:
         x = x.unsqueeze(1)  # Make it (B, 1)
 
     B, T = x.shape
 
-    # If only one timestep, can't compute meaningful std across time
     if T == 1:
-        # Just use tanh to bound values instead of z-score
-        return torch.tanh(x * 0.1)  # Scale and bound
-
-    # Compute mean and std across time dimension
-    mean = x.mean(dim=1, keepdim=True)
-    std = x.std(dim=1, keepdim=True, unbiased=False)
+        # Single timestep: use tanh to bound values instead of z-score
+        return torch.tanh(x * 0.1)
+    
+    # Multiple timesteps: compute z-score across time dimension for each batch
+    mean = x.mean(dim=1, keepdim=True)  # (B, 1)
+    std = x.std(dim=1, keepdim=True, unbiased=False)  # (B, 1)
 
     # Check if std is valid (not zero or NaN)
     std_is_valid = (std > eps) & (~torch.isnan(std))
@@ -715,7 +723,7 @@ def zscore(x: Tensor, eps: float = 1e-3) -> Tensor:
     # Compute z-score where valid
     z = (x - mean) / std_safe
 
-    # For invalid cases, use tanh of centered values
+    # For invalid cases (constant values across time), use tanh of centered values
     z_fallback = torch.tanh((x - mean) * 0.1)
     z = torch.where(std_is_valid.expand_as(z), z, z_fallback)
 
diff --git a/src/lerobot/policies/rlearn/rlearn_plan.md b/src/lerobot/policies/rlearn/rlearn_plan.md
index 6e7d720cf..f7094afeb 100644
--- a/src/lerobot/policies/rlearn/rlearn_plan.md
+++ b/src/lerobot/policies/rlearn/rlearn_plan.md
@@ -35,7 +35,7 @@ _ google/siglip2-large-patch16-256: https://huggingface.co/google/siglip2-large-
 
 Loss: See this chatgpt thread: https://chatgpt.com/s/t_68999a50a0b081919abc365cdd205e01
 
-Past images: (for example a reward methoid go to 3rd floor, has to know what floor it was on and what pas actions it did, can we attend or encorperate images of decision from history in one way?) Maybe via this paper: Learning Long-Context Diffusion Policies via Past-Token Prediction
+Past images: (for example, a reward method for "go to 3rd floor" has to know what floor it was on and what past actions it took; can we attend to or incorporate images of decisions from history in some way?) Maybe via this paper: Learning Long-Context Diffusion Policies via Past-Token Prediction
 
 Amount of frames needed for test/generalization: 1M frames? or ~20% of IPEC-COMMUNITY/bc_z_lerobot
 
@@ -43,7 +43,6 @@ Eval:
 Implement something like voc score , or ROC rank order correlation between reward leanredna and ev reward from sim, or use something else to do additional evaluation
 
 Ideas:
-
 - Incorporate training on multiple horizons: as in label same dataset for longer horizons: make a sandwich (long), put cheese on bread (medium) and even smaller horizons: go down or close gripper (small)
 - Incorporate navigation goals “walk towards the kitchen”, make sure we fix CLIP contrastive learning issue of positional text misunderstanding where model doesnnt learn difference between "horse right of cow" and "horse left of cow" “Move right” potentially train with more other data or even actionable world models such as Genie 3 (https://deepmind.google/discover/blog/genie-3-a-new-frontier-for-world-models/)
 
@@ -59,11 +58,6 @@ _ GTEA+ Gaze: https://cbs.ic.gatech.edu/fpv/
 _ YouCook2 dataset
 _ HOWTO100M: https://www.di.ens.fr/willow/research/howto100m/
 
-Also in plan include investigating/incorperating these two things:
-
-- Curriculum: Start training on easier distinctions (e.g. very early vs very late frames which are easy to tell apart) then gradually ask the model to distinguish more subtle differences (frames that are closer in time). This curriculum can be implemented by initially using pairs far apart in the trajectory for ranking, then moving to closer pairs as accuracy improves.
-- Augmentations: As mentioned, heavy image augmentation (random crops, slight noise) is often used so that the reward model focuses on high-level task progress features rather than pixel-level cues. For video models, temporal augmentation like random frame skipping can also make the model robust to different speeds.
-
 ### Implemented Loss (Spatial-Aware Composite Loss)
 
 Our implementation uses a **composite loss with spatial awareness** to address the limitations of standard contrastive learning (e.g., CLIP's inability to distinguish "move left" vs "move right"). The loss has three components:
@@ -127,10 +121,17 @@ Default weights: $\lambda_{\text{prog}}=1.0$, $\lambda_{\text{spatial-nce}}=0.5$
   - Annotate with ReWiND-style 0→1 progress rewards [x]
   - Visualize to check [x]
 - Implement eval score or metric that is robust and can deal with generalization/is a good metric to try different architectures. And use it in an eval jupyter notebook with visalization of the live reward next to the video for part of the dataset: VOC score and score with correct and incorrect language captions [x]
-- Do first training []
+- Do first training [x]
 - Try different losses []
+    - Only vlc loss then eval []
+    - Only rewind loss then eval []
+    - Vlc + rewind loss then eval []
+- Clean up code
 - Switch to DINO v3 as encoder Base 86 M: https://huggingface.co/facebook/dinov3-vitb16-pretrain-lvd1689m with HuggingFaceTB/SmolLM2-135M-Instruct ?
 - Add more artificial text to dataset generated by vlm (google gemini) []
-  - See google gemini vlm caption from Leandro []
+  - See google gemini vlm caption from Leandro [] https://gemini.google.com/app/7e332ffaf32580f2
   - Multiple captions per video, creat method to generate as much data as possible etc [] https://arxiv.org/abs/2508.13446
+- How can we improve spatially aware learning? Co-generate captions for each frame with a language decoder?
 - Add droid []
+- Extend evaluation []
+- Add the other datasets mentioned above []