fix

2026-07-25 18:56:09 +00:00 · 2025-08-30 23:11:26 +02:00
parent 47bc670ad2
commit 825c0666a9
5 changed files with 716 additions and 54 deletions
@@ -61,7 +61,7 @@ class RLearNConfig(PreTrainedConfig):
    use_tanh_head: bool = False  # when True, bound outputs in [-1, 1]

    # Training
-    learning_rate: float = 1e-4
+    learning_rate: float = 1e-3
    weight_decay: float = 0.01
    
    # Performance optimizations
@@ -98,6 +98,12 @@ class RLearNConfig(PreTrainedConfig):
    reward_max_value: float = 1.0
    reward_hl_gauss_loss_num_bins: int = 20
    
+    # Evaluation visualization parameters
+    enable_eval_visualizations: bool = False  # Enable reward evaluation visualizations during training
+    eval_visualization_freq: int = 1000  # Steps between evaluation visualizations
+    eval_holdout_episodes: int = 9  # Number of episodes to hold out for evaluation
+    eval_max_frames: int = 128  # Maximum frames per episode for evaluation
+    eval_visualization_seed: int = 42  # Seed for reproducible episode selection

    # Optional: path to episodes.jsonl to build full-episode indices automatically
    # Default to common dataset layout: <dataset_root>/meta/episodes.jsonl
@@ -0,0 +1,511 @@
+#!/usr/bin/env python
+
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Visualization utilities for RLearN evaluation during training.
+
+Creates and saves reward prediction visualizations for held-out episodes.
+"""
+
+from __future__ import annotations
+
+import warnings
+from pathlib import Path
+from typing import Any
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from matplotlib import rcParams
+from scipy.stats import spearmanr
+from torch import Tensor
+
+from lerobot.constants import OBS_IMAGES, OBS_LANGUAGE
+
+# Set matplotlib backend to avoid GUI issues during training
+rcParams['backend'] = 'Agg'
+
+
+class RLearNEvalVisualizer:
+    """
+    Creates visualization plots for RLearN model evaluation during training.
+    
+    Generates reward prediction plots similar to the evaluation notebook but saves
+    them as images for monitoring training progress.
+    """
+    
+    def __init__(self, model, dataset, device: str = "cuda"):
+        """
+        Args:
+            model: RLearN model instance
+            dataset: LeRobot dataset instance
+            device: Device to run evaluation on
+        """
+        self.model = model
+        self.dataset = dataset
+        self.device = device
+        
+    def get_episode_data(self, episode_idx: int, max_frames: int = 64) -> tuple[Tensor | None, str | None, np.ndarray | None, int | None]:
+        """Extract frames, language, and predict rewards for an episode."""
+        try:
+            # Get episode data
+            ep_start = self.dataset.episode_data_index["from"][episode_idx].item()
+            ep_end = self.dataset.episode_data_index["to"][episode_idx].item()
+            episode_length = min(ep_end - ep_start, max_frames)
+
+            # Collect frames and get language
+            frames = []
+            language = None
+
+            for frame_idx in range(episode_length):
+                global_idx = ep_start + frame_idx
+                frame_data = self.dataset[global_idx]
+
+                # Extract image
+                if OBS_IMAGES in frame_data:
+                    img = frame_data[OBS_IMAGES]
+                else:
+                    img_keys = [k for k in frame_data.keys() if "image" in k.lower()]
+                    if img_keys:
+                        img = frame_data[img_keys[0]]
+                    else:
+                        continue
+
+                if isinstance(img, np.ndarray):
+                    img = torch.from_numpy(img)
+
+                # Ensure CHW format
+                if len(img.shape) == 3 and img.shape[-1] in [1, 3, 4]:
+                    img = img.permute(2, 0, 1)
+
+                # Resize to expected input size (224x224 for SigLIP2)
+                if img.shape[-2:] != (224, 224):
+                    import torch.nn.functional as F
+                    img = F.interpolate(
+                        img.unsqueeze(0), size=(224, 224), mode="bilinear", align_corners=False
+                    ).squeeze(0)
+
+                # Normalize to [0, 1] if needed
+                if img.dtype == torch.uint8:
+                    img = img.float() / 255.0
+
+                frames.append(img)
+
+                # Get language
+                if language is None:
+                    if OBS_LANGUAGE in frame_data:
+                        language = frame_data[OBS_LANGUAGE]
+                        if isinstance(language, list):
+                            language = language[0]
+                    elif "task" in frame_data:
+                        language = frame_data["task"]
+                    else:
+                        language = "No language provided"
+
+            if not frames:
+                return None, None, None, None
+
+            frames_tensor = torch.stack(frames)
+            
+            # Predict rewards using the model's evaluation method
+            with torch.no_grad():
+                self.model.eval()
+                rewards = self._predict_episode_rewards(frames_tensor, language)
+
+            return frames_tensor, language, rewards, episode_length
+
+        except Exception as e:
+            warnings.warn(f"Error processing episode {episode_idx}: {e}")
+            return None, None, None, None
+
+    @torch.no_grad()
+    def _predict_episode_rewards(self, frames: Tensor, language: str, batch_size: int = 16) -> np.ndarray:
+        """
+        Predict rewards for a single episode using proper temporal sequences.
+        
+        Args:
+            frames: Video frames tensor of shape (T, C, H, W)
+            language: Language instruction string
+            batch_size: Maximum number of temporal sequences to process at once
+
+        Returns:
+            Predicted progress/rewards array of shape (T,)
+        """
+        T = frames.shape[0]
+        max_seq_len = self.model.config.max_seq_len
+
+        # Create temporal sequences for each frame
+        temporal_sequences = []
+
+        for i in range(T):
+            # Create sequence ending at frame i
+            seq_frames = []
+            for j in range(max(0, i - max_seq_len + 1), i + 1):
+                # Use frame j if available, otherwise repeat the first available frame
+                frame_idx = max(0, min(j, T - 1))
+                seq_frames.append(frames[frame_idx])
+
+            # Pad sequence to max_seq_len by repeating the first frame if needed
+            while len(seq_frames) < max_seq_len:
+                seq_frames.insert(0, seq_frames[0])  # Prepend first frame
+
+            # Take only the last max_seq_len frames if we have too many
+            seq_frames = seq_frames[-max_seq_len:]
+            temporal_sequences.append(torch.stack(seq_frames))  # (max_seq_len, C, H, W)
+
+        # Stack all temporal sequences: (T, max_seq_len, C, H, W)
+        all_sequences = torch.stack(temporal_sequences)
+
+        # Process in batches
+        rewards = []
+        for i in range(0, T, batch_size):
+            end_idx = min(i + batch_size, T)
+            batch_sequences = all_sequences[i:end_idx].to(self.device)  # (B, max_seq_len, C, H, W)
+
+            # Create batch for model
+            batch = {
+                OBS_IMAGES: batch_sequences,  # (B, T, C, H, W) format expected by model
+                OBS_LANGUAGE: [language] * batch_sequences.shape[0],
+            }
+
+            # Predict rewards - model returns (B, T') but we want the last timestep for each sequence
+            values = self.model.predict_rewards(batch)  # (B, T')
+
+            # Take the last timestep prediction for each sequence (represents current frame reward)
+            if values.dim() == 2:
+                batch_rewards = values[:, -1].cpu().numpy()  # (B,) - last timestep
+            else:
+                batch_rewards = values.cpu().numpy()  # (B,) - already single timestep
+
+            rewards.extend(batch_rewards)
+
+        return np.array(rewards[:T])  # Ensure exact length
+
+    def create_episode_grid_visualization(
+        self, 
+        episode_indices: list[int], 
+        save_path: Path, 
+        step: int | None = None,
+        max_frames: int = 64
+    ) -> dict[str, Any]:
+        """
+        Create a 3x3 grid visualization of episode reward predictions.
+        
+        Args:
+            episode_indices: List of 9 episode indices to visualize
+            save_path: Path to save the visualization image
+            step: Training step (for title)
+            max_frames: Maximum frames per episode to process
+            
+        Returns:
+            Dictionary with evaluation metrics
+        """
+        if len(episode_indices) != 9:
+            raise ValueError("Expected exactly 9 episode indices for 3x3 grid")
+            
+        # Create figure with 3x3 subplots
+        fig, axes = plt.subplots(3, 3, figsize=(20, 16))
+        axes = axes.flatten()
+        
+        eval_metrics = {
+            "voc_s_scores": [],
+            "episode_lengths": [],
+            "reward_ranges": [],
+            "languages": []
+        }
+        
+        for i, episode_idx in enumerate(episode_indices):
+            ax = axes[i]
+            
+            frames, language, rewards, episode_length = self.get_episode_data(episode_idx, max_frames)
+            
+            if rewards is None:
+                ax.text(
+                    0.5, 0.5, f"Episode {episode_idx}\nNo data available",
+                    ha="center", va="center", transform=ax.transAxes,
+                    fontsize=12, bbox=dict(boxstyle="round,pad=0.3", facecolor="lightcoral", alpha=0.7)
+                )
+                ax.set_title(f"Episode {episode_idx} - Error", fontsize=12, pad=10)
+                continue
+                
+            # Plot predicted rewards
+            time_steps = range(len(rewards))
+            ax.plot(
+                time_steps, rewards, "b-", linewidth=2.5, marker="o", markersize=5, 
+                label="Predicted Reward", alpha=0.8
+            )
+            
+            # Add expected progress line (ground truth for ReWiND)
+            expected_progress = np.linspace(0, 1, len(rewards))
+            ax.plot(
+                time_steps, expected_progress, "orange", linestyle="--", linewidth=2.5, 
+                label="Expected Progress (0→1)", alpha=0.8
+            )
+            
+            # Compute VOC-S (Value-Order Correlation for Success)
+            frame_indices = np.arange(1, len(rewards) + 1)
+            correlation, p_value = spearmanr(frame_indices, rewards)
+            if np.isnan(correlation):
+                correlation = 0.0
+                
+            eval_metrics["voc_s_scores"].append(correlation)
+            eval_metrics["episode_lengths"].append(len(rewards))
+            eval_metrics["reward_ranges"].append((rewards.min(), rewards.max()))
+            eval_metrics["languages"].append(language)
+            
+            # Format title with language (truncated) and VOC-S
+            title_lang = language[:35] + "..." if len(language) > 35 else language
+            title = f'Episode {episode_idx}\n"{title_lang}"\nVOC-S: {correlation:.3f}'
+            ax.set_title(title, fontsize=10, pad=15)
+            
+            ax.set_xlabel("Frame Index", fontsize=10)
+            ax.set_ylabel("Reward", fontsize=10)
+            ax.legend(fontsize=8, loc='upper left')
+            ax.grid(True, alpha=0.3)
+            
+            # Color-coded trend indicator
+            if correlation > 0.3:
+                trend_text = "↗ Strong+"
+                trend_color = "darkgreen"
+            elif correlation > 0.1:
+                trend_text = "↗ Weak+"
+                trend_color = "green"
+            elif correlation < -0.3:
+                trend_text = "↘ Strong-"
+                trend_color = "darkred"
+            elif correlation < -0.1:
+                trend_text = "↘ Weak-"
+                trend_color = "red"
+            else:
+                trend_text = "→ Flat"
+                trend_color = "gray"
+                
+            ax.text(
+                0.02, 0.98, trend_text, transform=ax.transAxes,
+                verticalalignment="top", fontsize=9, fontweight="bold",
+                bbox=dict(boxstyle="round,pad=0.3", facecolor=trend_color, alpha=0.2),
+                color=trend_color
+            )
+            
+            # Add reward range info
+            ax.text(
+                0.98, 0.02, f"Range: [{rewards.min():.3f}, {rewards.max():.3f}]",
+                transform=ax.transAxes, ha="right", va="bottom", fontsize=8,
+                bbox=dict(boxstyle="round,pad=0.2", facecolor="lightblue", alpha=0.5)
+            )
+        
+        # Add overall title
+        step_text = f" - Step {step}" if step is not None else ""
+        fig.suptitle(
+            f"RLearN Reward Evaluation{step_text}\n"
+            f"Mean VOC-S: {np.mean(eval_metrics['voc_s_scores']):.3f} | "
+            f"Episodes: {len([s for s in eval_metrics['voc_s_scores'] if s != 0])}/9",
+            fontsize=16, y=0.95
+        )
+        
+        plt.tight_layout()
+        plt.subplots_adjust(top=0.90)  # Make room for suptitle
+        
+        # Save the figure
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+        plt.savefig(save_path, dpi=150, bbox_inches='tight', facecolor='white')
+        plt.close()  # Close to free memory
+        
+        # Calculate summary metrics
+        valid_scores = [s for s in eval_metrics["voc_s_scores"] if s != 0]
+        summary = {
+            "mean_voc_s": np.mean(valid_scores) if valid_scores else 0.0,
+            "std_voc_s": np.std(valid_scores) if valid_scores else 0.0,
+            "num_valid_episodes": len(valid_scores),
+            "total_episodes": len(episode_indices),
+            "mean_episode_length": np.mean(eval_metrics["episode_lengths"]) if eval_metrics["episode_lengths"] else 0,
+            "individual_scores": eval_metrics["voc_s_scores"],
+            "episode_languages": eval_metrics["languages"]
+        }
+        
+        return summary
+        
+    def create_comparison_visualization(
+        self,
+        episode_indices: list[int],
+        save_path: Path,
+        step: int | None = None,
+        max_frames: int = 64,
+        mismatch_templates: list[str] | None = None
+    ) -> dict[str, Any]:
+        """
+        Create correct vs incorrect language comparison visualization.
+        
+        Args:
+            episode_indices: List of episode indices to compare (up to 6)
+            save_path: Path to save the visualization image  
+            step: Training step (for title)
+            max_frames: Maximum frames per episode to process
+            mismatch_templates: Custom mismatch templates
+            
+        Returns:
+            Dictionary with detection metrics
+        """
+        if mismatch_templates is None:
+            mismatch_templates = [
+                "kick the ball", "clean the sink", "dance in place", 
+                "wave your hand", "jump up and down", "do nothing"
+            ]
+            
+        # Limit to 6 episodes for 2x3 grid
+        episode_indices = episode_indices[:6]
+        n_episodes = len(episode_indices)
+        
+        # Create figure with 2x3 subplots
+        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
+        axes = axes.flatten()
+        
+        detection_results = {
+            "correct_finals": [],
+            "incorrect_finals": [],
+            "detection_successes": [],
+            "episode_info": []
+        }
+        
+        for i, episode_idx in enumerate(episode_indices):
+            if i >= 6:  # Limit to 6 subplots
+                break
+                
+            ax = axes[i]
+            
+            # Get episode data with correct language
+            frames, correct_language, correct_rewards, episode_length = self.get_episode_data(episode_idx, max_frames)
+            
+            if correct_rewards is None:
+                ax.text(
+                    0.5, 0.5, f"Episode {episode_idx}\nNo data available",
+                    ha="center", va="center", transform=ax.transAxes
+                )
+                ax.set_title(f"Episode {episode_idx} - Error")
+                continue
+                
+            # Generate incorrect language and predict
+            incorrect_language = mismatch_templates[i % len(mismatch_templates)]
+            incorrect_rewards = self._predict_episode_rewards(frames, incorrect_language)
+            
+            # Plot both reward curves
+            time_steps = range(len(correct_rewards))
+            ax.plot(
+                time_steps, correct_rewards, "g-", linewidth=2.5, marker="o", markersize=4,
+                label=f"Correct: '{correct_language[:25]}...'" if len(correct_language) > 25 else f"Correct: '{correct_language}'"
+            )
+            ax.plot(
+                time_steps, incorrect_rewards, "r-", linewidth=2.5, marker="s", markersize=4,
+                label=f"Incorrect: '{incorrect_language}'"
+            )
+            
+            # Calculate detection success
+            final_correct = correct_rewards[-1]
+            final_incorrect = incorrect_rewards[-1]
+            detection_success = final_correct > final_incorrect
+            
+            detection_results["correct_finals"].append(final_correct)
+            detection_results["incorrect_finals"].append(final_incorrect)
+            detection_results["detection_successes"].append(detection_success)
+            detection_results["episode_info"].append({
+                "episode_idx": episode_idx,
+                "correct_language": correct_language,
+                "incorrect_language": incorrect_language,
+                "final_correct": final_correct,
+                "final_incorrect": final_incorrect
+            })
+            
+            # Color-coded title based on detection success
+            success_indicator = "✓" if detection_success else "✗"
+            title_color = "darkgreen" if detection_success else "darkred"
+            ax.set_title(
+                f"Episode {episode_idx} {success_indicator}\nΔ: {final_correct - final_incorrect:.3f}",
+                color=title_color, fontweight="bold", fontsize=11
+            )
+            
+            ax.set_xlabel("Frame Index")
+            ax.set_ylabel("Reward")
+            ax.legend(fontsize=8, loc='upper left')
+            ax.grid(True, alpha=0.3)
+            
+            # Add final reward values as text
+            ax.text(
+                0.98, 0.02, 
+                f"Final: C={final_correct:.3f}, I={final_incorrect:.3f}",
+                transform=ax.transAxes, ha="right", va="bottom", fontsize=9,
+                bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow", alpha=0.7)
+            )
+        
+        # Hide unused subplots
+        for i in range(n_episodes, 6):
+            axes[i].axis('off')
+        
+        # Calculate summary metrics
+        detection_accuracy = np.mean(detection_results["detection_successes"]) if detection_results["detection_successes"] else 0.0
+        mean_correct = np.mean(detection_results["correct_finals"]) if detection_results["correct_finals"] else 0.0
+        mean_incorrect = np.mean(detection_results["incorrect_finals"]) if detection_results["incorrect_finals"] else 0.0
+        
+        # Add overall title
+        step_text = f" - Step {step}" if step is not None else ""
+        fig.suptitle(
+            f"RLearN Language Detection{step_text}\n"
+            f"Accuracy: {detection_accuracy:.1%} | Mean Δ: {mean_correct - mean_incorrect:.3f} | "
+            f"Success: {sum(detection_results['detection_successes'])}/{len(detection_results['detection_successes'])}",
+            fontsize=16, y=0.95
+        )
+        
+        plt.tight_layout()
+        plt.subplots_adjust(top=0.90)
+        
+        # Save the figure
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+        plt.savefig(save_path, dpi=150, bbox_inches='tight', facecolor='white')
+        plt.close()
+        
+        summary = {
+            "detection_accuracy": detection_accuracy,
+            "mean_correct_final": mean_correct,
+            "mean_incorrect_final": mean_incorrect,
+            "separation_score": mean_correct - mean_incorrect,
+            "num_episodes": len(detection_results["detection_successes"]),
+            "individual_results": detection_results["episode_info"]
+        }
+        
+        return summary
+
+
+def select_evaluation_episodes(dataset, num_episodes: int = 9, seed: int = 42) -> list[int]:
+    """
+    Select a diverse set of episodes for evaluation holdout.
+    
+    Args:
+        dataset: LeRobot dataset instance
+        num_episodes: Number of episodes to select
+        seed: Random seed for reproducibility
+        
+    Returns:
+        List of episode indices
+    """
+    np.random.seed(seed)
+    
+    total_episodes = dataset.num_episodes
+    if num_episodes >= total_episodes:
+        return list(range(total_episodes))
+    
+    # Select random episodes
+    episode_indices = np.random.choice(total_episodes, num_episodes, replace=False).tolist()
+    
+    return sorted(episode_indices)
@@ -556,6 +556,20 @@ class RLearNPolicy(PreTrainedPolicy):
        total_loss = loss + L_mismatch
        loss_time = time.perf_counter() - loss_start
        
+        # DEBUG: Print targets and predictions occasionally during training
+        if self.training and torch.rand(1).item() < 0.02:  # ~2% chance to debug print
+            with torch.no_grad():
+                preds = self.hl_gauss_layer(video_frame_embeds).squeeze(-1)
+                print(f"\n=== DEBUG TRAINING ===")
+                print(f"Target range: [{target.min():.3f}, {target.max():.3f}]")
+                print(f"Target mean: {target.mean():.3f}")
+                print(f"Pred range: [{preds.min():.3f}, {preds.max():.3f}]") 
+                print(f"Pred mean: {preds.mean():.3f}")
+                print(f"Loss: {loss:.4f}")
+                print("First sample targets:", target[0, :5].cpu().numpy())
+                print("First sample preds:", preds[0, :5].cpu().numpy())
+                print("="*25)
+        
        total_forward_time = time.perf_counter() - forward_start

        # Log individual loss components
@@ -212,6 +212,28 @@ def train(cfg: TrainPipelineConfig):
        ds_meta=dataset.meta,
        episode_data_index=episode_data_index,
    )
+    
+    # Setup RLearN evaluation visualizations if enabled
+    eval_visualizer = None
+    eval_holdout_episodes = None
+    if (getattr(cfg.policy, "type", None) == "rlearn" and 
+        getattr(cfg.policy, "enable_eval_visualizations", False)):
+        
+        try:
+            from lerobot.policies.rlearn.eval_visualizer import RLearNEvalVisualizer, select_evaluation_episodes
+            
+            logging.info("Setting up RLearN evaluation visualizations")
+            eval_visualizer = RLearNEvalVisualizer(policy, dataset, device=str(device))
+            eval_holdout_episodes = select_evaluation_episodes(
+                dataset, 
+                num_episodes=getattr(cfg.policy, "eval_holdout_episodes", 9),
+                seed=getattr(cfg.policy, "eval_visualization_seed", 42)
+            )
+            logging.info(f"Selected {len(eval_holdout_episodes)} holdout episodes for evaluation: {eval_holdout_episodes}")
+        except ImportError as e:
+            logging.warning(f"Could not setup RLearN evaluation visualizations: {e}")
+            eval_visualizer = None
+    
    preprocessor, postprocessor = make_processor(
        policy_cfg=cfg.policy, pretrained_path=cfg.policy.pretrained_path, dataset_stats=dataset.meta.stats
    )
@@ -386,6 +408,8 @@ def train(cfg: TrainPipelineConfig):
        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0
        is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
        is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0
+        is_eval_viz_step = (eval_visualizer is not None and 
+                           step % getattr(cfg.policy, "eval_visualization_freq", 1000) == 0)

        if is_log_step:
            logging.info(train_tracker)
@@ -437,6 +461,87 @@ def train(cfg: TrainPipelineConfig):
                wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
                wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval")

+        # RLearN evaluation visualizations
+        if is_eval_viz_step:
+            logging.info(f"Creating RLearN evaluation visualizations at step {step}")
+            try:
+                with torch.no_grad():
+                    policy.eval()
+                    
+                    # Create evaluation visualizations directory
+                    eval_viz_dir = cfg.output_dir / "eval_visualizations"
+                    eval_viz_dir.mkdir(parents=True, exist_ok=True)
+                    
+                    # Create reward prediction visualization (3x3 grid)
+                    reward_viz_path = eval_viz_dir / f"reward_predictions_step_{step:06d}.png"
+                    reward_metrics = eval_visualizer.create_episode_grid_visualization(
+                        episode_indices=eval_holdout_episodes,
+                        save_path=reward_viz_path,
+                        step=step,
+                        max_frames=getattr(cfg.policy, "eval_max_frames", 128)
+                    )
+                    
+                    # Log metrics
+                    eval_viz_metrics = {
+                        "eval_viz/mean_voc_s": reward_metrics["mean_voc_s"],
+                        "eval_viz/std_voc_s": reward_metrics["std_voc_s"],
+                        "eval_viz/valid_episodes": reward_metrics["num_valid_episodes"],
+                        "eval_viz/total_episodes": reward_metrics["total_episodes"],
+                        "eval_viz/mean_episode_length": reward_metrics["mean_episode_length"],
+                    }
+                    
+                    logging.info(f"RLearN Evaluation Results at Step {step}:")
+                    logging.info(f"  Mean VOC-S: {reward_metrics['mean_voc_s']:.4f} (±{reward_metrics['std_voc_s']:.4f})")
+                    logging.info(f"  Valid Episodes: {reward_metrics['num_valid_episodes']}/{reward_metrics['total_episodes']}")
+                    logging.info(f"  Mean Episode Length: {reward_metrics['mean_episode_length']:.1f}")
+                    logging.info(f"  Visualizations saved to: {eval_viz_dir}")
+                    
+                    if wandb_logger:
+                        wandb_logger.log_dict(eval_viz_metrics, step, mode="eval_viz")
+                        
+                        # Log the visualization image both as regular image and as artifact
+                        try:
+                            import wandb
+                            
+                            # Log as regular image for immediate viewing in wandb UI
+                            wandb_logger.wandb_run.log({
+                                f"eval_viz/reward_predictions_step_{step}": wandb.Image(str(reward_viz_path)),
+                            }, step=step)
+                            
+                            # Create and upload artifact with reward prediction visualization
+                            artifact_name = f"rlearn_reward_predictions_step_{step:06d}"
+                            artifact = wandb.Artifact(
+                                name=artifact_name,
+                                type="reward_prediction_visualization",
+                                description=f"RLearN reward prediction visualization at training step {step}",
+                                metadata={
+                                    "step": step,
+                                    "mean_voc_s": reward_metrics["mean_voc_s"],
+                                    "std_voc_s": reward_metrics["std_voc_s"],
+                                    "valid_episodes": reward_metrics["num_valid_episodes"],
+                                    "total_episodes": reward_metrics["total_episodes"],
+                                    "mean_episode_length": reward_metrics["mean_episode_length"],
+                                    "holdout_episodes": eval_holdout_episodes,
+                                }
+                            )
+                            
+                            # Add reward prediction visualization to the artifact
+                            artifact.add_file(str(reward_viz_path), name="reward_predictions.png")
+                            
+                            # Upload the artifact
+                            wandb_logger.wandb_run.log_artifact(artifact)
+                            
+                            logging.info(f"Uploaded wandb artifact: {artifact_name}")
+                            
+                        except Exception as e:
+                            logging.warning(f"Could not log visualization image to wandb: {e}")
+                    
+                    policy.train()  # Return to training mode
+                    
+            except Exception as e:
+                logging.error(f"Error during RLearN evaluation visualization: {e}")
+                # Continue training even if evaluation fails
+
    if eval_env:
        eval_env.close()
    logging.info("End of training")