#!/usr/bin/env python
"""
Script to add profiling instrumentation to RTCProcessor.

This script shows which methods to profile in the RTC code to identify bottlenecks.
You can either:
1. Apply these changes directly to modeling_rtc.py
2. Use monkey patching to add profiling without modifying source
3. Use as reference for manual instrumentation

Usage:
    # Print the usage instructions (running this file does NOT apply the patch)
    python examples/rtc/add_rtc_profiling.py

    # Option 1: Monkey patch (no source changes)
    # Call enable_profiling() and monkey_patch_rtc_profiling() from your own script

    # Option 2: Apply changes to source
    # Copy the profiled methods below into src/lerobot/policies/rtc/modeling_rtc.py
"""

import logging

import torch
from torch import Tensor

from lerobot.policies.rtc.modeling_rtc import RTCProcessor
from lerobot.utils.profiling import ProfileContext, enable_profiling, is_profiling_enabled

logger = logging.getLogger(__name__)


def profile_denoise_step(
    self,
    x_t,
    prev_chunk_left_over,
    inference_delay,
    time,
    original_denoise_step_partial,
    execution_horizon=None,
) -> Tensor:
    """Profiled version of denoise_step.

    Runs the RTC guidance computation with every stage wrapped in a
    ``ProfileContext`` named ``rtc.denoise_step.<stage>``.

    Args:
        self: Object providing ``rtc_config``, ``get_prefix_weights`` and ``track``
            (an ``RTCProcessor`` once the monkey patch is applied).
        x_t: Current sample tensor. A tensor with fewer than 3 dimensions gets a
            leading batch dimension added, which is removed again from the result.
        prev_chunk_left_over: Remainder of the previous action chunk, or ``None``.
            When ``None`` no guidance is applied and the raw output of
            ``original_denoise_step_partial(x_t)`` is returned. Otherwise it is
            zero-padded up to the chunk length and action dimension of ``x_t``.
        inference_delay: Forwarded to ``get_prefix_weights`` and ``track``.
        time: Denoising time; inverted internally as ``tau = 1 - time``.
        original_denoise_step_partial: Callable taking ``x_t`` and returning ``v_t``.
        execution_horizon: Optional override; defaults to
            ``self.rtc_config.execution_horizon`` and is clamped to the length of
            ``prev_chunk_left_over``.

    Returns:
        ``v_t - guidance_weight * correction``, or the unguided ``v_t`` when there
        is no previous chunk.
    """
    # `_original_denoise_step` only exists after monkey_patch_rtc_profiling() ran.
    # When this function is pasted into the source as `denoise_step` (Option 2)
    # there is nothing to delegate to, so fall through to the instrumented path
    # instead of raising AttributeError.
    # NOTE(review): assumes ProfileContext is cheap when profiling is disabled - confirm.
    original_impl = getattr(self, "_original_denoise_step", None)
    if not is_profiling_enabled() and original_impl is not None:
        # Call original implementation if profiling disabled
        return original_impl(
            x_t,
            prev_chunk_left_over,
            inference_delay,
            time,
            original_denoise_step_partial,
            execution_horizon,
        )

    with ProfileContext("rtc.denoise_step.total"):
        # In the original implementation, the time goes from 0 to 1 and
        # In our implementation, the time goes from 1 to 0
        # So we need to invert the time
        tau = 1 - time

        if prev_chunk_left_over is None:
            # First step, no guidance - return v_t
            with ProfileContext("rtc.denoise_step.base_denoising"):
                v_t = original_denoise_step_partial(x_t)
            return v_t

        with ProfileContext("rtc.denoise_step.setup"):
            # Work on a detached copy so the caller's tensor is never mutated.
            x_t = x_t.clone().detach()

            squeezed = False
            if len(x_t.shape) < 3:
                # Add batch dimension (removed again during cleanup)
                x_t = x_t.unsqueeze(0)
                squeezed = True

            if len(prev_chunk_left_over.shape) < 3:
                # Add batch dimension
                prev_chunk_left_over = prev_chunk_left_over.unsqueeze(0)

            if execution_horizon is None:
                execution_horizon = self.rtc_config.execution_horizon

            # The horizon cannot exceed what is left of the previous chunk.
            if execution_horizon > prev_chunk_left_over.shape[1]:
                execution_horizon = prev_chunk_left_over.shape[1]

            batch_size = x_t.shape[0]
            action_chunk_size = x_t.shape[1]
            action_dim = x_t.shape[2]

        # Padding
        with ProfileContext("rtc.denoise_step.padding"):
            if prev_chunk_left_over.shape[1] < action_chunk_size or prev_chunk_left_over.shape[2] < action_dim:
                padded = torch.zeros(batch_size, action_chunk_size, action_dim).to(x_t.device)
                padded[:, : prev_chunk_left_over.shape[1], : prev_chunk_left_over.shape[2]] = prev_chunk_left_over
                prev_chunk_left_over = padded

        # Get prefix weights
        with ProfileContext("rtc.denoise_step.get_prefix_weights"):
            # Two unsqueezes add a leading and a trailing singleton axis so the
            # weights broadcast against the 3-D error tensor below.
            weights = (
                self.get_prefix_weights(inference_delay, execution_horizon, action_chunk_size)
                .to(x_t.device)
                .unsqueeze(0)
                .unsqueeze(-1)
            )

        # Main RTC guidance computation
        with ProfileContext("rtc.denoise_step.guidance_computation"):
            with torch.enable_grad():
                # Base denoising
                with ProfileContext("rtc.denoise_step.base_denoising"):
                    v_t = original_denoise_step_partial(x_t)
                # NOTE(review): grad tracking on x_t is switched on only after v_t
                # has been computed; kept as-is so the profiled path matches the
                # statement order it was written with - confirm against
                # RTCProcessor.denoise_step.
                x_t.requires_grad_(True)

                # Compute x1_t
                with ProfileContext("rtc.denoise_step.compute_x1_t"):
                    x1_t = x_t - time * v_t

                # Compute error
                with ProfileContext("rtc.denoise_step.compute_error"):
                    err = (prev_chunk_left_over - x1_t) * weights
                    grad_outputs = err.clone().detach()

                # Compute correction via autograd
                with ProfileContext("rtc.denoise_step.autograd_correction"):
                    correction = torch.autograd.grad(x1_t, x_t, grad_outputs, retain_graph=False)[0]

            # Compute guidance weight
            with ProfileContext("rtc.denoise_step.compute_guidance_weight"):
                max_guidance_weight = torch.as_tensor(self.rtc_config.max_guidance_weight)
                tau_tensor = torch.as_tensor(tau)
                squared_one_minus_tau = (1 - tau_tensor) ** 2
                inv_r2 = (squared_one_minus_tau + tau_tensor**2) / (squared_one_minus_tau)
                # Divisions by zero at the ends of the time range yield inf/nan;
                # nan_to_num and the final minimum cap them at max_guidance_weight.
                c = torch.nan_to_num((1 - tau_tensor) / tau_tensor, posinf=max_guidance_weight)
                guidance_weight = torch.nan_to_num(c * inv_r2, posinf=max_guidance_weight)
                guidance_weight = torch.minimum(guidance_weight, max_guidance_weight)

            # Apply guidance
            with ProfileContext("rtc.denoise_step.apply_guidance"):
                result = v_t - guidance_weight * correction

        # Cleanup
        with ProfileContext("rtc.denoise_step.cleanup"):
            # Remove the batch dimension if it was added during setup
            if squeezed:
                result = result.squeeze(0)
                correction = correction.squeeze(0)
                x1_t = x1_t.squeeze(0)
                err = err.squeeze(0)

            self.track(
                time=time,
                x1_t=x1_t,
                correction=correction,
                err=err,
                weights=weights,
                guidance_weight=guidance_weight,
                inference_delay=inference_delay,
                execution_horizon=execution_horizon,
            )

        return result


def monkey_patch_rtc_profiling():
    """Apply profiling to RTCProcessor via monkey patching.

    This modifies the RTCProcessor class at runtime to add profiling
    without changing source files.

    The patch is idempotent: if ``RTCProcessor.denoise_step`` is already the
    profiled function, the saved original is left untouched. Overwriting it
    would make ``_original_denoise_step`` point at the profiled function
    itself and recurse forever whenever profiling is disabled.
    """
    logger.info("Applying RTC profiling monkey patch...")

    if RTCProcessor.denoise_step is profile_denoise_step:
        logger.info("RTC profiling monkey patch already applied; nothing to do")
        return

    # Save original method
    RTCProcessor._original_denoise_step = RTCProcessor.denoise_step

    # Replace with profiled version
    RTCProcessor.denoise_step = profile_denoise_step

    logger.info("✓ RTC profiling enabled")


def print_usage():
    """Print usage instructions."""
    print("\n" + "=" * 80)
    print("RTC PROFILING INSTRUMENTATION")
    print("=" * 80)
    print("\nThis script provides profiling for RTCProcessor methods.")
    print("\nOption 1: Monkey Patch (Recommended)")
    print("-" * 40)
    print("Add to your script:")
    print("""
    from lerobot.utils.profiling import enable_profiling, print_profiling_summary
    from examples.rtc.add_rtc_profiling import monkey_patch_rtc_profiling

    # Enable profiling
    enable_profiling()
    monkey_patch_rtc_profiling()

    # ... run your code ...

    # Print results
    print_profiling_summary()
    """)
    print("\nOption 2: Manual Source Modification")
    print("-" * 40)
    print("1. Copy profile_denoise_step() from this file")
    print("2. Replace denoise_step() in src/lerobot/policies/rtc/modeling_rtc.py")
    print("3. Add profiling imports at top of file")
    print("\nKey Metrics to Watch:")
    print("-" * 40)
    print("- rtc.denoise_step.base_denoising - Time for base policy inference")
    print("- rtc.denoise_step.autograd_correction - Time computing gradients")
    print("- rtc.denoise_step.guidance_computation - Total guidance overhead")
    print("- rtc.denoise_step.get_prefix_weights - Time computing weights")
    print("=" * 80 + "\n")


if __name__ == "__main__":
    print_usage()