profile

2026-05-23 04:30:10 +00:00 · 2025-11-18 09:51:50 +01:00
parent 8847e75c55
commit c868777752
8 changed files with 2567 additions and 0 deletions
@@ -0,0 +1,347 @@
+#!/usr/bin/env python
+
+"""
+Script to compare performance with and without RTC enabled.
+
+This script helps identify whether RTC is actually improving or degrading performance
+by running multiple inference passes and collecting detailed timing statistics.
+
+Usage:
+    # Profile with mock data (no robot needed)
+    uv run examples/rtc/profile_rtc_comparison.py \
+        --policy_path=helper2424/pi05_check_rtc \
+        --device=mps \
+        --num_iterations=50
+
+    # Profile with specific RTC config
+    uv run examples/rtc/profile_rtc_comparison.py \
+        --policy_path=helper2424/pi05_check_rtc \
+        --device=mps \
+        --num_iterations=50 \
+        --execution_horizon=20
+"""
+
+import argparse
+import logging
+import time
+from dataclasses import dataclass
+
+import numpy as np
+import torch
+
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.types import RTCAttentionSchedule
+from lerobot.policies.factory import get_policy_class, make_pre_post_processors
+from lerobot.policies.rtc.configuration_rtc import RTCConfig
+from lerobot.utils.profiling import (
+    clear_profiling_stats,
+    enable_profiling,
+    get_profiling_stats,
+    print_profiling_summary,
+)
+
+logging.basicConfig(level=logging.INFO)  # root logger emits INFO and above so the report lines are shown
+logger = logging.getLogger(__name__)  # module-level logger shared by the profiling functions below
+
+
+@dataclass
+class ProfileResults:
+    """Results from profiling run."""
+
+    mode: str  # "with_rtc" or "without_rtc"
+    mean_time: float
+    std_time: float
+    min_time: float
+    max_time: float
+    times: list[float]
+    throughput: float  # iterations per second
+
+
+def create_mock_observation(policy, device: str) -> dict:
+    """Create a mock observation for testing.
+
+    Args:
+        policy: Policy instance
+        device: Device to create tensors on
+
+    Returns:
+        Mock observation dictionary
+    """
+    # Get expected input shapes from policy config
+    # This is a simplified version - adjust based on actual policy requirements
+    obs = {}
+
+    # Mock image observations (if needed)
+    if hasattr(policy.config, "input_shapes"):
+        for key, shape in policy.config.input_shapes.items():
+            if "image" in key:
+                # Typical image shape: (batch, channels, height, width)
+                obs[key] = torch.randn(1, *shape, device=device)
+            else:
+                obs[key] = torch.randn(1, *shape, device=device)
+
+    # Add task if needed
+    if "task" in policy.config.__dict__ or hasattr(policy, "accepts_task"):
+        obs["task"] = ["Pick up the object"]
+
+    # Mock state observation
+    obs["observation.state"] = torch.randn(1, 10, device=device)  # Adjust size as needed
+
+    return obs
+
+
+def profile_inference(
+    policy, observation: dict, num_iterations: int, use_rtc: bool, execution_horizon: int = 10
+) -> ProfileResults:
+    """Profile policy inference with or without RTC.
+
+    Args:
+        policy: Policy instance
+        observation: Observation dictionary
+        num_iterations: Number of inference iterations to run
+        use_rtc: Whether to enable RTC
+        execution_horizon: Execution horizon for RTC
+
+    Returns:
+        ProfileResults with timing statistics
+    """
+    mode = "with_rtc" if use_rtc else "without_rtc"
+    logger.info(f"\n{'='*80}")
+    logger.info(f"Profiling: {mode.upper()}")
+    logger.info(f"{'='*80}")
+
+    # Configure RTC
+    if use_rtc:
+        policy.config.rtc_config.enabled = True
+        policy.config.rtc_config.execution_horizon = execution_horizon
+        policy.init_rtc_processor()
+    else:
+        policy.config.rtc_config.enabled = False
+
+    times = []
+    prev_actions = None
+
+    # Warmup
+    logger.info("Warming up (5 iterations)...")
+    for _ in range(5):
+        with torch.no_grad():
+            if use_rtc:
+                _ = policy.predict_action_chunk(
+                    observation, inference_delay=0, prev_chunk_left_over=prev_actions
+                )
+            else:
+                _ = policy.predict_action_chunk(observation)
+
+    # Actual profiling
+    logger.info(f"Running {num_iterations} profiled iterations...")
+    for i in range(num_iterations):
+        start = time.perf_counter()
+
+        with torch.no_grad():
+            if use_rtc:
+                actions = policy.predict_action_chunk(
+                    observation, inference_delay=0, prev_chunk_left_over=prev_actions
+                )
+                # Simulate consuming some actions for next iteration
+                if actions.shape[1] > execution_horizon:
+                    prev_actions = actions[:, execution_horizon:].clone()
+                else:
+                    prev_actions = None
+            else:
+                actions = policy.predict_action_chunk(observation)
+
+        # Synchronize if using CUDA
+        if observation["observation.state"].device.type == "cuda":
+            torch.cuda.synchronize()
+
+        elapsed = time.perf_counter() - start
+        times.append(elapsed)
+
+        if (i + 1) % 10 == 0:
+            logger.info(f"  Completed {i+1}/{num_iterations} iterations")
+
+    # Calculate statistics
+    times_arr = np.array(times)
+    results = ProfileResults(
+        mode=mode,
+        mean_time=float(np.mean(times_arr)),
+        std_time=float(np.std(times_arr)),
+        min_time=float(np.min(times_arr)),
+        max_time=float(np.max(times_arr)),
+        times=times,
+        throughput=num_iterations / sum(times),
+    )
+
+    logger.info(f"\nResults for {mode}:")
+    logger.info(f"  Mean time: {results.mean_time*1000:.2f} ms")
+    logger.info(f"  Std dev:   {results.std_time*1000:.2f} ms")
+    logger.info(f"  Min time:  {results.min_time*1000:.2f} ms")
+    logger.info(f"  Max time:  {results.max_time*1000:.2f} ms")
+    logger.info(f"  Throughput: {results.throughput:.2f} iter/s")
+
+    return results
+
+
+def compare_results(results_without_rtc: ProfileResults, results_with_rtc: ProfileResults):
+    """Compare and print results from both runs.
+
+    Args:
+        results_without_rtc: Results from run without RTC
+        results_with_rtc: Results from run with RTC
+    """
+    logger.info(f"\n{'='*80}")
+    logger.info("COMPARISON SUMMARY")
+    logger.info(f"{'='*80}")
+
+    mean_diff = results_with_rtc.mean_time - results_without_rtc.mean_time
+    mean_diff_pct = (mean_diff / results_without_rtc.mean_time) * 100
+
+    throughput_diff = results_with_rtc.throughput - results_without_rtc.throughput
+    throughput_diff_pct = (throughput_diff / results_without_rtc.throughput) * 100
+
+    logger.info(f"\n{'Metric':<30} {'Without RTC':>15} {'With RTC':>15} {'Difference':>15}")
+    logger.info("-" * 80)
+    logger.info(
+        f"{'Mean time (ms)':<30} "
+        f"{results_without_rtc.mean_time*1000:>15.2f} "
+        f"{results_with_rtc.mean_time*1000:>15.2f} "
+        f"{mean_diff*1000:>+15.2f}"
+    )
+    logger.info(
+        f"{'Std dev (ms)':<30} "
+        f"{results_without_rtc.std_time*1000:>15.2f} "
+        f"{results_with_rtc.std_time*1000:>15.2f} "
+        f"{(results_with_rtc.std_time - results_without_rtc.std_time)*1000:>+15.2f}"
+    )
+    logger.info(
+        f"{'Min time (ms)':<30} "
+        f"{results_without_rtc.min_time*1000:>15.2f} "
+        f"{results_with_rtc.min_time*1000:>15.2f} "
+        f"{(results_with_rtc.min_time - results_without_rtc.min_time)*1000:>+15.2f}"
+    )
+    logger.info(
+        f"{'Max time (ms)':<30} "
+        f"{results_without_rtc.max_time*1000:>15.2f} "
+        f"{results_with_rtc.max_time*1000:>15.2f} "
+        f"{(results_with_rtc.max_time - results_without_rtc.max_time)*1000:>+15.2f}"
+    )
+    logger.info(
+        f"{'Throughput (iter/s)':<30} "
+        f"{results_without_rtc.throughput:>15.2f} "
+        f"{results_with_rtc.throughput:>15.2f} "
+        f"{throughput_diff:>+15.2f}"
+    )
+
+    logger.info(f"\n{'='*80}")
+    logger.info("VERDICT")
+    logger.info(f"{'='*80}")
+
+    if mean_diff_pct < -5:
+        logger.info(f"✓ RTC is FASTER by {abs(mean_diff_pct):.1f}%")
+        logger.info(f"  Mean time reduced by {abs(mean_diff)*1000:.2f} ms")
+    elif mean_diff_pct > 5:
+        logger.info(f"✗ RTC is SLOWER by {mean_diff_pct:.1f}%")
+        logger.info(f"  Mean time increased by {mean_diff*1000:.2f} ms")
+        logger.info("\n  Possible reasons:")
+        logger.info("  - RTC overhead exceeds benefits at current execution horizon")
+        logger.info("  - Inference delay calculation not accounting for RTC processing")
+        logger.info("  - Additional tensor operations in RTC guidance")
+    else:
+        logger.info(f"≈ Performance is SIMILAR (difference: {mean_diff_pct:+.1f}%)")
+
+    logger.info(f"{'='*80}\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Profile RTC performance")
+    parser.add_argument(
+        "--policy_path", type=str, required=True, help="Path to pretrained policy"
+    )
+    parser.add_argument(
+        "--device", type=str, default="cuda", help="Device to run on (cuda/cpu/mps)"
+    )
+    parser.add_argument(
+        "--num_iterations", type=int, default=50, help="Number of inference iterations"
+    )
+    parser.add_argument(
+        "--execution_horizon", type=int, default=10, help="RTC execution horizon"
+    )
+    parser.add_argument(
+        "--enable_detailed_profiling",
+        action="store_true",
+        help="Enable detailed method-level profiling",
+    )
+    parser.add_argument(
+        "--use_torch_compile", action="store_true", help="Use torch.compile for faster inference"
+    )
+
+    args = parser.parse_args()
+
+    # Load policy
+    logger.info(f"Loading policy from {args.policy_path}")
+    config = PreTrainedConfig.from_pretrained(args.policy_path)
+    policy_class = get_policy_class(config.type)
+
+    # Set compile flag if needed
+    if hasattr(config, "compile_model"):
+        config.compile_model = args.use_torch_compile
+
+    policy = policy_class.from_pretrained(args.policy_path, config=config)
+
+    # Initialize RTC config
+    policy.config.rtc_config = RTCConfig(
+        execution_horizon=args.execution_horizon,
+        max_guidance_weight=1.0,
+        prefix_attention_schedule=RTCAttentionSchedule.EXP,
+    )
+
+    policy = policy.to(args.device)
+    policy.eval()
+
+    logger.info(f"Policy loaded: {config.type}")
+    logger.info(f"Device: {args.device}")
+    logger.info(f"Execution horizon: {args.execution_horizon}")
+
+    # Create mock observation
+    logger.info("Creating mock observation...")
+    observation = create_mock_observation(policy, args.device)
+
+    # Enable detailed profiling if requested
+    if args.enable_detailed_profiling:
+        enable_profiling()
+        logger.info("Detailed profiling enabled")
+
+    # Profile without RTC
+    results_without_rtc = profile_inference(
+        policy=policy,
+        observation=observation,
+        num_iterations=args.num_iterations,
+        use_rtc=False,
+        execution_horizon=args.execution_horizon,
+    )
+
+    if args.enable_detailed_profiling:
+        logger.info("\nDetailed profiling stats (WITHOUT RTC):")
+        print_profiling_summary()
+        clear_profiling_stats()
+
+    # Profile with RTC
+    results_with_rtc = profile_inference(
+        policy=policy,
+        observation=observation,
+        num_iterations=args.num_iterations,
+        use_rtc=True,
+        execution_horizon=args.execution_horizon,
+    )
+
+    if args.enable_detailed_profiling:
+        logger.info("\nDetailed profiling stats (WITH RTC):")
+        print_profiling_summary()
+
+    # Compare results
+    compare_results(results_without_rtc, results_with_rtc)
+
+
+if __name__ == "__main__":  # run the comparison only when executed as a script, not on import
+    main()
+