add: inference benchmark

2026-07-19 07:51:43 +00:00 · 2025-09-23 22:34:52 +02:00
parent f6cd24be17
commit cdd6cb606c
2 changed files with 315 additions and 194 deletions
@@ -0,0 +1,315 @@
+"""
+Benchmark memory footprint and inference latency of a policy on arbitrary devices.
+
+This script loads a pretrained policy directly (similar to the async inference server)
+and generates dummy input data based on the policy's input_features to perform
+accurate benchmarking without requiring datasets.
+"""
+
+import argparse
+import os
+import statistics
+from datetime import datetime
+from pathlib import Path
+
+import psutil
+import torch
+
+from lerobot.configs.types import FeatureType
+from lerobot.policies.factory import get_policy_class
+from lerobot.policies.pretrained import PreTrainedPolicy
+
+
+def bytes_to_human(n: int) -> str:
+    """Format a byte count as a human-readable string using 1024-based units.
+
+    Args:
+        n: Number of bytes. May be negative (for example a memory delta after
+            memory was released); the sign is kept in the output.
+
+    Returns:
+        The value scaled to the first unit in which its magnitude is below
+        1024, printed with two decimals, e.g. ``"1.00 KB"`` or ``"-2.00 KB"``.
+        Values of 1024 TB or more are reported in PB.
+    """
+    for unit in ["B", "KB", "MB", "GB", "TB"]:
+        # Compare on magnitude: a bare `n < 1024` is true for every negative
+        # value, which would report large negative deltas in raw bytes.
+        if abs(n) < 1024:
+            return f"{n:.2f} {unit}"
+        n /= 1024
+    return f"{n:.2f} PB"
+
+
+def percentile(values: list[float], p: float) -> float:
+    """Return the ``p``-th percentile of ``values`` by linear interpolation.
+
+    ``values`` is indexed positionally and is not sorted here, so callers
+    pass an already sorted list. An empty list yields NaN.
+    """
+    if not values:
+        return float("nan")
+
+    last = len(values) - 1
+    rank = last * (p / 100.0)  # fractional position inside the list
+    lower = int(rank)
+    upper = min(lower + 1, last)
+
+    if lower == upper:
+        # The position falls on the final element; nothing to interpolate.
+        return values[lower]
+
+    base = values[lower]
+    return base + (values[upper] - base) * (rank - lower)
+
+
+def generate_dummy_observation(input_features: dict, device: str = "cpu") -> dict:
+    """Build a random, batched observation from the policy's input features.
+
+    For every entry, a float32 tensor of the feature's ``shape`` is created on
+    ``device`` and given a leading batch dimension of size 1. Visual features
+    are filled with uniform samples from ``torch.rand``; every other feature
+    type is filled with standard-normal samples from ``torch.randn``. An empty
+    ``"task"`` string is added last for language-conditioned policies.
+    """
+    observation = {}
+
+    for name, spec in input_features.items():
+        # Only visual inputs get uniform samples; all other types share randn.
+        sampler = torch.rand if spec.type == FeatureType.VISUAL else torch.randn
+        sample = sampler(spec.shape, dtype=torch.float32, device=device)
+        observation[name] = sample.unsqueeze(0)  # add batch dimension
+
+    observation["task"] = ""
+    return observation
+
+
+def main():
+    """Benchmark a pretrained policy's inference latency and memory footprint.
+
+    Parses the command-line arguments, loads the policy with
+    ``from_pretrained``, builds a dummy observation from
+    ``policy.config.input_features``, runs the warmup forwards and then times
+    ``num_trials * forwards_per_trial`` calls to
+    ``policy.predict_action_chunk``. Latency statistics, process RSS and (on
+    CUDA) peak allocated memory are written to a text report inside
+    ``--output-dir`` and summarized on stdout.
+    """
+    parser = argparse.ArgumentParser(description="Policy inference benchmark")
+    parser.add_argument(
+        "--policy-id", type=str, required=True, help="Model ID or local path to pretrained policy"
+    )
+    parser.add_argument(
+        "--policy-type", type=str, required=True, help="Type of policy (smolvla, act, diffusion, etc.)"
+    )
+    parser.add_argument(
+        "--device", type=str, default="mps", choices=["cuda", "cpu", "mps"], help="Device to run on"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--num-trials", type=int, default=10, help="Number of timing trials")
+    parser.add_argument("--forwards-per-trial", type=int, default=10, help="Number of forwards per trial")
+    parser.add_argument("--warmup", type=int, default=2, help="Warmup forwards (not timed)")
+    parser.add_argument(
+        "--output-dir", type=str, default="outputs/benchmarks", help="Directory to save benchmark results"
+    )
+    args = parser.parse_args()
+
+    # Seed & deterministic-ish setup
+    torch.manual_seed(args.seed)
+    if args.device == "cuda":
+        torch.cuda.manual_seed_all(args.seed)
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = False  # leave False to avoid perf cliffs
+
+    # Resolve device availability
+    device = args.device.lower()
+    if device == "cuda" and not torch.cuda.is_available():
+        print("[!] CUDA requested but unavailable. Falling back to CPU.")
+        device = "cpu"
+    elif device == "mps" and not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()):
+        print("[!] MPS requested but unavailable. Falling back to CPU.")
+        device = "cpu"
+
+    use_cuda = device == "cuda"
+
+    # Create output directory and log file
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Path separators in the policy id would otherwise create nested directories.
+    policy_name = args.policy_id.replace("/", "_").replace("\\", "_")
+    log_file = output_dir / f"benchmark_{args.policy_type}_{policy_name}_{device}_{timestamp}.txt"
+
+    # Load policy directly from pretrained (similar to async inference server)
+    print(f"Loading policy {args.policy_type} from {args.policy_id}...")
+    policy_class = get_policy_class(args.policy_type)
+    policy: PreTrainedPolicy = policy_class.from_pretrained(args.policy_id)
+    policy.eval()
+    policy.to(device)
+
+    print(f"Policy loaded on {device}")
+    print(f"Input features: {list(policy.config.input_features.keys())}")
+    print(f"Output features: {list(policy.config.output_features.keys())}")
+
+    # Generate dummy observation based on policy input features
+    dummy_observation = generate_dummy_observation(policy.config.input_features, device)
+
+    # Helper to sync for fair timings
+    def _sync(dev_=device):
+        if dev_ == "cuda" and torch.cuda.is_available():
+            torch.cuda.synchronize()
+        elif dev_ == "mps" and hasattr(torch, "mps"):
+            try:
+                torch.mps.synchronize()
+            except AttributeError:
+                pass  # MPS sync not available in this PyTorch version
+
+    # Warmup (to stabilize kernels/caches)
+    print("Warming up...")
+    with torch.no_grad():
+        for _ in range(args.warmup):
+            _ = policy.predict_action_chunk(dummy_observation)
+        _sync()
+
+    # Memory footprint before timing
+    process = psutil.Process(os.getpid())
+    rss_before = process.memory_info().rss
+    if use_cuda:
+        torch.cuda.reset_peak_memory_stats()
+
+    # PyTorch timing with Event objects for more accurate GPU timing
+    print(f"Running benchmark: {args.num_trials} trials x {args.forwards_per_trial} forwards...")
+
+    if use_cuda:
+        # Use CUDA Events for precise GPU timing
+        start_events = []
+        end_events = []
+
+        with torch.no_grad():
+            for _ in range(args.num_trials):
+                for _ in range(args.forwards_per_trial):
+                    start_event = torch.cuda.Event(enable_timing=True)
+                    end_event = torch.cuda.Event(enable_timing=True)
+
+                    start_event.record()
+                    _ = policy.predict_action_chunk(dummy_observation)
+                    end_event.record()
+
+                    start_events.append(start_event)
+                    end_events.append(end_event)
+
+        # Synchronize once so every recorded event has completed, then read the timings
+        torch.cuda.synchronize()
+        per_forward_ms = [
+            start_event.elapsed_time(end_event)
+            for start_event, end_event in zip(start_events, end_events, strict=True)
+        ]
+
+    else:
+        # Use torch.utils.benchmark for CPU/MPS timing
+        from torch.utils.benchmark import Timer
+
+        def run_inference():
+            result = policy.predict_action_chunk(dummy_observation)
+            # Wait for queued device work inside the timed call; without this an
+            # MPS measurement could end before the work finishes. No-op on CPU.
+            _sync()
+            return result
+
+        # The statement and globals never change, so one Timer serves every measurement.
+        timer = Timer(stmt="run_inference()", globals={"run_inference": run_inference})
+
+        # Collect individual timing measurements
+        per_forward_ms = []
+        with torch.no_grad():
+            for _ in range(args.num_trials):
+                for _ in range(args.forwards_per_trial):
+                    measurement = timer.timeit(1)  # Single measurement
+                    per_forward_ms.append(measurement.mean * 1000)  # Convert to ms
+
+    # Memory footprint after timing
+    rss_after = process.memory_info().rss
+    rss_delta = rss_after - rss_before
+    cuda_peak = torch.cuda.max_memory_allocated() if use_cuda else 0
+
+    # Sort timing results for percentile calculations
+    per_forward_ms_sorted = sorted(per_forward_ms)
+
+    # Guards keep the statistics defined when no measurement was taken (e.g. zero trials).
+    mean_ms = statistics.fmean(per_forward_ms) if per_forward_ms else float("nan")
+    std_ms = statistics.pstdev(per_forward_ms) if len(per_forward_ms) > 1 else 0.0
+    min_ms = per_forward_ms_sorted[0] if per_forward_ms_sorted else float("nan")
+    max_ms = per_forward_ms_sorted[-1] if per_forward_ms_sorted else float("nan")
+    p50_ms = percentile(per_forward_ms_sorted, 50)
+    p95_ms = percentile(per_forward_ms_sorted, 95)
+
+    # Model size
+    num_params = sum(p.numel() for p in policy.parameters())
+
+    # Prepare results for logging
+    results = {
+        "timestamp": datetime.now().isoformat(),
+        "policy_type": args.policy_type,
+        "policy_id": args.policy_id,
+        "device": device,
+        "num_trials": args.num_trials,
+        "forwards_per_trial": args.forwards_per_trial,
+        "warmup": args.warmup,
+        "seed": args.seed,
+        "num_params": num_params,
+        "latency_mean_ms": mean_ms,
+        "latency_std_ms": std_ms,
+        "latency_min_ms": min_ms,
+        "latency_max_ms": max_ms,
+        "latency_p50_ms": p50_ms,
+        "latency_p95_ms": p95_ms,
+        "cpu_rss_before": rss_before,
+        "cpu_rss_after": rss_after,
+        "cpu_rss_delta": rss_delta,
+        "cuda_peak_alloc": cuda_peak,
+        "input_features": list(policy.config.input_features.keys()),
+        "output_features": list(policy.config.output_features.keys()),
+    }
+
+    # Format and write results to log file
+    log_content = f"""
+=== LeRobot Policy Inference Benchmark ===
+Timestamp: {results["timestamp"]}
+Policy: {results["policy_type"]} ({results["policy_id"]})
+Device: {results["device"]}
+Seed: {results["seed"]}
+
+=== Model Information ===
+Parameters: {results["num_params"]:,}
+Input Features: {", ".join(results["input_features"])}
+Output Features: {", ".join(results["output_features"])}
+
+=== Benchmark Configuration ===
+Trials: {results["num_trials"]}
+Forwards per Trial: {results["forwards_per_trial"]}
+Warmup: {results["warmup"]}
+Total Measurements: {len(per_forward_ms)}
+
+=== Latency Results (ms) ===
+Mean:     {results["latency_mean_ms"]:.3f}
+Std Dev:  {results["latency_std_ms"]:.3f}
+Min:      {results["latency_min_ms"]:.3f}
+Max:      {results["latency_max_ms"]:.3f}
+P50:      {results["latency_p50_ms"]:.3f}
+P95:      {results["latency_p95_ms"]:.3f}
+
+=== Memory Footprint ===
+CPU RSS Before: {bytes_to_human(results["cpu_rss_before"])}
+CPU RSS After:  {bytes_to_human(results["cpu_rss_after"])} (Δ {bytes_to_human(results["cpu_rss_delta"])})
+"""
+
+    if use_cuda:
+        log_content += f"CUDA Peak:      {bytes_to_human(results['cuda_peak_alloc'])} (reset before timing)\n"
+
+    log_content += f"""
+=== Raw Timing Data (first 20 measurements, ms) ===
+{", ".join(f"{t:.3f}" for t in per_forward_ms[:20])}
+{"..." if len(per_forward_ms) > 20 else ""}
+
+=== Summary Statistics ===
+Timing Method: {"CUDA Events" if use_cuda else "torch.utils.benchmark.Timer"}
+Device Available: {torch.cuda.is_available() if device == "cuda" else torch.backends.mps.is_available() if device == "mps" else True}
+PyTorch Version: {torch.__version__}
+
+Benchmark completed successfully at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+"""
+
+    # Write to log file. The report contains non-ASCII text ("Δ"), so the
+    # encoding is pinned to UTF-8 instead of relying on the platform default.
+    with open(log_file, "w", encoding="utf-8") as f:
+        f.write(log_content)
+
+    # Print to console (shorter version)
+    print("\n=== Inference Benchmark Results ===")
+    print(f"Policy: {args.policy_type} ({args.policy_id})")
+    print(f"Device: {device}")
+    print(f"Trials: {args.num_trials} | Forwards/Trial: {args.forwards_per_trial} | Warmup: {args.warmup}")
+    print(f"Model params: {num_params:,}")
+
+    print("\nLatency per forward (ms):")
+    print(f"  mean: {mean_ms:.3f}  std: {std_ms:.3f}")
+    print(f"  min:  {min_ms:.3f}   max: {max_ms:.3f}")
+    print(f"  p50:  {p50_ms:.3f}   p95: {p95_ms:.3f}")
+
+    print("\nMemory footprint:")
+    print(f"  CPU RSS before: {bytes_to_human(rss_before)}")
+    print(f"  CPU RSS after : {bytes_to_human(rss_after)}  (Δ {bytes_to_human(rss_delta)})")
+    if use_cuda:
+        print(
+            f"  CUDA peak allocated: {bytes_to_human(cuda_peak)} "
+            f"(reset by reset_peak_memory_stats before timing)"
+        )
+
+    print(f"\nResults saved to: {log_file}")
+    print("Benchmark completed successfully!")
+
+
+# Run the benchmark only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
@@ -1,194 +0,0 @@
-#!/usr/bin/env python
-"""
-Minimal Policy inference + benchmarking.
-
-Features:
- End-to-end pipeline: dataset -> pre/post-processors -> policy.select_action
- Latency benchmarking with warmup, N trials, and M forwards/trial
- Reports mean/std/min/max and p50/p95 latencies (ms) per forward
- CPU RSS and CUDA (peak) memory footprint
- Works on CPU or CUDA; syncs properly for fair GPU timings
-
-Example:
-  python smolvla_bench.py \
-    --repo_id AdilZtn/grab_red_cube_test_25 --episode 0 --sample_index 10 \
-    --device cuda --num_trials 100 --forwards_per_trial 10 --warmup 20
-"""
-
-import argparse
-import os
-import statistics
-import time
-from typing import List
-
-import torch
-import psutil
-
-from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
-from lerobot.policies.factory import make_policy, make_policy_config
-from lerobot.policies.pretrained import PreTrainedPolicy
-from lerobot.policies.factory import make_pre_post_processors
-
-
-def bytes_to_human(n: int) -> str:
-    for unit in ["B", "KB", "MB", "GB", "TB"]:
-        if n < 1024:
-            return f"{n:.2f} {unit}"
-        n /= 1024
-    return f"{n:.2f} PB"
-
-
-def percentile(values: List[float], p: float) -> float:
-    if not values:
-        return float("nan")
-    k = (len(values) - 1) * (p / 100.0)
-    f = int(k)
-    c = min(f + 1, len(values) - 1)
-    if f == c:
-        return values[f]
-    return values[f] + (values[c] - values[f]) * (k - f)
-
-
-def main():
-    parser = argparse.ArgumentParser(description="SmolVLA inference + latency benchmark")
-    parser.add_argument("--repo_id", type=str, default="AdilZtn/grab_red_cube_test_25",
-                        help="HF dataset repo_id with language instructions")
-    parser.add_argument("--episode", type=int, default=0, help="Episode index to load")
-    parser.add_argument("--sample_index", type=int, default=10, help="Sample index in the episode")
-    parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device to run on")
-    parser.add_argument("--seed", type=int, default=42, help="Random seed")
-    parser.add_argument("--n_obs_steps", type=int, default=1, help="Obs steps for SmolVLA")
-    parser.add_argument("--n_action_steps", type=int, default=50, help="Action steps for SmolVLA")
-    parser.add_argument("--chunk_size", type=int, default=50, help="Chunk size for SmolVLA")
-    parser.add_argument("--num_trials", type=int, default=100, help="Number of timing trials")
-    parser.add_argument("--forwards_per_trial", type=int, default=1, help="Number of forwards per trial")
-    parser.add_argument("--warmup", type=int, default=20, help="Warmup forwards (not timed)")
-    parser.add_argument("--print_each_trial", action="store_true", help="Print each trial's aggregate time")
-    parser.add_argument("--policy_type", type=str, default="smolvla", help="Type of policy to benchmark")
-    args = parser.parse_args()
-
-    # Seed & deterministic-ish setup
-    torch.manual_seed(args.seed)
-    torch.cuda.manual_seed_all(args.seed)
-    torch.backends.cudnn.benchmark = False
-    torch.backends.cudnn.deterministic = False  # leave False to avoid perf cliffs
-
-    # Device
-    use_cuda = args.device == "cuda" and torch.cuda.is_available()
-    device = "cuda" if use_cuda else "cpu"
-    if args.device == "cuda" and not use_cuda:
-        print("[!] CUDA requested but unavailable. Falling back to CPU.")
-
-    # Load dataset metadata
-    ds_meta = LeRobotDatasetMetadata(args.repo_id)
-
-    # Policy config & creation
-    cfg = make_policy_config(
-        args.policy_type,
-        n_obs_steps=args.n_obs_steps,
-        chunk_size=args.chunk_size, # comment this if policy_type = "diffusion"
-        n_action_steps=args.n_action_steps,
-        device=device,
-    )
-
-    policy: PreTrainedPolicy = make_policy(cfg, ds_meta=ds_meta)
-    policy.eval()
-    policy.to(device)
-
-    # Pre/post processors
-    preprocessor, postprocessor = make_pre_post_processors(cfg, dataset_stats=ds_meta.stats)
-
-    # Dataset sample
-    dataset = LeRobotDataset(args.repo_id, episodes=[args.episode])
-    sample = dataset[args.sample_index]
-
-    # Preprocess once; we will reuse the same batch for all forwards (typical for latency bench)
-    preprocessed_batch = preprocessor(sample)
-
-    # Helper to sync for fair timings
-    def _sync():
-        if use_cuda:
-            torch.cuda.synchronize()
-
-    # Warmup (to stabilize kernels/caches)
-    with torch.no_grad():
-        for _ in range(args.warmup):
-            _ = policy.select_action(preprocessed_batch)
-        _sync()
-
-    # Memory footprint before timing
-    process = psutil.Process(os.getpid())
-    rss_before = process.memory_info().rss
-    if use_cuda:
-        torch.cuda.reset_peak_memory_stats()
-
-    # Timing
-    trial_times_sec: List[float] = []
-
-    with torch.no_grad():
-        for t in range(args.num_trials):
-            _sync()
-            t0 = time.perf_counter()
-            for _ in range(args.forwards_per_trial):
-                _ = policy.select_action(preprocessed_batch)
-            _sync()
-            t1 = time.perf_counter()
-            trial_dur = t1 - t0
-            trial_times_sec.append(trial_dur)
-            if args.print_each_trial:
-                print(f"[trial {t+1:03d}] total {trial_dur*1000:.3f} ms "
-                      f"({(trial_dur/args.forwards_per_trial)*1000:.3f} ms/forward)")
-
-    # Memory footprint after timing
-    rss_after = process.memory_info().rss
-    rss_delta = rss_after - rss_before
-    cuda_peak = torch.cuda.max_memory_allocated() if use_cuda else 0
-
-    # Do a single real inference and postprocess to verify everything still works
-    with torch.no_grad():
-        action = policy.select_action(preprocessed_batch)
-    postprocessed_action = postprocessor(action)
-
-    # Summaries
-    # Per-forward latencies in ms
-    per_forward_ms = [(d / args.forwards_per_trial) * 1000.0 for d in trial_times_sec]
-    per_forward_ms_sorted = sorted(per_forward_ms)
-
-    mean_ms = statistics.fmean(per_forward_ms) if per_forward_ms else float("nan")
-    std_ms = statistics.pstdev(per_forward_ms) if len(per_forward_ms) > 1 else 0.0
-    min_ms = per_forward_ms_sorted[0] if per_forward_ms_sorted else float("nan")
-    max_ms = per_forward_ms_sorted[-1] if per_forward_ms_sorted else float("nan")
-    p50_ms = percentile(per_forward_ms_sorted, 50)
-    p95_ms = percentile(per_forward_ms_sorted, 95)
-
-    # Model size
-    num_params = sum(p.numel() for p in policy.parameters())
-
-    print("\n=== Inference Benchmark for ===", args.policy_type)
-    print(f"Device: {device}")
-    print(f"Trials: {args.num_trials} | Forwards/Trial: {args.forwards_per_trial} | Warmup: {args.warmup}")
-    print(f"Model params: {num_params:,}")
-
-    print("\nLatency per forward (ms):")
-    print(f"  mean: {mean_ms:.3f}  std: {std_ms:.3f}")
-    print(f"  min:  {min_ms:.3f}   max: {max_ms:.3f}")
-    print(f"  p50:  {p50_ms:.3f}   p95: {p95_ms:.3f}")
-
-    print("\nMemory footprint:")
-    print(f"  CPU RSS before: {bytes_to_human(rss_before)}")
-    print(f"  CPU RSS after : {bytes_to_human(rss_after)}  (Δ {bytes_to_human(rss_delta)})")
-    if use_cuda:
-        print(f"  CUDA peak allocated: {bytes_to_human(cuda_peak)} "
-              f"(reset by reset_peak_memory_stats before timing)")
-
-    # Quick shape dump from this run
-    try:
-        print("\nAction shapes:")
-        print(f"  raw: {tuple(action.shape)}")
-        print(f"  postprocessed: {tuple(postprocessed_action.shape)}")
-    except Exception:
-        pass
-
-
-if __name__ == "__main__":
-    main()