diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index 03e902dfe..5fc04bcdb 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -365,23 +365,24 @@ class HighLevelSubtaskFwd(InferenceStep):
             return None
         ctx = _msgs_for_subtask(state)
         observation = _maybe_observation(self.observation_provider)
-        # Force the head to commit to ≥ 5 real tokens before it can
-        # close the turn, and sample at moderate temperature with
-        # nucleus filtering. On a memorised head whose argmax at
-        # position 0 is EOS, greedy decoding silently produced empty
-        # completions every chunk boundary (visible as the
-        # ``empty:N`` counter climbing). Temp 0.4 + top_p 0.9 is well
-        # below where SmolVLM goes incoherent and above where greedy
-        # collapse re-emerges.
+        # Match training: greedy argmax, no min_new_tokens, no
+        # special-token suppression. Earlier experiments forced
+        # min_new_tokens=5 + sampling because the LM head was
+        # collapsing to EOS at position 0 — but that turned out to
+        # be a visual-distribution shift (camera frames being fed
+        # at the camera's native resolution rather than the
+        # dataset's recorded resolution), not a head pathology.
+        # With each camera frame resized to the shape recorded in
+        # ``ds_features['observation.images.*']['shape']``, the
+        # visual prefix is back on-distribution and the same greedy
+        # decoding that works in the ``--no_robot`` dry-run also
+        # works on the live robot.
         msg = _generate_with_policy(
             self.policy,
             ctx,
             observation=observation,
             state=state,
             label="subtask gen",
-            min_new_tokens=5,
-            temperature=0.4,
-            top_p=0.9,
         )
         # Diagnostics: surface what the model is *actually* producing
         # at chunk boundaries, even when the output gets rejected or
@@ -474,9 +475,6 @@ class MemoryUpdateFwd(InferenceStep):
             observation=observation,
             state=state,
             label="memory gen",
-            min_new_tokens=5,
-            temperature=0.4,
-            top_p=0.9,
         )
         state["last_memory_raw"] = new_memory or ""
         if new_memory and _looks_like_gibberish(new_memory):
@@ -520,9 +518,6 @@ class UserInterjectionFwd(InferenceStep):
             observation=observation,
             state=state,
             label="plan/say gen",
-            min_new_tokens=10,
-            temperature=0.5,
-            top_p=0.9,
         )
         if not out:
             # Don't log every empty completion — happens repeatedly on
@@ -592,9 +587,6 @@ class AskVQAFwd(InferenceStep):
             observation=observation,
             state=state,
             label="vqa gen",
-            min_new_tokens=3,
-            temperature=0.4,
-            top_p=0.9,
         )
         # VQA answers are intentionally JSON-like during training, so
         # ``_looks_like_gibberish`` would false-positive on them. Keep
diff --git a/src/lerobot/scripts/lerobot_smolvla2_runtime.py b/src/lerobot/scripts/lerobot_smolvla2_runtime.py
index d7c567b50..9b0613874 100644
--- a/src/lerobot/scripts/lerobot_smolvla2_runtime.py
+++ b/src/lerobot/scripts/lerobot_smolvla2_runtime.py
@@ -594,6 +594,40 @@ def _build_robot_observation_provider(
         getattr(robot, "config", None), "type", None
     )
 
+    # Pre-compute the camera-key → target (H, W) map from
+    # ``ds_features``. The training distribution sees frames at the
+    # recorded resolution (e.g. 480×640); a live Mac/USB camera will
+    # almost always hand us a different native size (720p / 1080p).
+    # SmolVLA's internal ``resize_with_pad(512, 512)`` does pad the
+    # input to a fixed canvas, but the *geometry* of that pad differs
+    # by input aspect ratio — top/left padding varies, so the visual
+    # tokens at each tile carry different content than what the model
+    # saw at training. The action expert tolerates this (the flow
+    # head relies on broad geometry); the LM head, supervised much
+    # more tightly on visual features, goes out of distribution and
+    # its output at position 0 collapses to its dominant mode
+    # (a memorised ``\n``-only run in this checkpoint).
+    target_image_shapes: dict[str, tuple[int, int]] = {}
+    if ds_features:
+        for fkey, fmeta in ds_features.items():
+            if not isinstance(fmeta, dict):
+                continue
+            dtype = fmeta.get("dtype")
+            if dtype not in ("image", "video"):
+                continue
+            shape = fmeta.get("shape")
+            if not shape or len(shape) != 3:
+                continue
+            names = fmeta.get("names") or []
+            # Feature schema stores either (H, W, C) or (C, H, W); use the
+            # ``names`` ordering to disambiguate, else assume (H, W, C).
+            if names and len(names) == 3 and names[0] == "channels":
+                _, h, w = shape
+            else:
+                h, w, _ = shape
+            cam_key = fkey.removeprefix("observation.images.")
+            target_image_shapes[cam_key] = (int(h), int(w))
+
     def _provider() -> dict | None:
         try:
             raw = robot.get_observation()
@@ -606,6 +640,32 @@ def _build_robot_observation_provider(
         for k in ("language_persistent", "language_events"):
             raw.pop(k, None)
 
+        # Force-match the training-time visual distribution:
+        # every camera frame the model trained on came from the
+        # dataset at its recorded (H, W). Resize the live frame to
+        # that exact shape so the downstream resize_with_pad geometry
+        # matches training. Without this the LM head is OOD on every
+        # tick.
+        if target_image_shapes:
+            try:
+                import cv2 as _cv2  # noqa: PLC0415
+                import numpy as _np  # noqa: PLC0415
+
+                for cam_key, (target_h, target_w) in target_image_shapes.items():
+                    img = raw.get(cam_key)
+                    if img is None or not isinstance(img, _np.ndarray):
+                        continue
+                    if img.ndim != 3:
+                        continue
+                    cur_h, cur_w = img.shape[:2]
+                    if (cur_h, cur_w) == (target_h, target_w):
+                        continue
+                    raw[cam_key] = _cv2.resize(
+                        img, (target_w, target_h), interpolation=_cv2.INTER_AREA
+                    )
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("camera resize to dataset shape failed: %s", exc)
+
         try:
             if ds_features:
                 # Use the dataset's feature schema to pick the right