From 27f7829b09c7549e3b1ec661e442729db90abf28 Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Wed, 29 Apr 2026 15:00:23 +0200
Subject: [PATCH] feat(annotate): forward chat_template_kwargs to OpenAI
 extra_body

Lets callers pass per-request template flags such as
{"enable_thinking": false} for Qwen3.5/Qwen3.6 models, where the default
thinking preamble otherwise consumes the entire max_new_tokens budget
before any JSON is emitted.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lerobot/annotations/steerable_pipeline/config.py     | 8 ++++++++
 src/lerobot/annotations/steerable_pipeline/vlm_client.py | 9 ++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 81e1a6a13..297839d06 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -18,6 +18,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import Any
 
 
 @dataclass
@@ -140,6 +141,13 @@ class VlmConfig:
     camera_key: str | None = None
     """Override the camera stream used for keyframe attachment. ``None``
     picks the first ``observation.images.*`` key the dataset declares."""
+    chat_template_kwargs: dict[str, Any] | None = None
+    """Forwarded as ``extra_body.chat_template_kwargs`` on every chat call.
+
+    Use this to pass model-specific template flags such as
+    ``{"enable_thinking": false}`` for Qwen3.5/Qwen3.6 to suppress the
+    reasoning preamble that otherwise eats the entire ``max_new_tokens``
+    budget before any JSON is emitted."""
 
 
 @dataclass
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 1f1f83037..d2659321b 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -380,10 +380,13 @@ def _make_openai_client(config: VlmConfig) -> VlmClient:
         "max_tokens": max_tok,
         "temperature": temp,
     }
+    extra_body: dict[str, Any] = {}
     if send_mm_kwargs and mm_kwargs:
-        kwargs["extra_body"] = {
-            "mm_processor_kwargs": {**mm_kwargs, "do_sample_frames": True}
-        }
+        extra_body["mm_processor_kwargs"] = {**mm_kwargs, "do_sample_frames": True}
+    if config.chat_template_kwargs:
+        extra_body["chat_template_kwargs"] = config.chat_template_kwargs
+    if extra_body:
+        kwargs["extra_body"] = extra_body
     with rr_lock:
         chosen = clients[rr_counter["i"] % len(clients)]
         rr_counter["i"] += 1