From d55b581ca194c92fde8cf251e83ec7b5bd70caf8 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 6 May 2026 14:51:06 +0200
Subject: [PATCH] =?UTF-8?q?fix(language):=20address=20review=20=E2=80=94?=
 =?UTF-8?q?=20tools=20accessor,=20motion=20docs,=20conditional=20collate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* **`meta.tools` actually reads `info.json["tools"]`.** `DatasetInfo`
  had no `tools` field, so `from_dict` silently dropped the key (it
  warned about unknown fields then discarded them) and the property
  always returned `DEFAULT_TOOLS`. Added `tools: list[dict] | None`
  to the dataclass; `to_dict()` drops it when unset so existing
  datasets keep a clean `info.json`. Fixed the accessor to read
  `self.info.tools` (the previous `.get(...)` would have raised
  AttributeError on the dataclass anyway). Added regression tests:
  fallback when absent, round-trip from disk, and round-trip
  through `DatasetInfo.from_dict` / `to_dict`.

* **`motion` is not view-dependent — fix the docs.** The mdx claimed
  rows of style `motion` must carry `camera`, but `VIEW_DEPENDENT_STYLES
  = {"vqa", "trace"}` and the validator agrees: motion primitives are
  joint/Cartesian-frame, not pixel-space. Updated both call-out
  paragraphs in `language_and_recipes.mdx`.

* **Conditional `collate_fn` swap.** Added `meta.has_language_columns`
  and gate the `lerobot_collate_fn` swap in `lerobot_train.py` on it,
  so non-language datasets keep PyTorch's `default_collate`. Also
  added a pass-through test in `test_collate.py` that asserts on a
  plain tensor batch the custom collate matches `default_collate`
  key-for-key, plus a test for the `None`-sample drop path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/language_and_recipes.mdx     | 11 ++--
 src/lerobot/datasets/dataset_metadata.py | 15 ++++-
 src/lerobot/datasets/utils.py            |  7 ++
 src/lerobot/scripts/lerobot_train.py     |  6 +-
 tests/datasets/test_dataset_metadata.py  | 81 ++++++++++++++++++++++++
 tests/utils/test_collate.py              | 44 +++++++++++++
 6 files changed, 156 insertions(+), 8 deletions(-)

diff --git a/docs/source/language_and_recipes.mdx b/docs/source/language_and_recipes.mdx
index 82401c8f6..7eb588ce8 100644
--- a/docs/source/language_and_recipes.mdx
+++ b/docs/source/language_and_recipes.mdx
@@ -46,10 +46,11 @@ tool_calls: list[Json] | null
 ```
 
 The `camera` field tags rows whose `content` is grounded in a specific camera
-view. Rows of view-dependent styles (`vqa`, and the reserved `motion` /
-`trace`) MUST set `camera` to the matching `observation.images.*` feature key.
-Rows of every other style MUST leave `camera` as `null`. Pipeline writers and
-the validator enforce this via `validate_camera_field(style, camera)`.
+view. Rows of view-dependent styles (`vqa` and `trace`) MUST set `camera` to
+the matching `observation.images.*` feature key. Rows of every other style —
+including `motion`, which describes robot-frame primitives in joint / Cartesian
+terms — MUST leave `camera` as `null`. Pipeline writers and the validator
+enforce this via `validate_camera_field(style, camera)`.
 
 `meta/tasks.parquet` remains the canonical source for the task. The special `${task}` recipe binding always reads that task string and does not depend on language annotations.
 
@@ -81,7 +82,7 @@ Exact event matching has no tolerance window, so writers must stamp event rows w
 
 ### View-dependent resolution
 
-For view-dependent styles (`vqa`, `motion`, `trace`), the resolver gains a
+For view-dependent styles (`vqa` and `trace`), the resolver gains a
 `camera=` filter parallel to `role=` and `tool_name=`. Datasets with multiple
 cameras typically emit one (`vqa`, `user`) + (`vqa`, `assistant`) pair per
 camera at the same timestamp; without `camera=`, those resolvers see two
diff --git a/src/lerobot/datasets/dataset_metadata.py b/src/lerobot/datasets/dataset_metadata.py
index 99aec7a05..81c12c090 100644
--- a/src/lerobot/datasets/dataset_metadata.py
+++ b/src/lerobot/datasets/dataset_metadata.py
@@ -316,6 +316,17 @@ class LeRobotDatasetMetadata:
         """Keys to access visual modalities (regardless of their storage method)."""
         return [key for key, ft in self.features.items() if ft["dtype"] in ["video", "image"]]
 
+    @property
+    def has_language_columns(self) -> bool:
+        """Return ``True`` if the dataset declares any language column.
+
+        Used to gate language-aware code paths (collate, render step) so
+        unannotated datasets keep PyTorch's default collate behavior.
+        """
+        from .language import LANGUAGE_COLUMNS  # noqa: PLC0415  (avoid circular import)
+
+        return any(col in self.features for col in LANGUAGE_COLUMNS)
+
     @property
     def tools(self) -> list[dict]:
         """OpenAI-style tool schemas declared by this dataset.
@@ -333,8 +344,8 @@ class LeRobotDatasetMetadata:
         """
         from .language import DEFAULT_TOOLS  # noqa: PLC0415  (avoid circular import)
 
-        declared = self.info.get("tools")
-        if isinstance(declared, list) and declared:
+        declared = self.info.tools
+        if declared:
             return [dict(t) for t in declared]
         return [dict(t) for t in DEFAULT_TOOLS]
 
diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py
index efccbf23e..de91978ea 100644
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -129,6 +129,9 @@ class DatasetInfo:
     # Optional metadata
     robot_type: str | None = None
     splits: dict[str, str] = field(default_factory=dict)
+    # OpenAI-style tool schemas declared by the dataset. ``None`` means the
+    # dataset doesn't declare any — readers fall back to ``DEFAULT_TOOLS``.
+    tools: list[dict] | None = None
 
     def __post_init__(self) -> None:
         # Coerce feature shapes from list to tuple — JSON deserialisation
@@ -150,11 +153,15 @@ class DatasetInfo:
         """Return a JSON-serialisable dict.
 
         Converts tuple shapes back to lists so ``json.dump`` can handle them.
+        Drops ``tools`` when unset so existing datasets keep a clean
+        ``info.json``.
         """
         d = dataclasses.asdict(self)
         for ft in d["features"].values():
             if isinstance(ft.get("shape"), tuple):
                 ft["shape"] = list(ft["shape"])
+        if d.get("tools") is None:
+            d.pop("tools", None)
         return d
 
     @classmethod
diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index ac4893a42..463668eb2 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -402,6 +402,10 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
         shuffle = True
         sampler = None
 
+    # Only swap in the language-aware collate when the dataset actually
+    # declares language columns; otherwise stay on PyTorch's default
+    # collate so non-language training runs are unaffected.
+    collate_fn = lerobot_collate_fn if dataset.meta.has_language_columns else None
     dataloader = torch.utils.data.DataLoader(
         dataset,
         num_workers=cfg.num_workers,
@@ -410,7 +414,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
         sampler=sampler,
         pin_memory=device.type == "cuda",
         drop_last=False,
-        collate_fn=lerobot_collate_fn,
+        collate_fn=collate_fn,
         prefetch_factor=cfg.prefetch_factor if cfg.num_workers > 0 else None,
         persistent_workers=cfg.persistent_workers and cfg.num_workers > 0,
     )
diff --git a/tests/datasets/test_dataset_metadata.py b/tests/datasets/test_dataset_metadata.py
index 6c784c90b..7235db4a3 100644
--- a/tests/datasets/test_dataset_metadata.py
+++ b/tests/datasets/test_dataset_metadata.py
@@ -385,3 +385,84 @@ def test_finalize_flushes_buffered_metadata(tmp_path):
     assert episodes_dir.exists()
     parquet_files = list(episodes_dir.rglob("*.parquet"))
     assert len(parquet_files) > 0
+
+
+# ── Tools accessor ───────────────────────────────────────────────────
+
+
+def test_tools_falls_back_to_default_when_info_has_no_tools_field(tmp_path):
+    """meta.tools returns DEFAULT_TOOLS when info.json doesn't declare any."""
+    from lerobot.datasets.language import DEFAULT_TOOLS
+
+    root = tmp_path / "no_tools"
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/no_tools",
+        fps=DEFAULT_FPS,
+        features=SIMPLE_FEATURES,
+        root=root,
+        use_videos=False,
+    )
+
+    assert meta.tools == DEFAULT_TOOLS
+    # info.json on disk should NOT include a `tools` key for clean datasets
+    with open(root / INFO_PATH) as f:
+        info_on_disk = json.load(f)
+    assert "tools" not in info_on_disk
+
+
+def test_tools_reads_declared_tools_from_info_json(tmp_path):
+    """A `tools` list written into info.json survives load → meta.tools.
+
+    Regression test for the bug where ``DatasetInfo.from_dict`` silently
+    dropped the ``tools`` key (no matching dataclass field), so
+    ``meta.tools`` always returned ``DEFAULT_TOOLS`` regardless of
+    what was on disk.
+    """
+    from lerobot.datasets.io_utils import load_info
+
+    root = tmp_path / "with_tools"
+    meta = LeRobotDatasetMetadata.create(
+        repo_id="test/with_tools",
+        fps=DEFAULT_FPS,
+        features=SIMPLE_FEATURES,
+        root=root,
+        use_videos=False,
+    )
+
+    custom_tool = {
+        "type": "function",
+        "function": {
+            "name": "record_observation",
+            "description": "Capture a still image.",
+            "parameters": {
+                "type": "object",
+                "properties": {"label": {"type": "string"}},
+                "required": ["label"],
+            },
+        },
+    }
+    info_path = root / INFO_PATH
+    with open(info_path) as f:
+        raw = json.load(f)
+    raw["tools"] = [custom_tool]
+    with open(info_path, "w") as f:
+        json.dump(raw, f)
+
+    # Reload info from disk and rebind it on the metadata object
+    meta.info = load_info(root)
+    assert meta.tools == [custom_tool]
+
+
+def test_tools_round_trip_through_dataset_info(tmp_path):
+    """A `tools` list survives DatasetInfo.from_dict / to_dict."""
+    from lerobot.datasets.utils import DatasetInfo
+
+    raw = {
+        "codebase_version": "v3.1",
+        "fps": 30,
+        "features": SIMPLE_FEATURES,
+        "tools": [{"type": "function", "function": {"name": "say"}}],
+    }
+    info = DatasetInfo.from_dict(raw)
+    assert info.tools == raw["tools"]
+    assert info.to_dict()["tools"] == raw["tools"]
diff --git a/tests/utils/test_collate.py b/tests/utils/test_collate.py
index b37c71b2d..2b23b3180 100644
--- a/tests/utils/test_collate.py
+++ b/tests/utils/test_collate.py
@@ -38,3 +38,47 @@ def test_lerobot_collate_preserves_messages_and_drops_raw_language():
     assert out["target_message_indices"] == [[0], [0]]
     assert "language_persistent" not in out
     assert "language_events" not in out
+
+
+def test_lerobot_collate_passes_through_standard_batch():
+    """On a non-language batch, the collate must match ``default_collate``.
+
+    Guards against silent regressions: ``lerobot_train.py`` only opts into
+    ``lerobot_collate_fn`` when the dataset declares language columns, but
+    if a future change ever wires it in unconditionally we want the
+    behavior to remain a transparent pass-through for ordinary tensor
+    batches.
+    """
+    from torch.utils.data._utils.collate import default_collate
+
+    batch = [
+        {
+            "observation.image": torch.zeros(3, 4, 4),
+            "action": torch.tensor([0.0, 1.0]),
+            "index": torch.tensor(0),
+        },
+        {
+            "observation.image": torch.ones(3, 4, 4),
+            "action": torch.tensor([2.0, 3.0]),
+            "index": torch.tensor(1),
+        },
+    ]
+
+    custom = lerobot_collate_fn(batch)
+    expected = default_collate(batch)
+
+    assert custom.keys() == expected.keys()
+    for key in expected:
+        assert torch.equal(custom[key], expected[key]), f"key={key} diverged"
+
+
+def test_lerobot_collate_drops_none_samples():
+    """Recipes that yielded no target message return ``None`` — those samples
+    must be filtered out, and an entirely-``None`` batch must collapse to ``None``.
+    """
+    batch = [None, {"index": torch.tensor(0)}, None]
+    out = lerobot_collate_fn(batch)
+    assert out is not None
+    assert out["index"].tolist() == [0]
+
+    assert lerobot_collate_fn([None, None]) is None