From fea41b29f5b0e552b1bbda105c722821621cbe4d Mon Sep 17 00:00:00 2001
From: Pepijn
Date: Tue, 5 May 2026 11:31:19 +0200
Subject: [PATCH] fix(datasets): probe parquet for language columns before strict cast
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``_load_hf_dataset`` was building the strict cast schema only from
``meta/info.json["features"]``. Datasets annotated by
``lerobot-annotate`` but still tagged at the older codebase version
(no ``language_persistent`` / ``language_events`` entry in
``info.json``) carry both columns in the parquet itself but not in the
features dict, so ``Dataset.from_parquet`` failed with
``CastError: column names don't match`` when trying to project a
9-column parquet onto a 7-column schema.

Probe one parquet shard's actual schema; if either language column is
present in the parquet but missing from ``features``, graft it on
using PR 1's ``language_persistent_column_feature`` /
``language_events_column_feature`` helpers. No-op when neither column
is present (fully backwards-compatible with v3.0 datasets), no-op when
both are already registered (fully forwards-compatible with future
v3.1 ``info.json`` writes).

This unblocks dry-run inference on PR 2-annotated datasets that
weren't re-tagged to v3.1 — including the ones in the field today.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lerobot/datasets/dataset_reader.py | 58 ++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/src/lerobot/datasets/dataset_reader.py b/src/lerobot/datasets/dataset_reader.py
index 59aaa40e5..ae5934283 100644
--- a/src/lerobot/datasets/dataset_reader.py
+++ b/src/lerobot/datasets/dataset_reader.py
@@ -126,10 +126,68 @@ class DatasetReader:
     def _load_hf_dataset(self) -> datasets.Dataset:
         """hf_dataset contains all the observations, states, actions, rewards, etc."""
         features = get_hf_features_from_features(self._meta.features)
+        # Datasets annotated by ``lerobot-annotate`` may carry the
+        # ``language_persistent`` / ``language_events`` columns in their
+        # parquet shards without registering them in ``meta/info.json``
+        # (e.g. they predate ``CODEBASE_VERSION="v3.1"`` and were
+        # back-filled). Probe one shard and graft the missing column
+        # features on so the strict ``Dataset.from_parquet`` cast doesn't
+        # fail with ``column names don't match``.
+        features = self._extend_features_with_language_columns(features)
         hf_dataset = load_nested_dataset(self.root / "data", features=features, episodes=self.episodes)
         hf_dataset.set_transform(hf_transform_to_torch)
         return hf_dataset
 
+    def _extend_features_with_language_columns(
+        self, features: datasets.Features
+    ) -> datasets.Features:
+        """Graft language columns found in the parquet data onto ``features``.
+
+        ``language_persistent`` / ``language_events`` are added when the
+        probed parquet shard declares them but ``features`` does not.
+
+        Args:
+            features: Cast schema derived from the dataset metadata.
+
+        Returns:
+            ``features`` itself when there is nothing to add (no shard on
+            disk, neither column in the shard, or both already registered);
+            otherwise a new ``datasets.Features`` with the missing language
+            columns appended.
+        """
+        # Probe the lexicographically first shard rather than whatever
+        # ``glob`` happens to yield first: directory iteration order is
+        # filesystem-dependent, and a stable pick keeps the probe
+        # reproducible across machines. With no shard on disk, bail and let
+        # the loader raise its own error as-is.
+        sample = min((self.root / "data").glob("*/*.parquet"), default=None)
+        if sample is None:
+            return features
+
+        from pyarrow import parquet as _pq  # noqa: PLC0415
+
+        from .language import (  # noqa: PLC0415
+            LANGUAGE_EVENTS,
+            LANGUAGE_PERSISTENT,
+            language_events_column_feature,
+            language_persistent_column_feature,
+        )
+
+        # Schema-only read of the probed shard.
+        schema_names = set(_pq.read_schema(sample).names)
+        factories = {
+            LANGUAGE_PERSISTENT: language_persistent_column_feature,
+            LANGUAGE_EVENTS: language_events_column_feature,
+        }
+        extra = {
+            name: make_feature()
+            for name, make_feature in factories.items()
+            if name in schema_names and name not in features
+        }
+        if not extra:
+            return features
+        return datasets.Features({**features, **extra})
+
     def _check_cached_episodes_sufficient(self) -> bool:
         """Check if the cached dataset contains all requested episodes and their video files."""
         if self.hf_dataset is None or len(self.hf_dataset) == 0: