tests(update): updating tests

fix(debug log): avoiding spamming warning log with debug log
feat(datasets): warn when skipping stats for zero-width features
2026-07-04 16:47:14 +00:00 · 2026-07-03 13:49:38 +02:00 · 2026-07-03 13:37:02 +02:00 · 2026-07-03 13:35:22 +02:00 · 2026-07-03 13:35:22 +02:00 · 2026-07-03 13:35:22 +02:00
4 changed files with 41 additions and 3 deletions
@@ -519,6 +519,13 @@ def compute_episode_stats(
        if features[key]["dtype"] in {"string", "language"}:
            continue

+        # Features with zero-width shapes are skipped (no data to compute stats on)
+        if any(d == 0 for d in features[key].get("shape", ())):
+            logging.debug(
+                f"Skipping statistics computation for feature '{key}' with a zero-width shape {features[key]['shape']}."
+            )
+            continue
+
        if features[key]["dtype"] in ["image", "video"]:
            ep_ft_array = sample_images(data)
            axes_to_reduce = (0, 2, 3)
@@ -67,9 +67,9 @@ def get_hf_features_from_features(features: dict) -> datasets.Features:
        elif ft["shape"] == (1,):
            hf_features[key] = datasets.Value(dtype=ft["dtype"])
        elif len(ft["shape"]) == 1:
-            hf_features[key] = datasets.Sequence(
-                length=ft["shape"][0], feature=datasets.Value(dtype=ft["dtype"])
-            )
+            # pyarrow rejects fixed-size lists of length 0, so use a variable length list instead
+            length = ft["shape"][0] if ft["shape"][0] > 0 else -1
+            hf_features[key] = datasets.Sequence(length=length, feature=datasets.Value(dtype=ft["dtype"]))
        elif len(ft["shape"]) == 2:
            hf_features[key] = datasets.Array2D(shape=ft["shape"], dtype=ft["dtype"])
        elif len(ft["shape"]) == 3:
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from unittest.mock import patch

 import numpy as np
@@ -687,6 +688,28 @@ def test_compute_episode_stats_string_features_skipped():
    assert "q01" in stats["action"]


+def test_compute_episode_stats_zero_width_features_skipped(caplog):
+    """Test that features with a zero-width dim (e.g. shape=(0,)) are skipped with a debug log."""
+    episode_data = {
+        "empty": np.zeros((100, 0), dtype=np.float32),  # Zero-width feature
+        "action": np.random.normal(0, 1, (100, 5)),
+    }
+    features = {
+        "empty": {"dtype": "float32", "shape": (0,)},
+        "action": {"dtype": "float32", "shape": (5,)},
+    }
+
+    with caplog.at_level(logging.DEBUG):
+        stats = compute_episode_stats(episode_data, features)
+
+    # Zero-width features should be skipped with a debug log, others computed as usual
+    assert "empty" not in stats
+    assert "empty" in caplog.text
+    assert "action" in stats
+    assert "q01" in stats["action"]
+    assert stats["action"]["mean"].shape == (5,)
+
+
 def test_aggregate_feature_stats_with_quantiles():
    """Test aggregating feature stats that include quantiles."""
    stats_ft_list = [
@@ -1804,3 +1804,11 @@ def test_episode_filter_unknown_key_raises(tmp_path, lerobot_dataset_factory):
            root=dataset.root,
            episode_filter=lambda ep: ep["not_a_real_field"] > 0,
        )
+
+
+def test_get_hf_features_zero_width_feature_does_not_raise_on_from_dict():
+    import datasets
+
+    features = {"empty": {"dtype": "float32", "shape": (0,), "names": ["empty"]}}
+    hf_features = get_hf_features_from_features(features)
+    datasets.Dataset.from_dict({"empty": [[], []]}, features=hf_features)
Author	SHA1	Message	Date
CarolinePascal	e36b0368d4	tests(update): updating tests	2026-07-03 13:49:38 +02:00
CarolinePascal	67b18d87b2	fix(debug log): avoiding spamming warning log with debug log	2026-07-03 13:37:02 +02:00
Mahbod	98052e5f6e	feat(datasets): warn when skipping stats for zero-width features Per review, log a warning when compute_episode_stats skips a feature with a zero-width shape, so users know stats were intentionally not computed for it.	2026-07-03 13:35:22 +02:00
Mahbod	f59260f4aa	fix(datasets): skip zero-width features in compute_episode_stats `LeRobotDataset.save_episode()` raised `ValueError: cannot reshape array of size 0 into shape (0)` whenever a declared non-string feature had a zero-width dimension (e.g. `shape=(0,)`). The root cause was `compute_episode_stats` running stats on every non-string/language feature, then `RunningQuantileStats.update` calling `batch.reshape(-1, batch.shape[-1])` on the empty array. Skip features whose declared `shape` contains a zero dim, mirroring the existing skip for `string` / `language` dtype features. Fixes #3654	2026-07-03 13:35:22 +02:00
Mahbod	fc262fbc06	fix(datasets): allow zero-width features in get_hf_features_from_features Setting a 1-D feature with shape=(0,) builds datasets.Sequence(length=0, ...), which pyarrow rejects with ArrowInvalid: list_size needs to be a strict positive integer when datasets.Dataset.from_dict(...) is called inside save_episode. Use length=-1 (variable-length) for zero-width 1-D shapes. Fixes the second half of #3654 (the first half is #3664, in compute_episode_stats).	2026-07-03 13:35:22 +02:00