fix(audio feature shape): correct audio feature shape ordering (frames first, channels second)

2026-07-24 18:26:11 +00:00 · 2025-04-22 17:06:10 +02:00
parent e714ff22e2
commit e4dd00c8f5
3 changed files with 5 additions and 7 deletions
@@ -1143,11 +1143,9 @@ def validate_feature_audio(name: str, expected_shape: list[str], value: np.ndarr
    if isinstance(value, np.ndarray):
        actual_shape = value.shape
        c = expected_shape
-        if len(actual_shape) != 2 or (
+        if len(actual_shape) != 2 or actual_shape[-1] != c[-1]:  # The number of frames might be different
-            actual_shape[-1] != c[-1] and actual_shape[0] != c[0]
-        ):  # The number of frames might be different
            error_message += (
-                f"The feature '{name}' of shape '{actual_shape}' does not have the expected shape '{(c,)}'.\n"
+                f"The feature '{name}' of shape '{actual_shape}' does not have the expected shape '{c}'.\n"
            )
    else:
        error_message += f"The feature '{name}' is expected to be of type 'np.ndarray', but type '{type(value)}' provided instead.\n"
@@ -82,7 +82,7 @@ def audio_dataset(tmp_path, empty_lerobot_dataset_factory):
    features = {
        "audio": {
            "dtype": "audio",
-            "shape": (DUMMY_AUDIO_CHANNELS,),
+            "shape": (1, DUMMY_AUDIO_CHANNELS),
            "names": [
                "channels",
            ],
@@ -41,8 +41,8 @@ DUMMY_VIDEO_INFO = {
    "has_audio": False,
 }
 DUMMY_MICROPHONE_FEATURES = {
-    "laptop": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
+    "laptop": {"dtype": "audio", "shape": (1, 2), "names": ["channels"], "info": None},
-    "phone": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
+    "phone": {"dtype": "audio", "shape": (1, 2), "names": ["channels"], "info": None},
 }
 DEFAULT_SAMPLE_RATE = 48000
 DUMMY_AUDIO_CHANNELS = 2