From be09a59e050216b4ce69ac390b850058119d2e67 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Mon, 7 Apr 2025 16:36:04 +0200
Subject: [PATCH] Add audio tests and supporting constants

---
 src/lerobot/__init__.py              |  5 +++
 src/lerobot/utils/constants.py       |  1 +
 tests/conftest.py                    |  2 ++
 tests/datasets/test_compute_stats.py | 40 +++++++++++++++++++++---
 tests/datasets/test_datasets.py      | 46 +++++++++++++++++++++++++++-
 tests/fixtures/constants.py          | 13 ++++++++
 tests/fixtures/dataset_factories.py  | 14 ++++++++-
 tests/utils.py                       |  9 +++++-
 8 files changed, 122 insertions(+), 8 deletions(-)

diff --git a/src/lerobot/__init__.py b/src/lerobot/__init__.py
index eec574296..90073d213 100644
--- a/src/lerobot/__init__.py
+++ b/src/lerobot/__init__.py
@@ -174,6 +174,11 @@ available_cameras = [
     "intelrealsense",
 ]
 
+# lists all available microphones from `lerobot/microphones`
+available_microphones = [
+    "microphone",
+]
+
 # lists all available motors from `lerobot/motors`
 available_motors = [
     "dynamixel",
diff --git a/src/lerobot/utils/constants.py b/src/lerobot/utils/constants.py
index 43a61b4f7..9f33732d5 100644
--- a/src/lerobot/utils/constants.py
+++ b/src/lerobot/utils/constants.py
@@ -23,6 +23,7 @@ OBS_ENV_STATE = OBS_STR + ".environment_state"
 OBS_STATE = OBS_STR + ".state"
 OBS_IMAGE = OBS_STR + ".image"
 OBS_IMAGES = OBS_IMAGE + "s"
+OBS_AUDIO = OBS_STR + ".audio"
 OBS_LANGUAGE = OBS_STR + ".language"
 OBS_LANGUAGE_TOKENS = OBS_LANGUAGE + ".tokens"
 OBS_LANGUAGE_ATTENTION_MASK = OBS_LANGUAGE + ".attention_mask"
diff --git a/tests/conftest.py b/tests/conftest.py
index 2fcf878ab..924f06eca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -57,6 +57,8 @@ def _check_component_availability(component_type, available_components, make_com
             print("\nNo physical device detected.")
         elif isinstance(e, ValueError) and "camera_index" in str(e):
             print("\nNo physical camera detected.")
+        elif isinstance(e, ValueError) and "microphone_index" in str(e):
+            print("\nNo physical microphone detected.")
         else:
             traceback.print_exc()
 
diff --git a/tests/datasets/test_compute_stats.py b/tests/datasets/test_compute_stats.py
index 973c80bd8..62531685b 100644
--- a/tests/datasets/test_compute_stats.py
+++ b/tests/datasets/test_compute_stats.py
@@ -26,16 +26,21 @@ from lerobot.datasets.compute_stats import (
     compute_episode_stats,
     estimate_num_samples,
     get_feature_stats,
+    sample_audio,
     sample_images,
     sample_indices,
 )
-from lerobot.utils.constants import OBS_IMAGE, OBS_STATE
+from lerobot.utils.constants import OBS_AUDIO, OBS_IMAGE, OBS_STATE
 
 
 def mock_load_image_as_numpy(path, dtype, channel_first):
     return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype)
 
 
+def mock_load_audio(path):
+    return np.ones((16000, 2), dtype=np.float32)
+
+
 @pytest.fixture
 def sample_array():
     return np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
@@ -73,6 +78,16 @@ def test_sample_images(mock_load):
     assert len(images) == estimate_num_samples(100)
 
 
+@patch("lerobot.datasets.compute_stats.load_audio", side_effect=mock_load_audio)
+def test_sample_audio(mock_load):
+    audio_path = "audio.wav"
+    audio_samples = sample_audio(audio_path)
+    assert isinstance(audio_samples, np.ndarray)
+    assert audio_samples.shape[1] == 2
+    assert audio_samples.dtype == np.float32
+    assert len(audio_samples) == estimate_num_samples(16000)
+
+
 def test_get_feature_stats_images():
     data = np.random.rand(100, 3, 32, 32)
     stats = get_feature_stats(data, axis=(0, 2, 3), keepdims=True)
@@ -81,6 +96,14 @@ def test_get_feature_stats_images():
     assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
 
 
+def test_get_feature_stats_audio():
+    data = np.random.uniform(-1, 1, (16000, 2))
+    stats = get_feature_stats(data, axis=0, keepdims=True)
+    assert "min" in stats and "max" in stats and "mean" in stats and "std" in stats and "count" in stats
+    np.testing.assert_equal(stats["count"], np.array([16000]))
+    assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
+
+
 def test_get_feature_stats_axis_0_keepdims(sample_array):
     expected = {
         "min": np.array([[1, 2, 3]]),
@@ -145,20 +168,27 @@ def test_get_feature_stats_single_value():
 def test_compute_episode_stats():
     episode_data = {
         OBS_IMAGE: [f"image_{i}.jpg" for i in range(100)],
+        OBS_AUDIO: "audio.wav",
         OBS_STATE: np.random.rand(100, 10),
     }
     features = {
         OBS_IMAGE: {"dtype": "image"},
+        OBS_AUDIO: {"dtype": "audio"},
         OBS_STATE: {"dtype": "numeric"},
     }
 
-    with patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy):
+    with (
+        patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy),
+        patch("lerobot.datasets.compute_stats.load_audio", side_effect=mock_load_audio),
+    ):
         stats = compute_episode_stats(episode_data, features)
 
-    assert OBS_IMAGE in stats and OBS_STATE in stats
-    assert stats[OBS_IMAGE]["count"].item() == 100
-    assert stats[OBS_STATE]["count"].item() == 100
+    assert OBS_IMAGE in stats and OBS_AUDIO in stats and OBS_STATE in stats
+    assert stats[OBS_IMAGE]["count"].item() == estimate_num_samples(100)
+    assert stats[OBS_AUDIO]["count"].item() == estimate_num_samples(16000)
+    assert stats[OBS_STATE]["count"].item() == estimate_num_samples(100)
     assert stats[OBS_IMAGE]["mean"].shape == (3, 1, 1)
+    assert stats[OBS_AUDIO]["mean"].shape == (1, 2)
 
 
 def test_assert_type_and_shape_valid():
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
index 27c51b3c4..e5ee05cee 100644
--- a/tests/datasets/test_datasets.py
+++ b/tests/datasets/test_datasets.py
@@ -37,6 +37,7 @@ from lerobot.datasets.lerobot_dataset import (
     _encode_video_worker,
 )
 from lerobot.datasets.utils import (
+    DEFAULT_AUDIO_CHUNK_DURATION,
     DEFAULT_CHUNK_SIZE,
     DEFAULT_DATA_FILE_SIZE_IN_MB,
     DEFAULT_VIDEO_FILE_SIZE_IN_MB,
@@ -49,7 +50,13 @@ from lerobot.envs.factory import make_env_config
 from lerobot.policies.factory import make_policy_config
 from lerobot.robots import make_robot_from_config
 from lerobot.utils.constants import ACTION, DONE, OBS_IMAGES, OBS_STATE, OBS_STR, REWARD
-from tests.fixtures.constants import DUMMY_CHW, DUMMY_HWC, DUMMY_REPO_ID
+from tests.fixtures.constants import (
+    DEFAULT_SAMPLE_RATE,
+    DUMMY_AUDIO_CHANNELS,
+    DUMMY_CHW,
+    DUMMY_HWC,
+    DUMMY_REPO_ID,
+)
 from tests.mocks.mock_robot import MockRobotConfig
 from tests.utils import require_x86_64_kernel
 
@@ -70,6 +77,20 @@ def image_dataset(tmp_path, empty_lerobot_dataset_factory):
     return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
 
 
+@pytest.fixture
+def audio_dataset(tmp_path, empty_lerobot_dataset_factory):
+    features = {
+        "audio": {
+            "dtype": "audio",
+            "shape": (DUMMY_AUDIO_CHANNELS,),
+            "names": [
+                "channels",
+            ],
+        }
+    }
+    return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
+
+
 def test_same_attributes_defined(tmp_path, lerobot_dataset_factory):
     """
     Instantiate a LeRobotDataset both ways with '__init__()' and 'create()' and verify that instantiated
@@ -411,6 +432,23 @@ def test_tmp_mixed_deletion(tmp_path, empty_lerobot_dataset_factory):
     )
 
 
+def test_add_frame_audio(audio_dataset):
+    dataset = audio_dataset
+    dataset.add_frame(
+        {
+            "audio": np.random.rand(
+                int(DEFAULT_AUDIO_CHUNK_DURATION * DEFAULT_SAMPLE_RATE), DUMMY_AUDIO_CHANNELS
+            )
+        },
+        task="Dummy task",
+    )
+    dataset.save_episode()
+
+    assert dataset[0]["audio"].shape == torch.Size(
+        (int(DEFAULT_AUDIO_CHUNK_DURATION * DEFAULT_SAMPLE_RATE), DUMMY_AUDIO_CHANNELS)
+    )
+
+
 # TODO(aliberts):
 # - [ ] test various attributes & state from init and create
 # - [ ] test init with episodes and check num_frames
@@ -450,6 +488,7 @@ def test_factory(env_name, repo_id, policy_name):
     dataset = make_dataset(cfg)
     delta_timestamps = dataset.delta_timestamps
     camera_keys = dataset.meta.camera_keys
+    audio_keys = dataset.meta.audio_keys
 
     item = dataset[0]
 
@@ -492,6 +531,11 @@ def test_factory(env_name, repo_id, policy_name):
                 # test c,h,w
                 assert item[key].shape[0] == 3, f"{key}"
 
+        for key in audio_keys:
+            assert item[key].dtype == torch.float32, f"{key}"
+            assert item[key].max() <= 1.0, f"{key}"
+            assert item[key].min() >= -1.0, f"{key}"
+
     if delta_timestamps is not None:
         # test missing keys in delta_timestamps
         for key in delta_timestamps:
diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py
index 35d8776ce..017f5e54a 100644
--- a/tests/fixtures/constants.py
+++ b/tests/fixtures/constants.py
@@ -40,5 +40,18 @@ DUMMY_VIDEO_INFO = {
     "video.is_depth_map": False,
     "has_audio": False,
 }
+DUMMY_MICROPHONE_FEATURES = {
+    "laptop": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
+    "phone": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
+}
+DEFAULT_SAMPLE_RATE = 48000
+DUMMY_AUDIO_CHANNELS = 2
+DUMMY_AUDIO_INFO = {
+    "has_audio": True,
+    "audio.sample_rate": DEFAULT_SAMPLE_RATE,
+    "audio.codec": "aac",
+    "audio.channels": DUMMY_AUDIO_CHANNELS,
+    "audio.channel_layout": "stereo",
+}
 DUMMY_CHW = (3, 96, 128)
 DUMMY_HWC = (96, 128, 3)
diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py
index e98e626e2..84877b8a6 100644
--- a/tests/fixtures/dataset_factories.py
+++ b/tests/fixtures/dataset_factories.py
@@ -43,6 +43,7 @@ from lerobot.datasets.video_utils import encode_video_frames
 from tests.fixtures.constants import (
     DEFAULT_FPS,
     DUMMY_CAMERA_FEATURES,
+    DUMMY_MICROPHONE_FEATURES,
     DUMMY_MOTOR_FEATURES,
     DUMMY_REPO_ID,
     DUMMY_ROBOT_TYPE,
@@ -131,6 +132,7 @@ def features_factory():
     def _create_features(
         motor_features: dict = DUMMY_MOTOR_FEATURES,
         camera_features: dict = DUMMY_CAMERA_FEATURES,
+        audio_features: dict = DUMMY_MICROPHONE_FEATURES,
         use_videos: bool = True,
     ) -> dict:
         if use_videos:
@@ -142,6 +144,7 @@ def features_factory():
         return {
             **motor_features,
             **camera_ft,
+            **audio_features,
             **DEFAULT_FEATURES,
         }
 
@@ -166,9 +169,10 @@ def info_factory(features_factory):
         audio_path: str = DEFAULT_AUDIO_PATH,
         motor_features: dict = DUMMY_MOTOR_FEATURES,
         camera_features: dict = DUMMY_CAMERA_FEATURES,
+        audio_features: dict = DUMMY_MICROPHONE_FEATURES,
         use_videos: bool = True,
     ) -> dict:
-        features = features_factory(motor_features, camera_features, use_videos)
+        features = features_factory(motor_features, camera_features, audio_features, use_videos)
         return {
             "codebase_version": codebase_version,
             "robot_type": robot_type,
@@ -207,6 +211,14 @@ def stats_factory():
                     "std": np.full((3, 1, 1), 0.25, dtype=np.float32).tolist(),
                     "count": [10],
                 }
+            elif dtype == "audio":
+                stats[key] = {
+                    "mean": np.full((shape[0],), 0.0, dtype=np.float32).tolist(),
+                    "max": np.full((shape[0],), 1, dtype=np.float32).tolist(),
+                    "min": np.full((shape[0],), -1, dtype=np.float32).tolist(),
+                    "std": np.full((shape[0],), 0.5, dtype=np.float32).tolist(),
+                    "count": [10],
+                }
             else:
                 stats[key] = {
                     "max": np.full(shape, 1, dtype=dtype).tolist(),
diff --git a/tests/utils.py b/tests/utils.py
index 38841db02..cbbf34694 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -20,7 +20,7 @@ from functools import wraps
 import pytest
 import torch
 
-from lerobot import available_cameras, available_motors, available_robots
+from lerobot import available_cameras, available_microphones, available_motors, available_robots
 from lerobot.utils.import_utils import is_package_available
 
 DEVICE = os.environ.get("LEROBOT_TEST_DEVICE", "cuda") if torch.cuda.is_available() else "cpu"
@@ -33,6 +33,10 @@ TEST_CAMERA_TYPES = []
 for camera_type in available_cameras:
     TEST_CAMERA_TYPES += [(camera_type, True), (camera_type, False)]
 
+TEST_MICROPHONE_TYPES = []
+for microphone_type in available_microphones:
+    TEST_MICROPHONE_TYPES += [(microphone_type, True), (microphone_type, False)]
+
 TEST_MOTOR_TYPES = []
 for motor_type in available_motors:
     TEST_MOTOR_TYPES += [(motor_type, True), (motor_type, False)]
@@ -41,6 +45,9 @@ for motor_type in available_motors:
 OPENCV_CAMERA_INDEX = int(os.environ.get("LEROBOT_TEST_OPENCV_CAMERA_INDEX", 0))
 INTELREALSENSE_SERIAL_NUMBER = int(os.environ.get("LEROBOT_TEST_INTELREALSENSE_SERIAL_NUMBER", 128422271614))
 
+# Microphone index used for connecting a physical microphone
+MICROPHONE_INDEX = int(os.environ.get("LEROBOT_TEST_MICROPHONE_INDEX", 0))
+
 DYNAMIXEL_PORT = os.environ.get("LEROBOT_TEST_DYNAMIXEL_PORT", "/dev/tty.usbmodem575E0032081")
 DYNAMIXEL_MOTORS = {
     "shoulder_pan": [1, "xl430-w250"],