From be09a59e050216b4ce69ac390b850058119d2e67 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Mon, 7 Apr 2025 16:36:04 +0200 Subject: [PATCH] Adding audio tests --- src/lerobot/__init__.py | 5 +++ src/lerobot/utils/constants.py | 1 + tests/conftest.py | 2 ++ tests/datasets/test_compute_stats.py | 40 +++++++++++++++++++++--- tests/datasets/test_datasets.py | 46 +++++++++++++++++++++++++++- tests/fixtures/constants.py | 13 ++++++++ tests/fixtures/dataset_factories.py | 14 ++++++++- tests/utils.py | 9 +++++- 8 files changed, 122 insertions(+), 8 deletions(-) diff --git a/src/lerobot/__init__.py b/src/lerobot/__init__.py index eec574296..90073d213 100644 --- a/src/lerobot/__init__.py +++ b/src/lerobot/__init__.py @@ -174,6 +174,11 @@ available_cameras = [ "intelrealsense", ] +# lists all available microphones from `lerobot/microphones` +available_microphones = [ + "microphone", +] + # lists all available motors from `lerobot/motors` available_motors = [ "dynamixel", diff --git a/src/lerobot/utils/constants.py b/src/lerobot/utils/constants.py index 43a61b4f7..9f33732d5 100644 --- a/src/lerobot/utils/constants.py +++ b/src/lerobot/utils/constants.py @@ -23,6 +23,7 @@ OBS_ENV_STATE = OBS_STR + ".environment_state" OBS_STATE = OBS_STR + ".state" OBS_IMAGE = OBS_STR + ".image" OBS_IMAGES = OBS_IMAGE + "s" +OBS_AUDIO = OBS_STR + ".audio" OBS_LANGUAGE = OBS_STR + ".language" OBS_LANGUAGE_TOKENS = OBS_LANGUAGE + ".tokens" OBS_LANGUAGE_ATTENTION_MASK = OBS_LANGUAGE + ".attention_mask" diff --git a/tests/conftest.py b/tests/conftest.py index 2fcf878ab..924f06eca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,6 +57,8 @@ def _check_component_availability(component_type, available_components, make_com print("\nNo physical device detected.") elif isinstance(e, ValueError) and "camera_index" in str(e): print("\nNo physical camera detected.") + elif isinstance(e, ValueError) and "microphone_index" in str(e): + print("\nNo physical microphone detected.") else: traceback.print_exc() diff --git a/tests/datasets/test_compute_stats.py b/tests/datasets/test_compute_stats.py index 973c80bd8..62531685b 100644 --- a/tests/datasets/test_compute_stats.py +++ b/tests/datasets/test_compute_stats.py @@ -26,16 +26,21 @@ from lerobot.datasets.compute_stats import ( compute_episode_stats, estimate_num_samples, get_feature_stats, + sample_audio, sample_images, sample_indices, ) -from lerobot.utils.constants import OBS_IMAGE, OBS_STATE +from lerobot.utils.constants import OBS_AUDIO, OBS_IMAGE, OBS_STATE def mock_load_image_as_numpy(path, dtype, channel_first): return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype) +def mock_load_audio(path): + return np.ones((16000, 2), dtype=np.float32) + + @pytest.fixture def sample_array(): return np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) @@ -73,6 +78,16 @@ def test_sample_images(mock_load): assert len(images) == estimate_num_samples(100) +@patch("lerobot.datasets.compute_stats.load_audio", side_effect=mock_load_audio) +def test_sample_audio(mock_load): + audio_path = "audio.wav" + audio_samples = sample_audio(audio_path) + assert isinstance(audio_samples, np.ndarray) + assert audio_samples.shape[1] == 2 + assert audio_samples.dtype == np.float32 + assert len(audio_samples) == estimate_num_samples(16000) + + def test_get_feature_stats_images(): data = np.random.rand(100, 3, 32, 32) stats = get_feature_stats(data, axis=(0, 2, 3), keepdims=True) @@ -81,6 +96,14 @@ def test_get_feature_stats_images(): assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape +def test_get_feature_stats_audio(): + data = np.random.uniform(-1, 1, (16000, 2)) + stats = get_feature_stats(data, axis=0, keepdims=True) + assert "min" in stats and "max" in stats and "mean" in stats and "std" in stats and "count" in stats + np.testing.assert_equal(stats["count"], np.array([16000])) + assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape + + def test_get_feature_stats_axis_0_keepdims(sample_array): expected = { "min": np.array([[1, 2, 3]]), @@ -145,20 +168,27 @@ def test_get_feature_stats_single_value(): def test_compute_episode_stats(): episode_data = { OBS_IMAGE: [f"image_{i}.jpg" for i in range(100)], + OBS_AUDIO: "audio.wav", OBS_STATE: np.random.rand(100, 10), } features = { OBS_IMAGE: {"dtype": "image"}, + OBS_AUDIO: {"dtype": "audio"}, OBS_STATE: {"dtype": "numeric"}, } - with patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy): + with ( + patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy), + patch("lerobot.datasets.compute_stats.load_audio", side_effect=mock_load_audio), + ): stats = compute_episode_stats(episode_data, features) - assert OBS_IMAGE in stats and OBS_STATE in stats - assert stats[OBS_IMAGE]["count"].item() == 100 - assert stats[OBS_STATE]["count"].item() == 100 + assert OBS_IMAGE in stats and OBS_AUDIO in stats and OBS_STATE in stats + assert stats[OBS_IMAGE]["count"].item() == estimate_num_samples(100) + assert stats[OBS_AUDIO]["count"].item() == estimate_num_samples(16000) + assert stats[OBS_STATE]["count"].item() == estimate_num_samples(100) assert stats[OBS_IMAGE]["mean"].shape == (3, 1, 1) + assert stats[OBS_AUDIO]["mean"].shape == (1, 2) def test_assert_type_and_shape_valid(): diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 27c51b3c4..e5ee05cee 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -37,6 +37,7 @@ from lerobot.datasets.lerobot_dataset import ( _encode_video_worker, ) from lerobot.datasets.utils import ( + DEFAULT_AUDIO_CHUNK_DURATION, DEFAULT_CHUNK_SIZE, DEFAULT_DATA_FILE_SIZE_IN_MB, DEFAULT_VIDEO_FILE_SIZE_IN_MB, @@ -49,7 +50,13 @@ from lerobot.envs.factory import make_env_config from lerobot.policies.factory import make_policy_config from lerobot.robots import make_robot_from_config from lerobot.utils.constants import ACTION, DONE, OBS_IMAGES, OBS_STATE, OBS_STR, REWARD -from tests.fixtures.constants import DUMMY_CHW, DUMMY_HWC, DUMMY_REPO_ID +from tests.fixtures.constants import ( + DEFAULT_SAMPLE_RATE, + DUMMY_AUDIO_CHANNELS, + DUMMY_CHW, + DUMMY_HWC, + DUMMY_REPO_ID, +) from tests.mocks.mock_robot import MockRobotConfig from tests.utils import require_x86_64_kernel @@ -70,6 +77,20 @@ def image_dataset(tmp_path, empty_lerobot_dataset_factory): return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features) +@pytest.fixture +def audio_dataset(tmp_path, empty_lerobot_dataset_factory): + features = { + "audio": { + "dtype": "audio", + "shape": (DUMMY_AUDIO_CHANNELS,), + "names": [ + "channels", + ], + } + } + return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features) + + def test_same_attributes_defined(tmp_path, lerobot_dataset_factory): """ Instantiate a LeRobotDataset both ways with '__init__()' and 'create()' and verify that instantiated @@ -411,6 +432,23 @@ def test_tmp_mixed_deletion(tmp_path, empty_lerobot_dataset_factory): ) +def test_add_frame_audio(audio_dataset): + dataset = audio_dataset + dataset.add_frame( + { + "audio": np.random.rand( + int(DEFAULT_AUDIO_CHUNK_DURATION * DEFAULT_SAMPLE_RATE), DUMMY_AUDIO_CHANNELS + ) + }, + task="Dummy task", + ) + dataset.save_episode() + + assert dataset[0]["audio"].shape == torch.Size( + (int(DEFAULT_AUDIO_CHUNK_DURATION * DEFAULT_SAMPLE_RATE), DUMMY_AUDIO_CHANNELS) + ) + + # TODO(aliberts): # - [ ] test various attributes & state from init and create # - [ ] test init with episodes and check num_frames @@ -450,6 +488,7 @@ def test_factory(env_name, repo_id, policy_name): dataset = make_dataset(cfg) delta_timestamps = dataset.delta_timestamps camera_keys = dataset.meta.camera_keys + audio_keys = dataset.meta.audio_keys item = dataset[0] @@ -492,6 +531,11 @@ def test_factory(env_name, repo_id, policy_name): # test c,h,w assert item[key].shape[0] == 3, f"{key}" + for key in audio_keys: + assert item[key].dtype == torch.float32, f"{key}" + assert item[key].max() <= 1.0, f"{key}" + assert item[key].min() >= -1.0, f"{key}" + if delta_timestamps is not None: # test missing keys in delta_timestamps for key in delta_timestamps: diff --git a/tests/fixtures/constants.py b/tests/fixtures/constants.py index 35d8776ce..017f5e54a 100644 --- a/tests/fixtures/constants.py +++ b/tests/fixtures/constants.py @@ -40,5 +40,18 @@ DUMMY_VIDEO_INFO = { "video.is_depth_map": False, "has_audio": False, } +DUMMY_MICROPHONE_FEATURES = { + "laptop": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None}, + "phone": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None}, +} +DEFAULT_SAMPLE_RATE = 48000 +DUMMY_AUDIO_CHANNELS = 2 +DUMMY_AUDIO_INFO = { + "has_audio": True, + "audio.sample_rate": DEFAULT_SAMPLE_RATE, + "audio.codec": "aac", + "audio.channels": DUMMY_AUDIO_CHANNELS, + "audio.channel_layout": "stereo", +} DUMMY_CHW = (3, 96, 128) DUMMY_HWC = (96, 128, 3) diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py index e98e626e2..84877b8a6 100644 --- a/tests/fixtures/dataset_factories.py +++ b/tests/fixtures/dataset_factories.py @@ -43,6 +43,7 @@ from lerobot.datasets.video_utils import encode_video_frames from tests.fixtures.constants import ( DEFAULT_FPS, DUMMY_CAMERA_FEATURES, + DUMMY_MICROPHONE_FEATURES, DUMMY_MOTOR_FEATURES, DUMMY_REPO_ID, DUMMY_ROBOT_TYPE, @@ -131,6 +132,7 @@ def features_factory(): def _create_features( motor_features: dict = DUMMY_MOTOR_FEATURES, camera_features: dict = DUMMY_CAMERA_FEATURES, + audio_features: dict = DUMMY_MICROPHONE_FEATURES, use_videos: bool = True, ) -> dict: if use_videos: @@ -142,6 +144,7 @@ def features_factory(): return { **motor_features, **camera_ft, + **audio_features, **DEFAULT_FEATURES, } @@ -166,9 +169,10 @@ def info_factory(features_factory): audio_path: str = DEFAULT_AUDIO_PATH, motor_features: dict = DUMMY_MOTOR_FEATURES, camera_features: dict = DUMMY_CAMERA_FEATURES, + audio_features: dict = DUMMY_MICROPHONE_FEATURES, use_videos: bool = True, ) -> dict: - features = features_factory(motor_features, camera_features, use_videos) + features = features_factory(motor_features, camera_features, audio_features, use_videos) return { "codebase_version": codebase_version, "robot_type": robot_type, @@ -207,6 +211,14 @@ def stats_factory(): "std": np.full((3, 1, 1), 0.25, dtype=np.float32).tolist(), "count": [10], } + elif dtype == "audio": + stats[key] = { + "mean": np.full((shape[0],), 0.0, dtype=np.float32).tolist(), + "max": np.full((shape[0],), 1, dtype=np.float32).tolist(), + "min": np.full((shape[0],), -1, dtype=np.float32).tolist(), + "std": np.full((shape[0],), 0.5, dtype=np.float32).tolist(), + "count": [10], + } else: stats[key] = { "max": np.full(shape, 1, dtype=dtype).tolist(), diff --git a/tests/utils.py b/tests/utils.py index 38841db02..cbbf34694 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -20,7 +20,7 @@ from functools import wraps import pytest import torch -from lerobot import available_cameras, available_motors, available_robots +from lerobot import available_cameras, available_microphones, available_motors, available_robots from lerobot.utils.import_utils import is_package_available DEVICE = os.environ.get("LEROBOT_TEST_DEVICE", "cuda") if torch.cuda.is_available() else "cpu" @@ -33,6 +33,10 @@ TEST_CAMERA_TYPES = [] for camera_type in available_cameras: TEST_CAMERA_TYPES += [(camera_type, True), (camera_type, False)] +TEST_MICROPHONE_TYPES = [] +for microphone_type in available_microphones: + TEST_MICROPHONE_TYPES += [(microphone_type, True), (microphone_type, False)] + TEST_MOTOR_TYPES = [] for motor_type in available_motors: TEST_MOTOR_TYPES += [(motor_type, True), (motor_type, False)] @@ -41,6 +45,9 @@ for motor_type in available_motors: OPENCV_CAMERA_INDEX = int(os.environ.get("LEROBOT_TEST_OPENCV_CAMERA_INDEX", 0)) INTELREALSENSE_SERIAL_NUMBER = int(os.environ.get("LEROBOT_TEST_INTELREALSENSE_SERIAL_NUMBER", 128422271614)) +# Microphone indices used for connecting physical microphones +MICROPHONE_INDEX = int(os.environ.get("LEROBOT_TEST_MICROPHONE_INDEX", 0)) + DYNAMIXEL_PORT = os.environ.get("LEROBOT_TEST_DYNAMIXEL_PORT", "/dev/tty.usbmodem575E0032081") DYNAMIXEL_MOTORS = { "shoulder_pan": [1, "xl430-w250"],