mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-26 05:59:52 +00:00
Adding audio tests
This commit is contained in:
@@ -174,6 +174,11 @@ available_cameras = [
|
|||||||
"intelrealsense",
|
"intelrealsense",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# lists all available microphones from `lerobot/microphones`
|
||||||
|
available_microphones = [
|
||||||
|
"microphone",
|
||||||
|
]
|
||||||
|
|
||||||
# lists all available motors from `lerobot/motors`
|
# lists all available motors from `lerobot/motors`
|
||||||
available_motors = [
|
available_motors = [
|
||||||
"dynamixel",
|
"dynamixel",
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ OBS_ENV_STATE = OBS_STR + ".environment_state"
|
|||||||
OBS_STATE = OBS_STR + ".state"
|
OBS_STATE = OBS_STR + ".state"
|
||||||
OBS_IMAGE = OBS_STR + ".image"
|
OBS_IMAGE = OBS_STR + ".image"
|
||||||
OBS_IMAGES = OBS_IMAGE + "s"
|
OBS_IMAGES = OBS_IMAGE + "s"
|
||||||
|
OBS_AUDIO = OBS_STR + ".audio"
|
||||||
OBS_LANGUAGE = OBS_STR + ".language"
|
OBS_LANGUAGE = OBS_STR + ".language"
|
||||||
OBS_LANGUAGE_TOKENS = OBS_LANGUAGE + ".tokens"
|
OBS_LANGUAGE_TOKENS = OBS_LANGUAGE + ".tokens"
|
||||||
OBS_LANGUAGE_ATTENTION_MASK = OBS_LANGUAGE + ".attention_mask"
|
OBS_LANGUAGE_ATTENTION_MASK = OBS_LANGUAGE + ".attention_mask"
|
||||||
|
|||||||
@@ -57,6 +57,8 @@ def _check_component_availability(component_type, available_components, make_com
|
|||||||
print("\nNo physical device detected.")
|
print("\nNo physical device detected.")
|
||||||
elif isinstance(e, ValueError) and "camera_index" in str(e):
|
elif isinstance(e, ValueError) and "camera_index" in str(e):
|
||||||
print("\nNo physical camera detected.")
|
print("\nNo physical camera detected.")
|
||||||
|
elif isinstance(e, ValueError) and "microphone_index" in str(e):
|
||||||
|
print("\nNo physical microphone detected.")
|
||||||
else:
|
else:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
|||||||
@@ -26,16 +26,21 @@ from lerobot.datasets.compute_stats import (
|
|||||||
compute_episode_stats,
|
compute_episode_stats,
|
||||||
estimate_num_samples,
|
estimate_num_samples,
|
||||||
get_feature_stats,
|
get_feature_stats,
|
||||||
|
sample_audio,
|
||||||
sample_images,
|
sample_images,
|
||||||
sample_indices,
|
sample_indices,
|
||||||
)
|
)
|
||||||
from lerobot.utils.constants import OBS_IMAGE, OBS_STATE
|
from lerobot.utils.constants import OBS_AUDIO, OBS_IMAGE, OBS_STATE
|
||||||
|
|
||||||
|
|
||||||
def mock_load_image_as_numpy(path, dtype, channel_first):
|
def mock_load_image_as_numpy(path, dtype, channel_first):
|
||||||
return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype)
|
return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def mock_load_audio(path):
|
||||||
|
return np.ones((16000, 2), dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sample_array():
|
def sample_array():
|
||||||
return np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
|
return np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
|
||||||
@@ -73,6 +78,16 @@ def test_sample_images(mock_load):
|
|||||||
assert len(images) == estimate_num_samples(100)
|
assert len(images) == estimate_num_samples(100)
|
||||||
|
|
||||||
|
|
||||||
|
@patch("lerobot.datasets.compute_stats.load_audio", side_effect=mock_load_audio)
|
||||||
|
def test_sample_audio(mock_load):
|
||||||
|
audio_path = "audio.wav"
|
||||||
|
audio_samples = sample_audio(audio_path)
|
||||||
|
assert isinstance(audio_samples, np.ndarray)
|
||||||
|
assert audio_samples.shape[1] == 2
|
||||||
|
assert audio_samples.dtype == np.float32
|
||||||
|
assert len(audio_samples) == estimate_num_samples(16000)
|
||||||
|
|
||||||
|
|
||||||
def test_get_feature_stats_images():
|
def test_get_feature_stats_images():
|
||||||
data = np.random.rand(100, 3, 32, 32)
|
data = np.random.rand(100, 3, 32, 32)
|
||||||
stats = get_feature_stats(data, axis=(0, 2, 3), keepdims=True)
|
stats = get_feature_stats(data, axis=(0, 2, 3), keepdims=True)
|
||||||
@@ -81,6 +96,14 @@ def test_get_feature_stats_images():
|
|||||||
assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
|
assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_feature_stats_audio():
|
||||||
|
data = np.random.uniform(-1, 1, (16000, 2))
|
||||||
|
stats = get_feature_stats(data, axis=0, keepdims=True)
|
||||||
|
assert "min" in stats and "max" in stats and "mean" in stats and "std" in stats and "count" in stats
|
||||||
|
np.testing.assert_equal(stats["count"], np.array([16000]))
|
||||||
|
assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
|
||||||
|
|
||||||
|
|
||||||
def test_get_feature_stats_axis_0_keepdims(sample_array):
|
def test_get_feature_stats_axis_0_keepdims(sample_array):
|
||||||
expected = {
|
expected = {
|
||||||
"min": np.array([[1, 2, 3]]),
|
"min": np.array([[1, 2, 3]]),
|
||||||
@@ -145,20 +168,27 @@ def test_get_feature_stats_single_value():
|
|||||||
def test_compute_episode_stats():
|
def test_compute_episode_stats():
|
||||||
episode_data = {
|
episode_data = {
|
||||||
OBS_IMAGE: [f"image_{i}.jpg" for i in range(100)],
|
OBS_IMAGE: [f"image_{i}.jpg" for i in range(100)],
|
||||||
|
OBS_AUDIO: "audio.wav",
|
||||||
OBS_STATE: np.random.rand(100, 10),
|
OBS_STATE: np.random.rand(100, 10),
|
||||||
}
|
}
|
||||||
features = {
|
features = {
|
||||||
OBS_IMAGE: {"dtype": "image"},
|
OBS_IMAGE: {"dtype": "image"},
|
||||||
|
OBS_AUDIO: {"dtype": "audio"},
|
||||||
OBS_STATE: {"dtype": "numeric"},
|
OBS_STATE: {"dtype": "numeric"},
|
||||||
}
|
}
|
||||||
|
|
||||||
with patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy):
|
with (
|
||||||
|
patch("lerobot.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy),
|
||||||
|
patch("lerobot.datasets.compute_stats.load_audio", side_effect=mock_load_audio),
|
||||||
|
):
|
||||||
stats = compute_episode_stats(episode_data, features)
|
stats = compute_episode_stats(episode_data, features)
|
||||||
|
|
||||||
assert OBS_IMAGE in stats and OBS_STATE in stats
|
assert OBS_IMAGE in stats and OBS_AUDIO in stats and OBS_STATE in stats
|
||||||
assert stats[OBS_IMAGE]["count"].item() == 100
|
assert stats[OBS_IMAGE]["count"].item() == estimate_num_samples(100)
|
||||||
assert stats[OBS_STATE]["count"].item() == 100
|
assert stats[OBS_AUDIO]["count"].item() == estimate_num_samples(16000)
|
||||||
|
assert stats[OBS_STATE]["count"].item() == estimate_num_samples(100)
|
||||||
assert stats[OBS_IMAGE]["mean"].shape == (3, 1, 1)
|
assert stats[OBS_IMAGE]["mean"].shape == (3, 1, 1)
|
||||||
|
assert stats[OBS_AUDIO]["mean"].shape == (1, 2)
|
||||||
|
|
||||||
|
|
||||||
def test_assert_type_and_shape_valid():
|
def test_assert_type_and_shape_valid():
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ from lerobot.datasets.lerobot_dataset import (
|
|||||||
_encode_video_worker,
|
_encode_video_worker,
|
||||||
)
|
)
|
||||||
from lerobot.datasets.utils import (
|
from lerobot.datasets.utils import (
|
||||||
|
DEFAULT_AUDIO_CHUNK_DURATION,
|
||||||
DEFAULT_CHUNK_SIZE,
|
DEFAULT_CHUNK_SIZE,
|
||||||
DEFAULT_DATA_FILE_SIZE_IN_MB,
|
DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||||
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||||
@@ -49,7 +50,13 @@ from lerobot.envs.factory import make_env_config
|
|||||||
from lerobot.policies.factory import make_policy_config
|
from lerobot.policies.factory import make_policy_config
|
||||||
from lerobot.robots import make_robot_from_config
|
from lerobot.robots import make_robot_from_config
|
||||||
from lerobot.utils.constants import ACTION, DONE, OBS_IMAGES, OBS_STATE, OBS_STR, REWARD
|
from lerobot.utils.constants import ACTION, DONE, OBS_IMAGES, OBS_STATE, OBS_STR, REWARD
|
||||||
from tests.fixtures.constants import DUMMY_CHW, DUMMY_HWC, DUMMY_REPO_ID
|
from tests.fixtures.constants import (
|
||||||
|
DEFAULT_SAMPLE_RATE,
|
||||||
|
DUMMY_AUDIO_CHANNELS,
|
||||||
|
DUMMY_CHW,
|
||||||
|
DUMMY_HWC,
|
||||||
|
DUMMY_REPO_ID,
|
||||||
|
)
|
||||||
from tests.mocks.mock_robot import MockRobotConfig
|
from tests.mocks.mock_robot import MockRobotConfig
|
||||||
from tests.utils import require_x86_64_kernel
|
from tests.utils import require_x86_64_kernel
|
||||||
|
|
||||||
@@ -70,6 +77,20 @@ def image_dataset(tmp_path, empty_lerobot_dataset_factory):
|
|||||||
return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
|
return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def audio_dataset(tmp_path, empty_lerobot_dataset_factory):
|
||||||
|
features = {
|
||||||
|
"audio": {
|
||||||
|
"dtype": "audio",
|
||||||
|
"shape": (DUMMY_AUDIO_CHANNELS,),
|
||||||
|
"names": [
|
||||||
|
"channels",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
|
||||||
|
|
||||||
|
|
||||||
def test_same_attributes_defined(tmp_path, lerobot_dataset_factory):
|
def test_same_attributes_defined(tmp_path, lerobot_dataset_factory):
|
||||||
"""
|
"""
|
||||||
Instantiate a LeRobotDataset both ways with '__init__()' and 'create()' and verify that instantiated
|
Instantiate a LeRobotDataset both ways with '__init__()' and 'create()' and verify that instantiated
|
||||||
@@ -411,6 +432,23 @@ def test_tmp_mixed_deletion(tmp_path, empty_lerobot_dataset_factory):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_frame_audio(audio_dataset):
|
||||||
|
dataset = audio_dataset
|
||||||
|
dataset.add_frame(
|
||||||
|
{
|
||||||
|
"audio": np.random.rand(
|
||||||
|
int(DEFAULT_AUDIO_CHUNK_DURATION * DEFAULT_SAMPLE_RATE), DUMMY_AUDIO_CHANNELS
|
||||||
|
)
|
||||||
|
},
|
||||||
|
task="Dummy task",
|
||||||
|
)
|
||||||
|
dataset.save_episode()
|
||||||
|
|
||||||
|
assert dataset[0]["audio"].shape == torch.Size(
|
||||||
|
(int(DEFAULT_AUDIO_CHUNK_DURATION * DEFAULT_SAMPLE_RATE), DUMMY_AUDIO_CHANNELS)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# TODO(aliberts):
|
# TODO(aliberts):
|
||||||
# - [ ] test various attributes & state from init and create
|
# - [ ] test various attributes & state from init and create
|
||||||
# - [ ] test init with episodes and check num_frames
|
# - [ ] test init with episodes and check num_frames
|
||||||
@@ -450,6 +488,7 @@ def test_factory(env_name, repo_id, policy_name):
|
|||||||
dataset = make_dataset(cfg)
|
dataset = make_dataset(cfg)
|
||||||
delta_timestamps = dataset.delta_timestamps
|
delta_timestamps = dataset.delta_timestamps
|
||||||
camera_keys = dataset.meta.camera_keys
|
camera_keys = dataset.meta.camera_keys
|
||||||
|
audio_keys = dataset.meta.audio_keys
|
||||||
|
|
||||||
item = dataset[0]
|
item = dataset[0]
|
||||||
|
|
||||||
@@ -492,6 +531,11 @@ def test_factory(env_name, repo_id, policy_name):
|
|||||||
# test c,h,w
|
# test c,h,w
|
||||||
assert item[key].shape[0] == 3, f"{key}"
|
assert item[key].shape[0] == 3, f"{key}"
|
||||||
|
|
||||||
|
for key in audio_keys:
|
||||||
|
assert item[key].dtype == torch.float32, f"{key}"
|
||||||
|
assert item[key].max() <= 1.0, f"{key}"
|
||||||
|
assert item[key].min() >= -1.0, f"{key}"
|
||||||
|
|
||||||
if delta_timestamps is not None:
|
if delta_timestamps is not None:
|
||||||
# test missing keys in delta_timestamps
|
# test missing keys in delta_timestamps
|
||||||
for key in delta_timestamps:
|
for key in delta_timestamps:
|
||||||
|
|||||||
Vendored
+13
@@ -40,5 +40,18 @@ DUMMY_VIDEO_INFO = {
|
|||||||
"video.is_depth_map": False,
|
"video.is_depth_map": False,
|
||||||
"has_audio": False,
|
"has_audio": False,
|
||||||
}
|
}
|
||||||
|
DUMMY_MICROPHONE_FEATURES = {
|
||||||
|
"laptop": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
|
||||||
|
"phone": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
|
||||||
|
}
|
||||||
|
DEFAULT_SAMPLE_RATE = 48000
|
||||||
|
DUMMY_AUDIO_CHANNELS = 2
|
||||||
|
DUMMY_AUDIO_INFO = {
|
||||||
|
"has_audio": True,
|
||||||
|
"audio.sample_rate": DEFAULT_SAMPLE_RATE,
|
||||||
|
"audio.codec": "aac",
|
||||||
|
"audio.channels": DUMMY_AUDIO_CHANNELS,
|
||||||
|
"audio.channel_layout": "stereo",
|
||||||
|
}
|
||||||
DUMMY_CHW = (3, 96, 128)
|
DUMMY_CHW = (3, 96, 128)
|
||||||
DUMMY_HWC = (96, 128, 3)
|
DUMMY_HWC = (96, 128, 3)
|
||||||
|
|||||||
Vendored
+13
-1
@@ -43,6 +43,7 @@ from lerobot.datasets.video_utils import encode_video_frames
|
|||||||
from tests.fixtures.constants import (
|
from tests.fixtures.constants import (
|
||||||
DEFAULT_FPS,
|
DEFAULT_FPS,
|
||||||
DUMMY_CAMERA_FEATURES,
|
DUMMY_CAMERA_FEATURES,
|
||||||
|
DUMMY_MICROPHONE_FEATURES,
|
||||||
DUMMY_MOTOR_FEATURES,
|
DUMMY_MOTOR_FEATURES,
|
||||||
DUMMY_REPO_ID,
|
DUMMY_REPO_ID,
|
||||||
DUMMY_ROBOT_TYPE,
|
DUMMY_ROBOT_TYPE,
|
||||||
@@ -131,6 +132,7 @@ def features_factory():
|
|||||||
def _create_features(
|
def _create_features(
|
||||||
motor_features: dict = DUMMY_MOTOR_FEATURES,
|
motor_features: dict = DUMMY_MOTOR_FEATURES,
|
||||||
camera_features: dict = DUMMY_CAMERA_FEATURES,
|
camera_features: dict = DUMMY_CAMERA_FEATURES,
|
||||||
|
audio_features: dict = DUMMY_MICROPHONE_FEATURES,
|
||||||
use_videos: bool = True,
|
use_videos: bool = True,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
if use_videos:
|
if use_videos:
|
||||||
@@ -142,6 +144,7 @@ def features_factory():
|
|||||||
return {
|
return {
|
||||||
**motor_features,
|
**motor_features,
|
||||||
**camera_ft,
|
**camera_ft,
|
||||||
|
**audio_features,
|
||||||
**DEFAULT_FEATURES,
|
**DEFAULT_FEATURES,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -166,9 +169,10 @@ def info_factory(features_factory):
|
|||||||
audio_path: str = DEFAULT_AUDIO_PATH,
|
audio_path: str = DEFAULT_AUDIO_PATH,
|
||||||
motor_features: dict = DUMMY_MOTOR_FEATURES,
|
motor_features: dict = DUMMY_MOTOR_FEATURES,
|
||||||
camera_features: dict = DUMMY_CAMERA_FEATURES,
|
camera_features: dict = DUMMY_CAMERA_FEATURES,
|
||||||
|
audio_features: dict = DUMMY_MICROPHONE_FEATURES,
|
||||||
use_videos: bool = True,
|
use_videos: bool = True,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
features = features_factory(motor_features, camera_features, use_videos)
|
features = features_factory(motor_features, camera_features, audio_features, use_videos)
|
||||||
return {
|
return {
|
||||||
"codebase_version": codebase_version,
|
"codebase_version": codebase_version,
|
||||||
"robot_type": robot_type,
|
"robot_type": robot_type,
|
||||||
@@ -207,6 +211,14 @@ def stats_factory():
|
|||||||
"std": np.full((3, 1, 1), 0.25, dtype=np.float32).tolist(),
|
"std": np.full((3, 1, 1), 0.25, dtype=np.float32).tolist(),
|
||||||
"count": [10],
|
"count": [10],
|
||||||
}
|
}
|
||||||
|
elif dtype == "audio":
|
||||||
|
stats[key] = {
|
||||||
|
"mean": np.full((shape[0],), 0.0, dtype=np.float32).tolist(),
|
||||||
|
"max": np.full((shape[0],), 1, dtype=np.float32).tolist(),
|
||||||
|
"min": np.full((shape[0],), -1, dtype=np.float32).tolist(),
|
||||||
|
"std": np.full((shape[0],), 0.5, dtype=np.float32).tolist(),
|
||||||
|
"count": [10],
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
stats[key] = {
|
stats[key] = {
|
||||||
"max": np.full(shape, 1, dtype=dtype).tolist(),
|
"max": np.full(shape, 1, dtype=dtype).tolist(),
|
||||||
|
|||||||
+8
-1
@@ -20,7 +20,7 @@ from functools import wraps
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from lerobot import available_cameras, available_motors, available_robots
|
from lerobot import available_cameras, available_microphones, available_motors, available_robots
|
||||||
from lerobot.utils.import_utils import is_package_available
|
from lerobot.utils.import_utils import is_package_available
|
||||||
|
|
||||||
DEVICE = os.environ.get("LEROBOT_TEST_DEVICE", "cuda") if torch.cuda.is_available() else "cpu"
|
DEVICE = os.environ.get("LEROBOT_TEST_DEVICE", "cuda") if torch.cuda.is_available() else "cpu"
|
||||||
@@ -33,6 +33,10 @@ TEST_CAMERA_TYPES = []
|
|||||||
for camera_type in available_cameras:
|
for camera_type in available_cameras:
|
||||||
TEST_CAMERA_TYPES += [(camera_type, True), (camera_type, False)]
|
TEST_CAMERA_TYPES += [(camera_type, True), (camera_type, False)]
|
||||||
|
|
||||||
|
TEST_MICROPHONE_TYPES = []
|
||||||
|
for microphone_type in available_microphones:
|
||||||
|
TEST_MICROPHONE_TYPES += [(microphone_type, True), (microphone_type, False)]
|
||||||
|
|
||||||
TEST_MOTOR_TYPES = []
|
TEST_MOTOR_TYPES = []
|
||||||
for motor_type in available_motors:
|
for motor_type in available_motors:
|
||||||
TEST_MOTOR_TYPES += [(motor_type, True), (motor_type, False)]
|
TEST_MOTOR_TYPES += [(motor_type, True), (motor_type, False)]
|
||||||
@@ -41,6 +45,9 @@ for motor_type in available_motors:
|
|||||||
OPENCV_CAMERA_INDEX = int(os.environ.get("LEROBOT_TEST_OPENCV_CAMERA_INDEX", 0))
|
OPENCV_CAMERA_INDEX = int(os.environ.get("LEROBOT_TEST_OPENCV_CAMERA_INDEX", 0))
|
||||||
INTELREALSENSE_SERIAL_NUMBER = int(os.environ.get("LEROBOT_TEST_INTELREALSENSE_SERIAL_NUMBER", 128422271614))
|
INTELREALSENSE_SERIAL_NUMBER = int(os.environ.get("LEROBOT_TEST_INTELREALSENSE_SERIAL_NUMBER", 128422271614))
|
||||||
|
|
||||||
|
# Microphone indices used for connecting physical microphones
|
||||||
|
MICROPHONE_INDEX = int(os.environ.get("LEROBOT_TEST_MICROPHONE_INDEX", 0))
|
||||||
|
|
||||||
DYNAMIXEL_PORT = os.environ.get("LEROBOT_TEST_DYNAMIXEL_PORT", "/dev/tty.usbmodem575E0032081")
|
DYNAMIXEL_PORT = os.environ.get("LEROBOT_TEST_DYNAMIXEL_PORT", "/dev/tty.usbmodem575E0032081")
|
||||||
DYNAMIXEL_MOTORS = {
|
DYNAMIXEL_MOTORS = {
|
||||||
"shoulder_pan": [1, "xl430-w250"],
|
"shoulder_pan": [1, "xl430-w250"],
|
||||||
|
|||||||
Reference in New Issue
Block a user