diff --git a/src/lerobot/datasets/audio_utils.py b/src/lerobot/datasets/audio_utils.py index da4d97f82..dc9687d33 100644 --- a/src/lerobot/datasets/audio_utils.py +++ b/src/lerobot/datasets/audio_utils.py @@ -46,12 +46,12 @@ def decode_audio( Decodes audio using the specified backend. Args: audio_path (Path): Path to the audio file. - timestamps (list[float]): List of timestamps to extract frames. - tolerance_s (float): Allowed deviation in seconds for frame retrieval. + timestamps (list[float]): List of (starting) timestamps to extract audio chunks. + duration (float): Duration of the audio chunks in seconds. backend (str, optional): Backend to use for decoding. Defaults to "torchaudio". Returns: - torch.Tensor: Decoded frames. + torch.Tensor: Decoded audio chunks. Currently supports torchaudio. """ diff --git a/src/lerobot/datasets/compute_stats.py b/src/lerobot/datasets/compute_stats.py index 30c4b9262..0cb3b88b8 100644 --- a/src/lerobot/datasets/compute_stats.py +++ b/src/lerobot/datasets/compute_stats.py @@ -246,6 +246,7 @@ def sample_images(image_paths: list[str]) -> np.ndarray: def sample_audio_from_path(audio_path: str) -> np.ndarray: + """Samples audio data from an audio recording stored in a WAV file.""" data = load_audio_from_path(audio_path) sampled_indices = sample_indices(len(data)) @@ -253,6 +254,7 @@ def sample_audio_from_path(audio_path: str) -> np.ndarray: def sample_audio_from_data(data: np.ndarray) -> np.ndarray: + """Samples audio data from an audio recording stored in a numpy array.""" sampled_indices = sample_indices(len(data)) return data[sampled_indices] @@ -527,7 +529,7 @@ def compute_episode_stats( elif features[key]["dtype"] == "audio": try: ep_ft_array = sample_audio_from_path(data[0]) - except TypeError: # Should only be triggered for LeKiwi robot + except TypeError: # Should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner ep_ft_array = sample_audio_from_data(data) 
axes_to_reduce = 0 keepdims = True diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 221e160e8..26de0f8c5 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -1297,9 +1297,11 @@ class LeRobotDataset(torch.utils.data.Dataset): self._save_image(frame[key], img_path, compress_level) self.episode_buffer[key].append(str(img_path)) elif self.features[key]["dtype"] == "audio": - if self.meta.robot_type == "lekiwi": + if ( + self.meta.robot_type == "lekiwi" + ): # Raw data storage should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner self.episode_buffer[key].append(frame[key]) - else: + else: # Otherwise, only the audio file path is stored in the episode buffer if frame_index == 0: audio_path = self._get_raw_audio_file_path( episode_index=self.episode_buffer["episode_index"], audio_key=key @@ -1312,7 +1314,7 @@ class LeRobotDataset(torch.utils.data.Dataset): def add_microphone_recording(self, microphone: Microphone, microphone_key: str) -> None: """ - This function will start recording audio from the microphone and save it to disk. + Starts recording audio data provided by the microphone and directly writes it to a .wav file.
""" audio_dir = self._get_raw_audio_file_path( @@ -1371,7 +1373,9 @@ class LeRobotDataset(torch.utils.data.Dataset): if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]: continue elif ft["dtype"] == "audio": - if self.meta.robot_type == "lekiwi": + if ( + self.meta.robot_type == "lekiwi" + ): # Raw data storage should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner episode_buffer[key] = np.concatenate(episode_buffer[key], axis=0) continue episode_buffer[key] = np.stack(episode_buffer[key]) diff --git a/src/lerobot/microphones/microphone.py b/src/lerobot/microphones/microphone.py index 16ab275b3..6b54610c6 100644 --- a/src/lerobot/microphones/microphone.py +++ b/src/lerobot/microphones/microphone.py @@ -64,6 +64,11 @@ def find_microphones(raise_when_empty=False) -> list[dict]: def record_audio_from_microphones( output_dir: Path, microphone_ids: list[int] | None = None, record_time_s: float = 2.0 ): + """ + Records audio from all the channels of the specified microphones for the specified duration. + If no microphone ids are provided, all available microphones will be used. 
+ """ + if microphone_ids is None or len(microphone_ids) == 0: microphones = find_microphones() microphone_ids = [m["index"] for m in microphones] @@ -138,11 +143,11 @@ class Microphone: # Input audio stream self.stream = None - # Thread-safe concurrent queue to store the recorded/read audio + # Thread/Process-safe concurrent queue to store the recorded/read audio self.record_queue = None self.read_queue = None - # Thread to handle data reading and file writing in a separate thread (safely) + # Thread/Process to handle data reading and file writing in a separate thread/process (safely) self.record_thread = None self.record_stop_event = None @@ -152,6 +157,9 @@ class Microphone: self.is_writing = False def connect(self) -> None: + """ + Connects the microphone and checks if the requested acquisition parameters are compatible with the microphone. + """ if self.is_connected: raise DeviceAlreadyConnectedError(f"Microphone {self.microphone_index} is already connected.") @@ -205,6 +213,9 @@ class Microphone: self.is_connected = True def _audio_callback(self, indata, frames, time, status) -> None: + """ + Low-level sounddevice callback. + """ if status: logging.warning(status) # Slicing makes copy unnecessary @@ -215,6 +226,9 @@ class Microphone: @staticmethod def _record_loop(queue, event: Event, sample_rate: int, channels: list[int], output_file: Path) -> None: + """ + Thread/Process-safe loop to write audio data into a file. + """ # Can only be run on a single process/thread for file writing safety with sf.SoundFile( output_file, @@ -234,9 +248,7 @@ class Microphone: def _read(self) -> np.ndarray: """ - Gets audio data from the queue and coverts it to a numpy array. 
- -> PROS : Inherently thread safe, no need to lock the queue, lightweight CPU usage - -> CONS : Reading duration does not scale well with the number of channels and reading duration + Thread/Process-safe callback to read available audio data. """ audio_readings = np.empty((0, len(self.channels))) @@ -251,6 +263,9 @@ class Microphone: return audio_readings def read(self) -> np.ndarray: + """ + Reads the last audio chunk recorded by the microphone, i.e. all samples recorded since the last read or since the beginning of the recording. + """ if not self.is_connected: raise DeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.") if not self.stream.active: @@ -269,6 +284,9 @@ class Microphone: return audio_readings def start_recording(self, output_file: str | None = None, multiprocessing: bool | None = False) -> None: + """ + Starts the recording of the microphone. If output_file is provided, the audio will be written to this file. + """ if not self.is_connected: raise DeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.") if self.is_recording: @@ -320,6 +338,9 @@ class Microphone: self.stream.start() def stop_recording(self) -> None: + """ + Stops the recording of the microphone. + """ if not self.is_connected: raise DeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.") if not self.is_recording: @@ -341,6 +362,9 @@ class Microphone: self.logs["stop_timestamp"] = capture_timestamp_utc() def disconnect(self) -> None: + """ + Disconnects the microphone and stops the recording. + """ if not self.is_connected: raise DeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")