Adding audio modality in LeRobotDatasets

This commit is contained in:
CarolinePascal
2025-03-28 17:16:51 +01:00
parent d998660aa1
commit 2864caad80
7 changed files with 582 additions and 101 deletions
+3 -3
View File
@@ -41,7 +41,7 @@ from lerobot.datasets.utils import (
write_stats,
write_tasks,
)
from lerobot.datasets.video_utils import concatenate_video_files, get_video_duration_in_s
from lerobot.datasets.video_utils import concatenate_media_files, get_media_duration_in_s
def validate_all_metadata(all_metadata: list[LeRobotDatasetMetadata]):
@@ -328,7 +328,7 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
file_index=file_idx,
)
src_duration = get_video_duration_in_s(src_path)
src_duration = get_media_duration_in_s(src_path, media_type="video")
dst_key = (chunk_idx, file_idx)
if not dst_path.exists():
@@ -367,7 +367,7 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
current_dst_duration = dst_file_durations.get(dst_key, 0)
videos_idx[key]["src_to_offset"][(src_chunk_idx, src_file_idx)] = current_dst_duration
videos_idx[key]["src_to_dst"][(src_chunk_idx, src_file_idx)] = dst_key
concatenate_video_files(
concatenate_media_files(
[dst_path, src_path],
dst_path,
)
+193
View File
@@ -0,0 +1,193 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
import av
import torch
import torchaudio
from numpy import ceil
CHANNELS_LAYOUTS_MAPPING = {
1: "mono",
2: "stereo",
3: "2.1",
4: "3.1",
5: "4.1",
6: "5.1",
7: "6.1",
8: "7.1",
16: "hexadecagonal",
24: "22.2",
}
def decode_audio(
audio_path: Path | str,
timestamps: list[float],
duration: float,
backend: str | None = "torchaudio",
) -> torch.Tensor:
"""
Decodes audio using the specified backend.
Args:
audio_path (Path): Path to the audio file.
timestamps (list[float]): List of timestamps to extract frames.
tolerance_s (float): Allowed deviation in seconds for frame retrieval.
backend (str, optional): Backend to use for decoding. Defaults to "torchaudio".
Returns:
torch.Tensor: Decoded frames.
Currently supports torchaudio.
"""
if backend == "torchcodec":
raise NotImplementedError("torchcodec is not yet supported for audio decoding")
elif backend == "torchaudio":
return decode_audio_torchaudio(audio_path, timestamps, duration)
else:
raise ValueError(f"Unsupported video backend: {backend}")
def decode_audio_torchaudio(
audio_path: Path | str,
timestamps: list[float],
duration: float,
log_loaded_timestamps: bool = False,
) -> torch.Tensor:
# TODO(CarolinePascal) : add channels selection
audio_path = str(audio_path)
reader = torchaudio.io.StreamReader(src=audio_path)
audio_sampling_rate = reader.get_src_stream_info(reader.default_audio_stream).sample_rate
# TODO(CarolinePascal) : sort timestamps ?
reader.add_basic_audio_stream(
frames_per_chunk=int(ceil(duration * audio_sampling_rate)), # Too much is better than not enough
buffer_chunk_size=-1, # No dropping frames
)
audio_chunks = []
for ts in timestamps:
reader.seek(ts) # Default to closest audio sample
status = reader.fill_buffer()
if status != 0:
logging.warning("Audio stream reached end of recording before decoding desired timestamps.")
current_audio_chunk = reader.pop_chunks()[0]
if log_loaded_timestamps:
logging.info(
f"audio chunk loaded at starting timestamp={current_audio_chunk['pts']:.4f} with duration={len(current_audio_chunk) / audio_sampling_rate:.4f}"
)
audio_chunks.append(current_audio_chunk)
audio_chunks = torch.stack(audio_chunks)
# TODO(CarolinePascal) : pytorch format conversion ?
assert len(timestamps) == len(audio_chunks)
return audio_chunks
def encode_audio(
input_path: Path | str,
output_path: Path | str,
codec: str = "aac", # TODO(CarolinePascal) : investigate Fraunhofer FDK AAC (libfdk_aac) codec and and constant (file size control) /variable (quality control) bitrate options
bit_rate: int | None = None,
sample_rate: int | None = None,
log_level: int | None = av.logging.ERROR,
overwrite: bool = False,
) -> None:
"""Encodes an audio file using ffmpeg."""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=overwrite)
# Set logging level
if log_level is not None:
# "While less efficient, it is generally preferable to modify logging with Pythons logging"
logging.getLogger("libav").setLevel(log_level)
# Open input file
with av.open(str(input_path), "r") as input:
input_stream = input.streams.audio[0] # Assuming the first stream is the audio stream to be encoded
# Define sub-sampling options
if sample_rate is None:
sample_rate = input_stream.rate
# Create and open output file (overwrite by default)
with av.open(str(output_path), "w") as output:
output_stream = output.add_stream(
codec, rate=sample_rate, layout=CHANNELS_LAYOUTS_MAPPING[input_stream.channels]
)
if bit_rate is not None:
output_stream.bit_rate = bit_rate
# Loop through input WAV packets and encode them
for input_frame in input.decode(
input_stream
): # This step handles both demuxing and decoding under the hood
packet = output_stream.encode(input_frame)
if packet:
output.mux(packet)
# Flush the encoder
packet = output_stream.encode()
if packet:
output.mux(packet)
# Reset logging level
if log_level is not None:
av.logging.restore_default_callback()
if not output_path.exists():
raise OSError(f"Audio encoding did not work. File not found: {output_path}.")
def get_audio_info(video_path: Path | str) -> dict:
# Set logging level
logging.getLogger("libav").setLevel(av.logging.ERROR)
# Getting audio stream information
audio_info = {}
with av.open(str(video_path), "r") as audio_file:
try:
audio_stream = audio_file.streams.audio[0]
except IndexError:
# Reset logging level
av.logging.restore_default_callback()
return {"has_audio": False}
audio_info["audio.channels"] = audio_stream.channels
audio_info["audio.codec"] = audio_stream.codec.canonical_name
# In an ideal loseless case : bit depth x sample rate x channels = bit rate.
# In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
audio_info["audio.bit_rate"] = audio_stream.bit_rate
audio_info["audio.sample_rate"] = audio_stream.sample_rate # Number of samples per second
# In an ideal loseless case : fixed number of bits per sample.
# In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
audio_info["audio.bit_depth"] = audio_stream.format.bits
audio_info["audio.channel_layout"] = audio_stream.layout.name
audio_info["has_audio"] = True
# Reset logging level
av.logging.restore_default_callback()
return audio_info
+308 -23
View File
@@ -33,12 +33,15 @@ import torch.utils
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.errors import RevisionNotFoundError
from lerobot.datasets.audio_utils import decode_audio, encode_audio, get_audio_info
from lerobot.datasets.compute_stats import aggregate_stats, compute_episode_stats
from lerobot.datasets.image_writer import AsyncImageWriter, write_image
from lerobot.datasets.utils import (
DEFAULT_AUDIO_CHUNK_DURATION,
DEFAULT_EPISODES_PATH,
DEFAULT_FEATURES,
DEFAULT_IMAGE_PATH,
DEFAULT_RAW_AUDIO_PATH,
INFO_PATH,
_validate_feature_names,
check_delta_timestamps,
@@ -68,11 +71,12 @@ from lerobot.datasets.utils import (
)
from lerobot.datasets.video_utils import (
VideoFrame,
concatenate_video_files,
concatenate_media_files,
decode_video_frames,
encode_video_frames,
get_audio_duration_in_s,
get_media_duration_in_s,
get_safe_default_codec,
get_video_duration_in_s,
get_video_info,
)
from lerobot.utils.constants import HF_LEROBOT_HOME
@@ -214,6 +218,19 @@ class LeRobotDatasetMetadata:
fpath = self.video_path.format(video_key=vid_key, chunk_index=chunk_idx, file_index=file_idx)
return Path(fpath)
def get_audio_file_path(self, ep_index: int, audio_key: str) -> Path:
if self.episodes is None:
self.episodes = load_episodes(self.root)
if ep_index >= len(self.episodes):
raise IndexError(
f"Episode index {ep_index} out of range. Episodes: {len(self.episodes) if self.episodes else 0}"
)
ep = self.episodes[ep_index]
chunk_idx = ep[f"audio/{audio_key}/chunk_index"]
file_idx = ep[f"audio/{audio_key}/file_index"]
fpath = self.audio_path.format(audio_key=audio_key, chunk_index=chunk_idx, file_index=file_idx)
return Path(fpath)
@property
def data_path(self) -> str:
"""Formattable string for the parquet files."""
@@ -224,6 +241,11 @@ class LeRobotDatasetMetadata:
"""Formattable string for the video files."""
return self.info["video_path"]
@property
def audio_path(self) -> str | None:
"""Formattable string for the audio files."""
return self.info["audio_path"]
@property
def robot_type(self) -> str | None:
"""Robot type used in recording this dataset."""
@@ -254,6 +276,11 @@ class LeRobotDatasetMetadata:
"""Keys to access visual modalities (regardless of their storage method)."""
return [key for key, ft in self.features.items() if ft["dtype"] in ["video", "image"]]
@property
def audio_keys(self) -> list[str]:
"""Keys to access audio modalities."""
return [key for key, ft in self.features.items() if ft["dtype"] == "audio"]
@property
def names(self) -> dict[str, list | dict]:
"""Names of the various dimensions of vector modalities."""
@@ -294,6 +321,11 @@ class LeRobotDatasetMetadata:
"""Max size of video file in mega bytes."""
return self.info["video_files_size_in_mb"]
@property
def audio_files_size_in_mb(self) -> int:
"""Max size of audio file in mega bytes."""
return self.info["audio_files_size_in_mb"]
def get_task_index(self, task: str) -> int | None:
"""
Given a task in natural language, returns its task_index if the task already exists in the dataset,
@@ -435,11 +467,26 @@ class LeRobotDatasetMetadata:
video_path = self.root / self.video_path.format(video_key=key, chunk_index=0, file_index=0)
self.info["features"][key]["info"] = get_video_info(video_path)
def update_audio_info(self, audio_key: str | None = None) -> None:
"""
Warning: this function writes info from first episode audio, implicitly assuming that all audio have
been encoded the same way. Also, this means it assumes the first episode exists.
"""
if audio_key is not None and audio_key not in self.audio_keys:
raise ValueError(f"Audio key {audio_key} not found in dataset")
audio_keys = [audio_key] if audio_key is not None else self.audio_keys
for key in audio_keys:
if not self.features[key].get("info", None):
audio_path = self.root / self.audio_path.format(audio_key=key, chunk_index=0, file_index=0)
self.info["features"][key]["info"] = get_audio_info(audio_path)
def update_chunk_settings(
self,
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
audio_files_size_in_mb: int | None = None,
) -> None:
"""Update chunk and file size settings after dataset creation.
@@ -451,6 +498,7 @@ class LeRobotDatasetMetadata:
chunks_size: Maximum number of files per chunk directory. If None, keeps current value.
data_files_size_in_mb: Maximum size for data parquet files in MB. If None, keeps current value.
video_files_size_in_mb: Maximum size for video files in MB. If None, keeps current value.
audio_files_size_in_mb: Maximum size for audio files in MB. If None, keeps current value.
"""
if chunks_size is not None:
if chunks_size <= 0:
@@ -467,6 +515,11 @@ class LeRobotDatasetMetadata:
raise ValueError(f"video_files_size_in_mb must be positive, got {video_files_size_in_mb}")
self.info["video_files_size_in_mb"] = video_files_size_in_mb
if audio_files_size_in_mb is not None:
if audio_files_size_in_mb <= 0:
raise ValueError(f"audio_files_size_in_mb must be positive, got {audio_files_size_in_mb}")
self.info["audio_files_size_in_mb"] = audio_files_size_in_mb
# Update the info file on disk
write_info(self.info, self.root)
@@ -474,12 +527,13 @@ class LeRobotDatasetMetadata:
"""Get current chunk and file size settings.
Returns:
Dict containing chunks_size, data_files_size_in_mb, and video_files_size_in_mb.
Dict containing chunks_size, data_files_size_in_mb, video_files_size_in_mb, and audio_files_size_in_mb.
"""
return {
"chunks_size": self.chunks_size,
"data_files_size_in_mb": self.data_files_size_in_mb,
"video_files_size_in_mb": self.video_files_size_in_mb,
"audio_files_size_in_mb": self.audio_files_size_in_mb,
}
def __repr__(self):
@@ -506,6 +560,7 @@ class LeRobotDatasetMetadata:
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
audio_files_size_in_mb: int | None = None,
) -> "LeRobotDatasetMetadata":
"""Creates metadata for a LeRobotDataset."""
obj = cls.__new__(cls)
@@ -529,6 +584,7 @@ class LeRobotDatasetMetadata:
chunks_size,
data_files_size_in_mb,
video_files_size_in_mb,
audio_files_size_in_mb,
)
if len(obj.video_keys) > 0 and not use_videos:
raise ValueError()
@@ -565,6 +621,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
force_cache_sync: bool = False,
download_videos: bool = True,
video_backend: str | None = None,
audio_backend: str | None = None,
batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
):
@@ -598,6 +655,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
task-conditioned training.
- hf_dataset (from datasets.Dataset), which will read any values from parquet files.
- videos (optional) from which frames are loaded to be synchronous with data from parquet files.
- audio (optional) from which audio is loaded to be synchronous with data from parquet files.
A typical LeRobotDataset looks like this from its root path:
.
@@ -623,19 +681,37 @@ class LeRobotDataset(torch.utils.data.Dataset):
│ ├── info.json
│ ├── stats.json
│ └── tasks.parquet
── videos
├── observation.images.laptop
── videos
├── observation.images.laptop
│ │ ├── chunk-000
│ │ │ ├── file-000.mp4
│ │ │ ├── file-001.mp4
│ │ │ └── ...
│ │ ├── chunk-001
│ │ │ └── ...
│ │ └── ...
│ ├── observation.images.phone
│ │ ├── chunk-000
│ │ │ ├── file-000.mp4
│ │ │ ├── file-001.mp4
│ │ │ └── ...
│ │ ├── chunk-001
│ │ │ └── ...
│ │ └── ...
│ └── ...
└── audio
├── observation.audio.laptop
│ ├── chunk-000
│ │ ├── file-000.mp4
│ │ ├── file-001.mp4
│ │ ├── file-000.m4a
│ │ ├── file-001.m4a
│ │ └── ...
│ ├── chunk-001
│ │ └── ...
│ └── ...
├── observation.images.phone
├── observation.audio.phone
│ ├── chunk-000
│ │ ├── file-000.mp4
│ │ ├── file-001.mp4
│ │ ├── file-000.m4a
│ │ ├── file-001.m4a
│ │ └── ...
│ ├── chunk-001
│ │ └── ...
@@ -677,6 +753,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
True.
video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to torchcodec when available int the platform; otherwise, defaults to 'pyav'.
You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
audio_backend (str | None, optional): Audio backend to use for decoding audio. Defaults to 'ffmpeg'.
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
@@ -694,6 +771,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
self.tolerance_s = tolerance_s
self.revision = revision if revision else CODEBASE_VERSION
self.video_backend = video_backend if video_backend else get_safe_default_codec()
self.audio_backend = (
audio_backend if audio_backend else "ffmpeg"
) # Waiting for torchcodec release #TODO(CarolinePascal)
self.delta_indices = None
self.batch_encoding_size = batch_encoding_size
self.episodes_since_last_encoding = 0
@@ -864,7 +944,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
return hf_dataset
def _check_cached_episodes_sufficient(self) -> bool:
"""Check if the cached dataset contains all requested episodes and their video files."""
"""Check if the cached dataset contains all requested episodes and their video and audio files."""
if self.hf_dataset is None or len(self.hf_dataset) == 0:
return False
@@ -892,6 +972,14 @@ class LeRobotDataset(torch.utils.data.Dataset):
if not video_path.exists():
return False
# Check if all required audio files exist
if len(self.meta.audio_keys) > 0:
for ep_idx in requested_episodes:
for audio_key in self.meta.audio_keys:
audio_path = self.root / self.meta.get_audio_file_path(ep_idx, audio_key)
if not audio_path.exists():
return False
return True
def create_hf_dataset(self) -> datasets.Dataset:
@@ -970,7 +1058,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
query_indices: dict[str, list[int]] | None = None,
) -> dict[str, list[float]]:
query_timestamps = {}
for key in self.meta.video_keys:
for key in self.meta.video_keys + self.meta.audio_keys:
if query_indices is not None and key in query_indices:
if self._absolute_to_relative_idx is not None:
relative_indices = [self._absolute_to_relative_idx[idx] for idx in query_indices[key]]
@@ -985,7 +1073,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
def _query_hf_dataset(self, query_indices: dict[str, list[int]]) -> dict:
"""
Query dataset for indices across keys, skipping video keys.
Query dataset for indices across keys, skipping video keys and audio keys.
Tries column-first [key][indices] for speed, falls back to row-first.
@@ -997,7 +1085,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
"""
result: dict = {}
for key, q_idx in query_indices.items():
if key in self.meta.video_keys:
if key in self.meta.video_keys or key in self.meta.audio_keys:
continue
# Map absolute indices to relative indices if needed
relative_indices = (
@@ -1032,6 +1120,25 @@ class LeRobotDataset(torch.utils.data.Dataset):
return item
# TODO(CarolinePascal): add variable query durations
def _query_audio(
self, query_timestamps: dict[str, list[float]], query_duration: float, ep_idx: int
) -> dict[str, torch.Tensor]:
ep = self.meta.episodes[ep_idx]
item = {}
for audio_key, query_ts in query_timestamps.items():
# Episodes are stored sequentially on a single mp4 to reduce the number of files.
# Thus we load the start timestamp of the episode on this mp4 and,
# shift the query timestamp accordingly.
from_timestamp = ep[f"audio/{audio_key}/from_timestamp"]
shifted_query_ts = [from_timestamp + ts for ts in query_ts]
audio_path = self.root / self.meta.get_audio_file_path(ep_idx, audio_key)
audio_chunk = decode_audio(audio_path, shifted_query_ts, query_duration, self.audio_backend)
item[audio_key] = audio_chunk.squeeze(0)
return item
def _ensure_hf_dataset_loaded(self):
"""Lazy load the HF dataset only when needed for reading."""
if self._lazy_loading or self.hf_dataset is None:
@@ -1061,11 +1168,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
for key, val in query_result.items():
item[key] = val
if len(self.meta.video_keys) > 0:
if len(self.meta.video_keys) > 0 or len(self.meta.audio_keys) > 0:
current_ts = item["timestamp"].item()
query_timestamps = self._get_query_timestamps(current_ts, query_indices)
video_frames = self._query_videos(query_timestamps, ep_idx)
item = {**video_frames, **item}
audio_chunks = self._query_audio(query_timestamps, DEFAULT_AUDIO_CHUNK_DURATION, ep_idx)
item = {**item, **video_frames, **audio_chunks}
if self.image_transforms is not None:
image_keys = self.meta.camera_keys
@@ -1113,6 +1221,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
)
return self.root / fpath
def _get_raw_audio_file_path(self, episode_index: int, audio_key: str) -> Path:
fpath = DEFAULT_RAW_AUDIO_PATH.format(audio_key=audio_key, episode_index=episode_index)
return self.root / fpath
def _get_image_file_dir(self, episode_index: int, image_key: str) -> Path:
return self._get_image_file_path(episode_index, image_key, frame_index=0).parent
@@ -1211,7 +1323,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
for key, ft in self.features.items():
# index, episode_index, task_index are already processed above, and image and video
# are processed separately by storing image path and frame info as meta data
if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]:
if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video", "audio"]:
continue
episode_buffer[key] = np.stack(episode_buffer[key])
@@ -1221,9 +1333,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
ep_metadata = self._save_episode_data(episode_buffer)
has_video_keys = len(self.meta.video_keys) > 0
has_audio_keys = len(self.meta.audio_keys) > 0
use_batched_encoding = self.batch_encoding_size > 1
if has_video_keys and not use_batched_encoding:
if (has_video_keys or has_audio_keys) and not use_batched_encoding:
num_cameras = len(self.meta.video_keys)
if parallel_encoding and num_cameras > 1:
# TODO(Steven): Ideally we would like to control the number of threads per encoding such that:
@@ -1260,21 +1373,30 @@ class LeRobotDataset(torch.utils.data.Dataset):
for video_key in self.meta.video_keys:
ep_metadata.update(self._save_episode_video(video_key, episode_index))
#TODO(Caroline): add parallel encoding for audio as well
for audio_key in self.meta.audio_keys:
ep_metadata.update(self._save_episode_audio(audio_key, episode_index))
# `meta.save_episode` need to be executed after encoding the videos
self.meta.save_episode(episode_index, episode_length, episode_tasks, ep_stats, ep_metadata)
if has_video_keys and use_batched_encoding:
if (has_video_keys or has_audio_keys) and use_batched_encoding:
# Check if we should trigger batch encoding
self.episodes_since_last_encoding += 1
if self.episodes_since_last_encoding == self.batch_encoding_size:
start_ep = self.num_episodes - self.batch_encoding_size
end_ep = self.num_episodes
if has_video_keys:
self._batch_save_episode_video(start_ep, end_ep)
if has_audio_keys:
self._batch_save_episode_audio(start_ep, end_ep)
self.episodes_since_last_encoding = 0
if not episode_data:
# Reset episode buffer and clean up temporary images (if not already deleted during video encoding)
self.clear_episode_buffer(delete_images=len(self.meta.image_keys) > 0)
self.clear_episode_buffer(
delete_images=len(self.meta.image_keys) > 0, delete_audio=len(self.meta.audio_keys) > 0
)
def _batch_save_episode_video(self, start_episode: int, end_episode: int | None = None) -> None:
"""
@@ -1325,7 +1447,70 @@ class LeRobotDataset(torch.utils.data.Dataset):
dtype_backend="pyarrow"
) # allows NaN values along with integers
# Save the current episode's audio metadata to the dataframe
audio_ep_metadata = {}
for audio_key in self.meta.audio_keys:
audio_ep_metadata.update(self._save_episode_audio(audio_key, ep_idx))
audio_ep_metadata.pop("episode_index")
audio_ep_df = pd.DataFrame(audio_ep_metadata, index=[ep_idx]).convert_dtypes(
dtype_backend="pyarrow"
) # allows NaN values along with integers
episode_df = episode_df.combine_first(video_ep_df)
episode_df = episode_df.combine_first(audio_ep_df)
episode_df.to_parquet(episode_df_path)
self.meta.episodes = load_episodes(self.root)
def _batch_save_episode_audio(self, start_episode: int, end_episode: int | None = None) -> None:
"""
Batch save audio for multiple episodes.
Args:
start_episode: Starting episode index (inclusive)
end_episode: Ending episode index (exclusive). If None, encodes all episodes from start_episode to the current episode.
"""
if end_episode is None:
end_episode = self.num_episodes
logging.info(
f"Batch encoding {self.batch_encoding_size} audio for episodes {start_episode} to {end_episode - 1}"
)
chunk_idx = self.meta.episodes[start_episode]["data/chunk_index"]
file_idx = self.meta.episodes[start_episode]["data/file_index"]
episode_df_path = self.root / DEFAULT_EPISODES_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
episode_df = pd.read_parquet(episode_df_path)
for ep_idx in range(start_episode, end_episode):
logging.info(f"Encoding audio for episode {ep_idx}")
if (
self.meta.episodes[ep_idx]["data/chunk_index"] != chunk_idx
or self.meta.episodes[ep_idx]["data/file_index"] != file_idx
):
# The current episode is in a new chunk or file.
# Save previous episode dataframe and update the Hugging Face dataset by reloading it.
episode_df.to_parquet(episode_df_path)
self.meta.episodes = load_episodes(self.root)
# Load new episode dataframe
chunk_idx = self.meta.episodes[ep_idx]["data/chunk_index"]
file_idx = self.meta.episodes[ep_idx]["data/file_index"]
episode_df_path = self.root / DEFAULT_EPISODES_PATH.format(
chunk_index=chunk_idx, file_index=file_idx
)
episode_df = pd.read_parquet(episode_df_path)
# Save the current episode's video metadata to the dataframe
audio_ep_metadata = {}
for audio_key in self.meta.audio_keys:
audio_ep_metadata.update(self._save_episode_audio(audio_key, ep_idx))
audio_ep_metadata.pop("episode_index")
audio_ep_df = pd.DataFrame(audio_ep_metadata, index=[ep_idx]).convert_dtypes(
dtype_backend="pyarrow"
) # allows NaN values along with integers
episode_df = episode_df.combine_first(audio_ep_df)
episode_df.to_parquet(episode_df_path)
self.meta.episodes = load_episodes(self.root)
@@ -1436,7 +1621,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
ep_path = temp_path
ep_size_in_mb = get_file_size_in_mb(ep_path)
ep_duration_in_s = get_video_duration_in_s(ep_path)
ep_duration_in_s = get_media_duration_in_s(ep_path, media_type="video")
if (
episode_index == 0
@@ -1482,7 +1667,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
latest_duration_in_s = 0.0
else:
# Update latest video file
concatenate_video_files(
concatenate_media_files(
[latest_path, ep_path],
latest_path,
)
@@ -1504,7 +1689,79 @@ class LeRobotDataset(torch.utils.data.Dataset):
}
return metadata
def clear_episode_buffer(self, delete_images: bool = True) -> None:
def _save_episode_audio(self, audio_key: str, episode_index: int) -> dict:
# Encode episode audio into a temporary audio file
ep_path = self._encode_temporary_episode_audio(audio_key, episode_index)
ep_size_in_mb = get_file_size_in_mb(ep_path)
ep_duration_in_s = get_audio_duration_in_s(ep_path)
if (
episode_index == 0
or self.meta.latest_episode is None
or f"audio/{audio_key}/chunk_index" not in self.meta.latest_episode
):
# Initialize indices for a new dataset made of the first episode data
chunk_idx, file_idx = 0, 0
if self.meta.episodes is not None and len(self.meta.episodes) > 0:
# It means we are resuming recording, so we need to load the latest episode
# Update the indices to avoid overwriting the latest episode
old_chunk_idx = self.meta.episodes[-1][f"audio/{audio_key}/chunk_index"]
old_file_idx = self.meta.episodes[-1][f"audio/{audio_key}/file_index"]
chunk_idx, file_idx = update_chunk_file_indices(
old_chunk_idx, old_file_idx, self.meta.chunks_size
)
latest_duration_in_s = 0.0
new_path = self.root / self.meta.audio_path.format(
audio_key=audio_key, chunk_index=chunk_idx, file_index=file_idx
)
new_path.parent.mkdir(parents=True, exist_ok=True)
shutil.move(str(ep_path), str(new_path))
else:
# Retrieve information from the latest updated audio file using latest_episode
latest_ep = self.meta.latest_episode
chunk_idx = latest_ep[f"audio/{audio_key}/chunk_index"][0]
file_idx = latest_ep[f"audio/{audio_key}/file_index"][0]
latest_path = self.root / self.meta.audio_path.format(
audio_key=audio_key, chunk_index=chunk_idx, file_index=file_idx
)
latest_size_in_mb = get_file_size_in_mb(latest_path)
latest_duration_in_s = latest_ep[f"audio/{audio_key}/to_timestamp"][0]
if latest_size_in_mb + ep_size_in_mb >= self.meta.audio_files_size_in_mb:
# Move temporary episode audio to a new audio file in the dataset
chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self.meta.chunks_size)
new_path = self.root / self.meta.audio_path.format(
audio_key=audio_key, chunk_index=chunk_idx, file_index=file_idx
)
new_path.parent.mkdir(parents=True, exist_ok=True)
shutil.move(str(ep_path), str(new_path))
latest_duration_in_s = 0.0
else:
# Update latest audio file
concatenate_media_files(
[latest_path, ep_path],
latest_path,
)
# Remove temporary directory
shutil.rmtree(str(ep_path.parent))
# Update audio info (only needed when first episode is encoded since it reads from episode 0)
if episode_index == 0:
self.meta.update_audio_info(audio_key)
write_info(self.meta.info, self.meta.root) # ensure audio info always written properly
metadata = {
"episode_index": episode_index,
f"audio/{audio_key}/chunk_index": chunk_idx,
f"audio/{audio_key}/file_index": file_idx,
f"audio/{audio_key}/from_timestamp": latest_duration_in_s,
f"audio/{audio_key}/to_timestamp": latest_duration_in_s + ep_duration_in_s,
}
return metadata
def clear_episode_buffer(self, delete_images: bool = True, delete_audio: bool = True) -> None:
# Clean up image files for the current episode buffer
if delete_images:
# Wait for the async image writer to finish
@@ -1518,6 +1775,16 @@ class LeRobotDataset(torch.utils.data.Dataset):
if img_dir.is_dir():
shutil.rmtree(img_dir)
# Clean up audio files for the current episode buffer
if delete_audio:
episode_index = self.episode_buffer["episode_index"]
if isinstance(episode_index, np.ndarray):
episode_index = episode_index.item() if episode_index.size == 1 else episode_index[0]
for microphone_key in self.meta.microphone_keys:
audio_file = self.root / self.meta.get_audio_file_path(episode_index, microphone_key)
if audio_file.is_file():
audio_file.unlink()
# Reset the buffer
self.episode_buffer = self.create_episode_buffer()
@@ -1554,6 +1821,18 @@ class LeRobotDataset(torch.utils.data.Dataset):
"""
return _encode_video_worker(video_key, episode_index, self.root, self.fps, self.vcodec)
def _encode_temporary_episode_audio(self, audio_key: str, episode_index: int) -> Path:
"""
Use ffmpeg to convert raw audio files into m4a audio files.
Note: `encode_episode_audio` is a blocking call. Making it asynchronous shouldn't speedup encoding,
since audio encoding with ffmpeg is already using multithreading.
"""
temp_path = Path(tempfile.mkdtemp(dir=self.root)) / f"{audio_key}_{episode_index:03d}.m4a"
raw_audio_file = self._get_raw_audio_file_path(episode_index, audio_key)
encode_audio(raw_audio_file, temp_path, overwrite=True)
raw_audio_file.unlink()
return temp_path
@classmethod
def create(
cls,
@@ -1567,6 +1846,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
image_writer_processes: int = 0,
image_writer_threads: int = 0,
video_backend: str | None = None,
audio_backend: str | None = None,
batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
) -> "LeRobotDataset":
@@ -1611,6 +1891,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj._lazy_loading = False
obj._recorded_frames = 0
obj._writer_closed_for_reading = False
obj.audio_backend = (
audio_backend if audio_backend is not None else "ffmpeg"
) # Waiting for torchcodec release #TODO(CarolinePascal)
return obj
@@ -1631,6 +1914,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
tolerances_s: dict | None = None,
download_videos: bool = True,
video_backend: str | None = None,
audio_backend: str | None = None,
):
super().__init__()
self.repo_ids = repo_ids
@@ -1648,6 +1932,7 @@ class MultiLeRobotDataset(torch.utils.data.Dataset):
tolerance_s=self.tolerances_s[repo_id],
download_videos=download_videos,
video_backend=video_backend,
audio_backend=audio_backend,
)
for repo_id in repo_ids
]
+14 -1
View File
@@ -50,6 +50,7 @@ from lerobot.utils.utils import SuppressProgressBars, is_valid_numpy_dtype_strin
DEFAULT_CHUNK_SIZE = 1000 # Max number of files per chunk
DEFAULT_DATA_FILE_SIZE_IN_MB = 100 # Max size per file
DEFAULT_VIDEO_FILE_SIZE_IN_MB = 200 # Max size per file
DEFAULT_AUDIO_FILE_SIZE_IN_MB = 100 # Max size per file
INFO_PATH = "meta/info.json"
STATS_PATH = "meta/stats.json"
@@ -57,13 +58,18 @@ STATS_PATH = "meta/stats.json"
EPISODES_DIR = "meta/episodes"
DATA_DIR = "data"
VIDEO_DIR = "videos"
AUDIO_DIR = "audio"
CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
DEFAULT_TASKS_PATH = "meta/tasks.parquet"
DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
DEFAULT_AUDIO_PATH = AUDIO_DIR + "/{audio_key}/" + CHUNK_FILE_PATTERN + ".m4a"
DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.png"
DEFAULT_RAW_AUDIO_PATH = "raw_audio/{audio_key}/episode_{episode_index:06d}.wav"
DEFAULT_AUDIO_CHUNK_DURATION = 0.5 # seconds
LEGACY_EPISODES_PATH = "meta/episodes.jsonl"
LEGACY_EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
@@ -576,7 +582,7 @@ def get_hf_features_from_features(features: dict) -> datasets.Features:
"""
hf_features = {}
for key, ft in features.items():
if ft["dtype"] == "video":
if ft["dtype"] == "video" or ft["dtype"] == "audio":
continue
elif ft["dtype"] == "image":
hf_features[key] = datasets.Image()
@@ -802,6 +808,7 @@ def create_empty_dataset_info(
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
audio_files_size_in_mb: int | None = None,
) -> dict:
"""Create a template dictionary for a new dataset's `info.json`.
@@ -811,6 +818,10 @@ def create_empty_dataset_info(
features (dict): The LeRobot features dictionary for the dataset.
use_videos (bool): Whether the dataset will store videos.
robot_type (str | None): The type of robot used, if any.
chunks_size (int | None): The maximum number of files per chunk directory.
data_files_size_in_mb (int | None): The maximum size for data files in MB.
video_files_size_in_mb (int | None): The maximum size for video files in MB.
audio_files_size_in_mb (int | None): The maximum size for audio files in MB.
Returns:
dict: A dictionary with the initial dataset metadata.
@@ -824,10 +835,12 @@ def create_empty_dataset_info(
"chunks_size": chunks_size or DEFAULT_CHUNK_SIZE,
"data_files_size_in_mb": data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
"video_files_size_in_mb": video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
"audio_files_size_in_mb": audio_files_size_in_mb or DEFAULT_AUDIO_FILE_SIZE_IN_MB,
"fps": fps,
"splits": {},
"data_path": DEFAULT_DATA_PATH,
"video_path": DEFAULT_VIDEO_PATH if use_videos else None,
"audio_path": DEFAULT_AUDIO_PATH,
"features": features,
}
@@ -79,7 +79,7 @@ from lerobot.datasets.utils import (
write_stats,
write_tasks,
)
from lerobot.datasets.video_utils import concatenate_video_files, get_video_duration_in_s
from lerobot.datasets.video_utils import concatenate_media_files, get_media_duration_in_s
from lerobot.utils.constants import HF_LEROBOT_HOME
from lerobot.utils.utils import init_logging
@@ -311,12 +311,12 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"):
ep_size_in_mb = get_file_size_in_mb(ep_path)
ep_duration_in_s = get_video_duration_in_s(ep_path)
ep_duration_in_s = get_media_duration_in_s(ep_path, media_type="video")
# Check if adding this episode would exceed the limit
if size_in_mb + ep_size_in_mb >= video_file_size_in_mb and len(paths_to_cat) > 0:
# Size limit would be exceeded, save current accumulation WITHOUT this episode
concatenate_video_files(
concatenate_media_files(
paths_to_cat,
new_root
/ DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
@@ -352,7 +352,7 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
# Write remaining videos if any
if paths_to_cat:
concatenate_video_files(
concatenate_media_files(
paths_to_cat,
new_root
/ DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
+56 -69
View File
@@ -397,42 +397,42 @@ def encode_video_frames(
raise OSError(f"Video encoding did not work. File not found: {video_path}.")
def concatenate_video_files(
input_video_paths: list[Path | str], output_video_path: Path, overwrite: bool = True
def concatenate_media_files(
input_media_paths: list[Path | str], output_media_path: Path, overwrite: bool = True
):
"""
Concatenate multiple video files into a single video file using pyav.
Concatenate multiple media files (video & audio) into a single media file using pyav.
This function takes a list of video input file paths and concatenates them into a single
output video file. It uses ffmpeg's concat demuxer with stream copy mode for fast
This function takes a list of input media file paths and concatenates them into a single
output media file. It uses ffmpeg's concat demuxer with stream copy mode for fast
concatenation without re-encoding.
Args:
input_video_paths: Ordered list of input video file paths to concatenate.
output_video_path: Path to the output video file.
overwrite: Whether to overwrite the output video file if it already exists. Default is True.
input_media_paths: Ordered list of input media file paths to concatenate.
output_media_path: Path to the output media file.
overwrite: Whether to overwrite the output media file if it already exists. Default is True.
Note:
- Creates a temporary directory for intermediate files that is cleaned up after use.
- Uses ffmpeg's concat demuxer which requires all input videos to have the same
- Creates a temporary .ffconcat file and container audio/video file that are cleaned up after use.
- Uses ffmpeg's concat demuxer which requires all input media files to have the same
codec, resolution, and frame rate for proper concatenation.
"""
output_video_path = Path(output_video_path)
output_media_path = Path(output_media_path)
if output_video_path.exists() and not overwrite:
logging.warning(f"Video file already exists: {output_video_path}. Skipping concatenation.")
if output_media_path.exists() and not overwrite:
logging.warning(f"Media file already exists: {output_media_path}. Skipping concatenation.")
return
output_video_path.parent.mkdir(parents=True, exist_ok=True)
output_media_path.parent.mkdir(parents=True, exist_ok=True)
if len(input_video_paths) == 0:
raise FileNotFoundError("No input video paths provided.")
if len(input_media_paths) == 0:
raise FileNotFoundError("No input media paths provided.")
# Create a temporary .ffconcat file to list the input video paths
# Create a temporary .ffconcat file to list the input media paths
with tempfile.NamedTemporaryFile(mode="w", suffix=".ffconcat", delete=False) as tmp_concatenate_file:
tmp_concatenate_file.write("ffconcat version 1.0\n")
for input_path in input_video_paths:
for input_path in input_media_paths:
tmp_concatenate_file.write(f"file '{str(input_path.resolve())}'\n")
tmp_concatenate_file.flush()
tmp_concatenate_path = tmp_concatenate_file.name
@@ -442,11 +442,11 @@ def concatenate_video_files(
tmp_concatenate_path, mode="r", format="concat", options={"safe": "0"}
) # safe = 0 allows absolute paths as well as relative paths
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_named_file:
tmp_output_video_path = tmp_named_file.name
with tempfile.NamedTemporaryFile(suffix=output_media_path.suffix, delete=False) as tmp_named_file:
tmp_output_media_path = tmp_named_file.name
output_container = av.open(
tmp_output_video_path, mode="w", options={"movflags": "faststart"}
tmp_output_media_path, mode="w", options={"movflags": "faststart"}
) # faststart is to move the metadata to the beginning of the file to speed up loading
# Replicate input streams in output container
@@ -476,7 +476,7 @@ def concatenate_video_files(
input_container.close()
output_container.close()
shutil.move(tmp_output_video_path, output_video_path)
shutil.move(tmp_output_media_path, output_media_path)
Path(tmp_concatenate_path).unlink()
@@ -512,38 +512,6 @@ with warnings.catch_warnings():
register_feature(VideoFrame, "VideoFrame")
def get_audio_info(video_path: Path | str) -> dict:
# Set logging level
logging.getLogger("libav").setLevel(av.logging.ERROR)
# Getting audio stream information
audio_info = {}
with av.open(str(video_path), "r") as audio_file:
try:
audio_stream = audio_file.streams.audio[0]
except IndexError:
# Reset logging level
av.logging.restore_default_callback()
return {"has_audio": False}
audio_info["audio.channels"] = audio_stream.channels
audio_info["audio.codec"] = audio_stream.codec.canonical_name
# In an ideal loseless case : bit depth x sample rate x channels = bit rate.
# In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
audio_info["audio.bit_rate"] = audio_stream.bit_rate
audio_info["audio.sample_rate"] = audio_stream.sample_rate # Number of samples per second
# In an ideal loseless case : fixed number of bits per sample.
# In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
audio_info["audio.bit_depth"] = audio_stream.format.bits
audio_info["audio.channel_layout"] = audio_stream.layout.name
audio_info["has_audio"] = True
# Reset logging level
av.logging.restore_default_callback()
return audio_info
def get_video_info(video_path: Path | str) -> dict:
# Set logging level
logging.getLogger("libav").setLevel(av.logging.ERROR)
@@ -573,9 +541,6 @@ def get_video_info(video_path: Path | str) -> dict:
# Reset logging level
av.logging.restore_default_callback()
# Adding audio stream information
video_info.update(**get_audio_info(video_path))
return video_info
@@ -590,22 +555,22 @@ def get_video_pixel_channels(pix_fmt: str) -> int:
raise ValueError("Unknown format")
def get_video_duration_in_s(video_path: Path | str) -> float:
def get_media_duration_in_s(media_path: Path | str, media_type: str = "video") -> float:
"""
Get the duration of a video file in seconds using PyAV.
Get the duration of a media file (video & audio) in seconds using PyAV.
Args:
video_path: Path to the video file.
media_path: Path to the media file.
Returns:
Duration of the video in seconds.
Duration of the media file in seconds.
"""
with av.open(str(video_path)) as container:
# Get the first video stream
video_stream = container.streams.video[0]
with av.open(str(media_path)) as container:
# Get the first stream
stream = container.streams.video[0] if media_type == "video" else container.streams.audio[0]
# Calculate duration: stream.duration * stream.time_base gives duration in seconds
if video_stream.duration is not None:
duration = float(video_stream.duration * video_stream.time_base)
if stream.duration is not None:
duration = float(stream.duration * stream.time_base)
else:
# Fallback to container duration if stream duration is not available
duration = float(container.duration / av.time_base)
@@ -614,12 +579,12 @@ def get_video_duration_in_s(video_path: Path | str) -> float:
class VideoEncodingManager:
"""
Context manager that ensures proper video encoding and data cleanup even if exceptions occur.
Context manager that ensures proper video and audio encoding and data cleanup even if exceptions occur.
This manager handles:
- Batch encoding for any remaining episodes when recording interrupted
- Cleaning up temporary image files from interrupted episodes
- Removing empty image directories
- Cleaning up temporary image and audio files from interrupted episodes
- Removing empty image and audio directories
Args:
dataset: The LeRobotDataset instance
@@ -646,6 +611,7 @@ class VideoEncodingManager:
f"from episode {start_ep} to {end_ep - 1}"
)
self.dataset._batch_save_episode_video(start_ep, end_ep)
self.dataset._batch_save_episode_audio(start_ep, end_ep)
# Finalize the dataset to properly close all writers
self.dataset.finalize()
@@ -662,6 +628,15 @@ class VideoEncodingManager:
f"Cleaning up interrupted episode images for episode {interrupted_episode_index}, camera {key}"
)
shutil.rmtree(img_dir)
for key in self.dataset.meta.audio_keys:
audio_file = self.dataset._get_raw_audio_file_path(
episode_index=interrupted_episode_index, audio_key=key
)
if audio_file.exists():
logging.debug(
f"Cleaning up interrupted episode audio for episode {interrupted_episode_index}, microphone {key}"
)
audio_file.unlink()
# Clean up any remaining images directory if it's empty
img_dir = self.dataset.root / "images"
@@ -675,4 +650,16 @@ class VideoEncodingManager:
else:
logging.debug(f"Images directory is not empty, containing {len(png_files)} PNG files")
# Clean up any remaining audio directory if it's empty
audio_dir = self.dataset.root / "raw_audio"
# Check for any remaining WAV files
wav_files = list(audio_dir.rglob("*.wav"))
if len(wav_files) == 0:
# Only remove the raw_audio directory if no WAV files remain
if audio_dir.exists():
shutil.rmtree(audio_dir)
logging.debug("Cleaned up empty audio directory")
else:
logging.debug(f"Audio directory is not empty, containing {len(wav_files)} WAV files")
return False # Don't suppress the original exception
+3
View File
@@ -28,6 +28,7 @@ from datasets import Dataset
from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset, LeRobotDatasetMetadata
from lerobot.datasets.utils import (
DEFAULT_AUDIO_PATH,
DEFAULT_CHUNK_SIZE,
DEFAULT_DATA_FILE_SIZE_IN_MB,
DEFAULT_DATA_PATH,
@@ -162,6 +163,7 @@ def info_factory(features_factory):
video_files_size_in_mb: float = DEFAULT_VIDEO_FILE_SIZE_IN_MB,
data_path: str = DEFAULT_DATA_PATH,
video_path: str = DEFAULT_VIDEO_PATH,
audio_path: str = DEFAULT_AUDIO_PATH,
motor_features: dict = DUMMY_MOTOR_FEATURES,
camera_features: dict = DUMMY_CAMERA_FEATURES,
use_videos: bool = True,
@@ -181,6 +183,7 @@ def info_factory(features_factory):
"splits": {},
"data_path": data_path,
"video_path": video_path if use_videos else None,
"audio_path": audio_path,
"features": features,
}