mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-15 00:29:52 +00:00
[skip ci] feat(torchcodec): adding support for torchcodec audio decoding
This commit is contained in:
@@ -20,6 +20,7 @@ from pathlib import Path
|
||||
import av
|
||||
import torch
|
||||
import torchaudio
|
||||
import torchcodec
|
||||
from numpy import ceil
|
||||
|
||||
CHANNELS_LAYOUTS_MAPPING = {
|
||||
@@ -56,13 +57,42 @@ def decode_audio(
|
||||
Currently supports torchaudio.
|
||||
"""
|
||||
if backend == "torchcodec":
|
||||
raise NotImplementedError("torchcodec is not yet supported for audio decoding")
|
||||
# return decode_audio_torchcodec(audio_path, timestamps, duration) #TODO(CarolinePascal): uncomment this line at next torchcodec release
|
||||
raise ValueError("torchcodec backend is not available yet.")
|
||||
elif backend == "torchaudio":
|
||||
return decode_audio_torchaudio(audio_path, timestamps, duration)
|
||||
else:
|
||||
raise ValueError(f"Unsupported video backend: {backend}")
|
||||
|
||||
|
||||
def decode_audio_torchcodec(
    audio_path: Path | str,
    timestamps: list[float],
    duration: float,
    log_loaded_timestamps: bool = False,
) -> torch.Tensor:
    """Decode one audio window per timestamp with torchcodec and stack them.

    Args:
        audio_path: Audio source handed to ``torchcodec.decoders.AudioDecoder``.
        timestamps: Start times, in seconds, of the windows to decode.
        duration: Length, in seconds, of each window; a window is requested
            from ``ts`` to ``ts + duration``.
        log_loaded_timestamps: When True, log the start timestamp and duration
            reported by the decoder for every window.

    Returns:
        The decoded windows stacked along a new leading dimension, in the same
        order as ``timestamps``.
    """
    # TODO(CarolinePascal) : add channels selection
    decoder = torchcodec.decoders.AudioDecoder(audio_path)

    windows = []
    for start in timestamps:
        stop = start + duration
        samples = decoder.get_samples_played_in_range(start_seconds=start, stop_seconds=stop)

        if log_loaded_timestamps:
            logging.info(
                f"audio chunk loaded at starting timestamp={samples.pts_seconds:.4f} with duration={samples.duration_seconds:.4f}"
            )

        windows.append(samples.data)

    # torch.stack needs every window to have the same shape.
    stacked = torch.stack(windows)

    # Sanity check: exactly one stacked entry per requested timestamp.
    assert len(timestamps) == len(stacked)
    return stacked
|
||||
|
||||
|
||||
def decode_audio_torchaudio(
|
||||
audio_path: Path | str,
|
||||
timestamps: list[float],
|
||||
|
||||
@@ -756,7 +756,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
download_audio (bool, optional): Flag to download the audio. Defaults to True.
|
||||
video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to torchcodec when available on the platform; otherwise, defaults to 'pyav'.
|
||||
You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
|
||||
audio_backend (str | None, optional): Audio backend to use for decoding audio. Defaults to 'ffmpeg' decoder used by 'torchaudio'.
|
||||
audio_backend (str | None, optional): Audio backend to use for decoding audio. Defaults to 'torchaudio'.
|
||||
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
|
||||
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
|
||||
vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
|
||||
@@ -775,7 +775,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
self.revision = revision if revision else CODEBASE_VERSION
|
||||
self.video_backend = video_backend if video_backend else get_safe_default_codec()
|
||||
self.audio_backend = (
|
||||
audio_backend if audio_backend else "ffmpeg"
|
||||
audio_backend if audio_backend else "torchaudio"
|
||||
) # Waiting for torchcodec release #TODO(CarolinePascal)
|
||||
self.delta_indices = None
|
||||
self.batch_encoding_size = batch_encoding_size
|
||||
@@ -1945,7 +1945,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
obj._recorded_frames = 0
|
||||
obj._writer_closed_for_reading = False
|
||||
obj.audio_backend = (
|
||||
audio_backend if audio_backend is not None else "ffmpeg"
|
||||
audio_backend if audio_backend is not None else "torchaudio"
|
||||
) # Waiting for torchcodec release #TODO(CarolinePascal)
|
||||
return obj
|
||||
|
||||
|
||||
Reference in New Issue
Block a user