revert back video_utils.py to using pyav while keeping concat_video_files function

This commit is contained in:
Michel Aractingi
2025-08-29 01:06:46 +02:00
parent bbd64b9ce5
commit 47aee1fdbe
2 changed files with 116 additions and 102 deletions
+116 -92
View File
@@ -13,18 +13,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import glob
import importlib import importlib
import json
import logging import logging
import shutil import shutil
import subprocess import subprocess
import tempfile import tempfile
import warnings import warnings
from collections import OrderedDict
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import Any, ClassVar from typing import Any, ClassVar
import av
import pyarrow as pa import pyarrow as pa
import torch import torch
import torchvision import torchvision
@@ -106,7 +106,7 @@ def decode_video_frames_torchvision(
keyframes_only = False keyframes_only = False
torchvision.set_video_backend(backend) torchvision.set_video_backend(backend)
if backend == "pyav": if backend == "pyav":
keyframes_only = True # pyav doesnt support accuracte seek keyframes_only = True # pyav doesn't support accurate seek
# set a video stream reader # set a video stream reader
# TODO(rcadene): also load audio stream at the same time # TODO(rcadene): also load audio stream at the same time
@@ -159,7 +159,6 @@ def decode_video_frames_torchvision(
) )
# get closest frames to the query timestamps # get closest frames to the query timestamps
# TODO(rcadene): remove torch.stack
closest_frames = torch.stack([loaded_frames[idx] for idx in argmin_]) closest_frames = torch.stack([loaded_frames[idx] for idx in argmin_])
closest_ts = loaded_ts[argmin_] closest_ts = loaded_ts[argmin_]
@@ -257,51 +256,83 @@ def encode_video_frames(
g: int | None = 2, g: int | None = 2,
crf: int | None = 30, crf: int | None = 30,
fast_decode: int = 0, fast_decode: int = 0,
log_level: str | None = "quiet", log_level: int | None = av.logging.ERROR,
overwrite: bool = False, overwrite: bool = False,
) -> None: ) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" """More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
# Check encoder availability
if vcodec not in ["h264", "hevc", "libsvtav1"]:
raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")
video_path = Path(video_path) video_path = Path(video_path)
imgs_dir = Path(imgs_dir) imgs_dir = Path(imgs_dir)
video_path.parent.mkdir(parents=True, exist_ok=True)
ffmpeg_args = OrderedDict( video_path.parent.mkdir(parents=True, exist_ok=overwrite)
[
("-f", "image2"), # Encoders/pixel formats incompatibility check
("-r", str(fps)), if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
("-i", str(imgs_dir / "frame-%06d.png")), logging.warning(
("-vcodec", vcodec), f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
("-pix_fmt", pix_fmt), )
] pix_fmt = "yuv420p"
# Get input frames
template = "frame_" + ("[0-9]" * 6) + ".png"
input_list = sorted(
glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0])
) )
# Define video output frame size (assuming all input frames are the same size)
if len(input_list) == 0:
raise FileNotFoundError(f"No images found in {imgs_dir}.")
dummy_image = Image.open(input_list[0])
width, height = dummy_image.size
# Define video codec options
video_options = {}
if g is not None: if g is not None:
ffmpeg_args["-g"] = str(g) video_options["g"] = str(g)
if crf is not None: if crf is not None:
ffmpeg_args["-crf"] = str(crf) video_options["crf"] = str(crf)
if fast_decode: if fast_decode:
key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune" key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode" value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
ffmpeg_args[key] = value video_options[key] = value
# Set logging level
if log_level is not None: if log_level is not None:
ffmpeg_args["-loglevel"] = str(log_level) # "While less efficient, it is generally preferable to modify logging with Python's logging"
logging.getLogger("libav").setLevel(log_level)
ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair] # Create and open output file (overwrite by default)
if overwrite: with av.open(str(video_path), "w") as output:
ffmpeg_args.append("-y") output_stream = output.add_stream(vcodec, fps, options=video_options)
output_stream.pix_fmt = pix_fmt
output_stream.width = width
output_stream.height = height
ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)] # Loop through input frames and encode them
# redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal for input_data in input_list:
subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL) input_image = Image.open(input_data).convert("RGB")
input_frame = av.VideoFrame.from_image(input_image)
packet = output_stream.encode(input_frame)
if packet:
output.mux(packet)
# Flush the encoder
packet = output_stream.encode()
if packet:
output.mux(packet)
# Reset logging level
if log_level is not None:
av.logging.restore_default_callback()
if not video_path.exists(): if not video_path.exists():
raise OSError( raise OSError(f"Video encoding did not work. File not found: {video_path}.")
f"Video encoding did not work. File not found: {video_path}. "
f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`"
)
def concat_video_files(paths_to_cat: list[Path], root: Path, video_key: str, chunk_idx: int, file_idx: int): def concat_video_files(paths_to_cat: list[Path], root: Path, video_key: str, chunk_idx: int, file_idx: int):
@@ -325,6 +356,9 @@ def concat_video_files(paths_to_cat: list[Path], root: Path, video_key: str, chu
codec, resolution, and frame rate for proper concatenation. codec, resolution, and frame rate for proper concatenation.
- Output path follows the DEFAULT_VIDEO_PATH pattern with video_key, chunk_idx, - Output path follows the DEFAULT_VIDEO_PATH pattern with video_key, chunk_idx,
and file_idx parameters. and file_idx parameters.
- This function uses subprocess to call ffmpeg directly because PyAV doesn't have
built-in support for video concatenation. The concat demuxer in ffmpeg handles
all the complex timestamp adjustments automatically.
""" """
tmp_dir = Path(tempfile.mkdtemp(dir=root)) tmp_dir = Path(tempfile.mkdtemp(dir=root))
@@ -390,78 +424,68 @@ with warnings.catch_warnings():
def get_audio_info(video_path: Path | str) -> dict: def get_audio_info(video_path: Path | str) -> dict:
ffprobe_audio_cmd = [ # Set logging level
"ffprobe", logging.getLogger("libav").setLevel(av.logging.ERROR)
"-v",
"error",
"-select_streams",
"a:0",
"-show_entries",
"stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
"-of",
"json",
str(video_path),
]
result = subprocess.run(ffprobe_audio_cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Error running ffprobe: {result.stderr}")
info = json.loads(result.stdout) # Getting audio stream information
audio_stream_info = info["streams"][0] if info.get("streams") else None audio_info = {}
if audio_stream_info is None: with av.open(str(video_path), "r") as audio_file:
return {"has_audio": False} try:
audio_stream = audio_file.streams.audio[0]
except IndexError:
# Reset logging level
av.logging.restore_default_callback()
return {"has_audio": False}
# Return the information, defaulting to None if no audio stream is present audio_info["audio.channels"] = audio_stream.channels
return { audio_info["audio.codec"] = audio_stream.codec.canonical_name
"has_audio": True, # In an ideal loseless case : bit depth x sample rate x channels = bit rate.
"audio.channels": audio_stream_info.get("channels", None), # In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
"audio.codec": audio_stream_info.get("codec_name", None), audio_info["audio.bit_rate"] = audio_stream.bit_rate
"audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None, audio_info["audio.sample_rate"] = audio_stream.sample_rate # Number of samples per second
"audio.sample_rate": int(audio_stream_info["sample_rate"]) # In an ideal loseless case : fixed number of bits per sample.
if audio_stream_info.get("sample_rate") # In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
else None, audio_info["audio.bit_depth"] = audio_stream.format.bits
"audio.bit_depth": audio_stream_info.get("bit_depth", None), audio_info["audio.channel_layout"] = audio_stream.layout.name
"audio.channel_layout": audio_stream_info.get("channel_layout", None), audio_info["has_audio"] = True
}
# Reset logging level
av.logging.restore_default_callback()
return audio_info
def get_video_info(video_path: Path | str) -> dict: def get_video_info(video_path: Path | str) -> dict:
ffprobe_video_cmd = [ # Set logging level
"ffprobe", logging.getLogger("libav").setLevel(av.logging.ERROR)
"-v",
"error",
"-select_streams",
"v:0",
"-show_entries",
"stream=r_frame_rate,width,height,codec_name,nb_frames,duration,pix_fmt",
"-of",
"json",
str(video_path),
]
result = subprocess.run(ffprobe_video_cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Error running ffprobe: {result.stderr}")
info = json.loads(result.stdout) # Getting video stream information
video_stream_info = info["streams"][0] video_info = {}
with av.open(str(video_path), "r") as video_file:
try:
video_stream = video_file.streams.video[0]
except IndexError:
# Reset logging level
av.logging.restore_default_callback()
return {}
# Calculate fps from r_frame_rate video_info["video.height"] = video_stream.height
r_frame_rate = video_stream_info["r_frame_rate"] video_info["video.width"] = video_stream.width
num, denom = map(int, r_frame_rate.split("/")) video_info["video.codec"] = video_stream.codec.canonical_name
fps = num / denom video_info["video.pix_fmt"] = video_stream.pix_fmt
video_info["video.is_depth_map"] = False
pixel_channels = get_video_pixel_channels(video_stream_info["pix_fmt"]) # Calculate fps from r_frame_rate
video_info["video.fps"] = int(video_stream.base_rate)
video_info = { pixel_channels = get_video_pixel_channels(video_stream.pix_fmt)
"video.fps": fps, video_info["video.channels"] = pixel_channels
"video.height": video_stream_info["height"],
"video.width": video_stream_info["width"], # Reset logging level
"video.channels": pixel_channels, av.logging.restore_default_callback()
"video.codec": video_stream_info["codec_name"],
"video.pix_fmt": video_stream_info["pix_fmt"], # Adding audio stream information
"video.is_depth_map": False, video_info.update(**get_audio_info(video_path))
**get_audio_info(video_path),
}
return video_info return video_info
-10
View File
@@ -140,16 +140,6 @@ def create_stats(stats_factory):
return _create_stats return _create_stats
# @pytest.fixture(scope="session")
# def create_episodes_stats(episodes_stats_factory):
# def _create_episodes_stats(dir: Path, episodes_stats: Dataset | None = None):
# if episodes_stats is None:
# episodes_stats = episodes_stats_factory()
# write_episodes_stats(episodes_stats, dir)
# return _create_episodes_stats
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def create_tasks(tasks_factory): def create_tasks(tasks_factory):
def _create_tasks(dir: Path, tasks: pd.DataFrame | None = None): def _create_tasks(dir: Path, tasks: pd.DataFrame | None = None):