[video processors] Support float fps for precise frame sampling (#39134)
* [video processors] Support float fps for precise frame sampling

  Enable fractional fps values (e.g., 1.5, 29.97) in video processors for more precise frame sampling control.

  - Change fps type from int to float across all video processors
  - Maintain backward compatibility with integer values

  Extends: #38105

* [video processors] Refine fps typing to Union[int, float]

  Change fps type from Optional[float] to Optional[Union[int, float]] for more explicit type information about supporting both integer and floating-point frame rates.

  - Update type hints and docstrings across 8 files
  - Maintain backward compatibility
  - Clarify support for both int and float values

  Extends: #38105

* Revert "[video processors] Support float fps for precise frame sampling"

  This reverts commit 7360d6e661b413ca0239e5ef61f9b1abbeab8e65.
This commit is contained in:
@@ -91,7 +91,7 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
|
||||
do_sample_frames: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
device: Optional["torch.Tensor"] = None,
|
||||
|
||||
@@ -76,7 +76,7 @@ class InternVLVideoProcessor(BaseVideoProcessor):
|
||||
video: "torch.Tensor",
|
||||
metadata: Optional[Union[VideoMetadata, dict]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
initial_shift: Optional[Union[bool, float, int]] = None,
|
||||
):
|
||||
"""
|
||||
@@ -91,7 +91,7 @@ class InternVLVideoProcessor(BaseVideoProcessor):
|
||||
Metadata of the video containing information about total duration, fps and total number of frames.
|
||||
num_frames (`int`, *optional*):
|
||||
Maximum number of frames to sample. Defaults to `self.num_frames`.
|
||||
fps (`int`, *optional*):
|
||||
fps (`int` or `float`, *optional*):
|
||||
Target frames to sample per second. Defaults to `self.fps`.
|
||||
initial_shift (`bool`, `float` or `int`, defaults to `self.initial_shift`):
|
||||
The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.
|
||||
@@ -143,7 +143,7 @@ class InternVLVideoProcessor(BaseVideoProcessor):
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
do_sample_frames: Optional[bool] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
initial_shift: Optional[Union[bool, float, int]] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
|
||||
@@ -31,7 +31,7 @@ from ...video_utils import VideoInput, make_batched_videos
|
||||
|
||||
|
||||
class Qwen2_5_OmniVideosKwargs(VideosKwargs):
|
||||
fps: Optional[list[int]] = None
|
||||
fps: Optional[list[Union[int, float]]] = None
|
||||
use_audio_in_video: Optional[bool] = None
|
||||
seconds_per_chunk: Optional[float] = None
|
||||
position_id_per_seconds: Optional[int] = None
|
||||
|
||||
@@ -127,7 +127,7 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
|
||||
max_frames: int,
|
||||
metadata: Optional[Union[VideoMetadata, dict]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
):
|
||||
"""
|
||||
Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
|
||||
@@ -147,7 +147,7 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
|
||||
Metadata of the video containing information about total duration, fps and total number of frames.
|
||||
num_frames (`int`, *optional*):
|
||||
Maximum number of frames to sample. Defaults to `self.num_frames`.
|
||||
fps (`int`, *optional*):
|
||||
fps (`int` or `float`, *optional*):
|
||||
Target frames to sample per second. Defaults to `self.fps`.
|
||||
|
||||
Returns:
|
||||
@@ -208,7 +208,7 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
|
||||
patch_size: Optional[int] = None,
|
||||
temporal_patch_size: Optional[int] = None,
|
||||
merge_size: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
min_frames: Optional[int] = None,
|
||||
max_frames: Optional[int] = None,
|
||||
|
||||
@@ -249,7 +249,7 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
|
||||
video: "torch.Tensor",
|
||||
metadata: Union[VideoMetadata, dict],
|
||||
num_frames: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
skip_secs: Optional[int] = 1,
|
||||
):
|
||||
"""
|
||||
@@ -266,7 +266,7 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
|
||||
Metadata of the video containing information about total duration, fps and total number of frames.
|
||||
num_frames (`int`, *optional*):
|
||||
Maximum number of frames to sample. Defaults to `self.num_frames`.
|
||||
fps (`int`, *optional*):
|
||||
fps (`int` or `float`, *optional*):
|
||||
Target frames to sample per second. Defaults to `self.fps`.
|
||||
skip_secs (`float`, *optional*, defaults to `1`):
|
||||
Number of seconds to skip from the start and end if the video is long enough.
|
||||
@@ -328,7 +328,7 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
|
||||
do_sample_frames: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
skip_secs: Optional[int] = 0,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
|
||||
@@ -251,7 +251,7 @@ class VideosKwargs(TypedDict, total=False):
|
||||
Metadata of the video containing information about total duration, fps and total number of frames.
|
||||
num_frames (`int`, *optional*):
|
||||
Maximum number of frames to sample when `do_sample_frames=True`.
|
||||
fps (`int`, *optional*):
|
||||
fps (`int` or `float`, *optional*):
|
||||
Target frames to sample per second when `do_sample_frames=True`.
|
||||
crop_size (`dict[str, int]`, *optional*):
|
||||
Desired output size when applying center-cropping.
|
||||
@@ -280,7 +280,7 @@ class VideosKwargs(TypedDict, total=False):
|
||||
device: Optional[str]
|
||||
do_sample_frames: Optional[bool]
|
||||
video_metadata: Optional[Union[VideoMetadata, dict]]
|
||||
fps: Optional[int]
|
||||
fps: Optional[Union[int, float]]
|
||||
num_frames: Optional[int]
|
||||
|
||||
|
||||
|
||||
@@ -125,7 +125,7 @@ BASE_VIDEO_PROCESSOR_DOCSTRING = r"""
|
||||
Whether to sample frames from the video before processing or to process the whole video.
|
||||
num_frames (`int`, *optional*, defaults to `self.num_frames`):
|
||||
Maximum number of frames to sample when `do_sample_frames=True`.
|
||||
fps (`int`, *optional*, defaults to `self.fps`):
|
||||
fps (`int` or `float`, *optional*, defaults to `self.fps`):
|
||||
Target frames to sample per second when `do_sample_frames=True`.
|
||||
return_tensors (`str` or `TensorType`, *optional*):
|
||||
Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
|
||||
@@ -237,7 +237,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
|
||||
video: "torch.Tensor",
|
||||
metadata: Optional[Union[VideoMetadata, dict]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
):
|
||||
"""
|
||||
Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
|
||||
@@ -251,7 +251,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
|
||||
Metadata of the video containing information about total duration, fps and total number of frames.
|
||||
num_frames (`int`, *optional*):
|
||||
Maximum number of frames to sample. Defaults to `self.num_frames`.
|
||||
fps (`int`, *optional*):
|
||||
fps (`int` or `float`, *optional*):
|
||||
Target frames to sample per second. Defaults to `self.fps`.
|
||||
|
||||
Returns:
|
||||
@@ -369,7 +369,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
do_sample_frames: Optional[bool] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
device: Optional["torch.Tensor"] = None,
|
||||
|
||||
@@ -227,7 +227,7 @@ def default_sample_indices_fn(metadata: VideoMetadata, num_frames=None, fps=None
|
||||
`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
|
||||
num_frames (`int`, *optional*):
|
||||
Number of frames to sample uniformly.
|
||||
fps (`int`, *optional*):
|
||||
fps (`int` or `float`, *optional*):
|
||||
Desired frames per second. Takes priority over num_frames if both are provided.
|
||||
|
||||
Returns:
|
||||
@@ -514,7 +514,7 @@ VIDEO_DECODERS = {
|
||||
def load_video(
|
||||
video: Union[str, "VideoInput"],
|
||||
num_frames: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
backend: str = "pyav",
|
||||
sample_indices_fn: Optional[Callable] = None,
|
||||
**kwargs,
|
||||
@@ -527,7 +527,7 @@ def load_video(
|
||||
The video to convert to the numpy array format. Can be a link to video or local path.
|
||||
num_frames (`int`, *optional*):
|
||||
Number of frames to sample uniformly. If not passed, the whole video is loaded.
|
||||
fps (`int`, *optional*):
|
||||
fps (`int` or `float`, *optional*):
|
||||
Number of frames to sample per second. Should be passed only when `num_frames=None`.
|
||||
If not specified and `num_frames==None`, all frames are sampled.
|
||||
backend (`str`, *optional*, defaults to `"pyav"`):
|
||||
|
||||
Reference in New Issue
Block a user