[video processors] Support float fps for precise frame sampling (#39134)

* [video processors] Support float fps for precise frame sampling

Enable fractional fps values (e.g., 1.5, 29.97) in video processors
for more precise frame sampling control.

- Change fps type from int to float across all video processors
- Maintain backward compatibility with integer values

Extends: #38105

* [video processors] Refine fps typing to Union[int, float]

Change fps type from Optional[float] to Optional[Union[int, float]]
for more explicit type information about supporting both integer
and floating-point frame rates.

- Update type hints and docstrings across 8 files
- Maintain backward compatibility
- Clarify support for both int and float values

Extends: #38105

* Revert "[video processors] Support float fps for precise frame sampling"

This reverts commit 7360d6e661b413ca0239e5ef61f9b1abbeab8e65.
This commit is contained in:
zrohyun
2025-07-07 12:43:43 +09:00
committed by GitHub
parent ca7e1a3756
commit b0a8e0b8d7
8 changed files with 20 additions and 20 deletions

View File

@@ -91,7 +91,7 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
do_sample_frames: bool,
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
num_frames: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
device: Optional["torch.Tensor"] = None,

View File

@@ -76,7 +76,7 @@ class InternVLVideoProcessor(BaseVideoProcessor):
video: "torch.Tensor",
metadata: Optional[Union[VideoMetadata, dict]] = None,
num_frames: Optional[int] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
initial_shift: Optional[Union[bool, float, int]] = None,
):
"""
@@ -91,7 +91,7 @@ class InternVLVideoProcessor(BaseVideoProcessor):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
fps (`int` or `float`, *optional*):
Target frames to sample per second. Defaults to `self.fps`.
initial_shift (`bool`, `float` or `int`, defaults to `self.initial_shift`):
The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.
@@ -143,7 +143,7 @@ class InternVLVideoProcessor(BaseVideoProcessor):
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
do_sample_frames: Optional[bool] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
num_frames: Optional[int] = None,
initial_shift: Optional[Union[bool, float, int]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,

View File

@@ -31,7 +31,7 @@ from ...video_utils import VideoInput, make_batched_videos
class Qwen2_5_OmniVideosKwargs(VideosKwargs):
fps: Optional[list[int]] = None
fps: Optional[list[Union[int, float]]] = None
use_audio_in_video: Optional[bool] = None
seconds_per_chunk: Optional[float] = None
position_id_per_seconds: Optional[int] = None

View File

@@ -127,7 +127,7 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
max_frames: int,
metadata: Optional[Union[VideoMetadata, dict]] = None,
num_frames: Optional[int] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
):
"""
Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
@@ -147,7 +147,7 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
fps (`int` or `float`, *optional*):
Target frames to sample per second. Defaults to `self.fps`.
Returns:
@@ -208,7 +208,7 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
patch_size: Optional[int] = None,
temporal_patch_size: Optional[int] = None,
merge_size: Optional[int] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
num_frames: Optional[int] = None,
min_frames: Optional[int] = None,
max_frames: Optional[int] = None,

View File

@@ -249,7 +249,7 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
video: "torch.Tensor",
metadata: Union[VideoMetadata, dict],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
skip_secs: Optional[int] = 1,
):
"""
@@ -266,7 +266,7 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
fps (`int` or `float`, *optional*):
Target frames to sample per second. Defaults to `self.fps`.
skip_secs (`float`, *optional*, defaults to `1`):
Number of seconds to skip from the start and end if the video is long enough.
@@ -328,7 +328,7 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
do_sample_frames: bool,
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
num_frames: Optional[int] = None,
skip_secs: Optional[int] = 0,
return_tensors: Optional[Union[str, TensorType]] = None,

View File

@@ -251,7 +251,7 @@ class VideosKwargs(TypedDict, total=False):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample when `do_sample_frames=True`.
fps (`int`, *optional*):
fps (`int` or `float`, *optional*):
Target frames to sample per second when `do_sample_frames=True`.
crop_size (`dict[str, int]`, *optional*):
Desired output size when applying center-cropping.
@@ -280,7 +280,7 @@ class VideosKwargs(TypedDict, total=False):
device: Optional[str]
do_sample_frames: Optional[bool]
video_metadata: Optional[Union[VideoMetadata, dict]]
fps: Optional[int]
fps: Optional[Union[int, float]]
num_frames: Optional[int]

View File

@@ -125,7 +125,7 @@ BASE_VIDEO_PROCESSOR_DOCSTRING = r"""
Whether to sample frames from the video before processing or to process the whole video.
num_frames (`int`, *optional*, defaults to `self.num_frames`):
Maximum number of frames to sample when `do_sample_frames=True`.
fps (`int`, *optional*, defaults to `self.fps`):
fps (`int` or `float`, *optional*, defaults to `self.fps`):
Target frames to sample per second when `do_sample_frames=True`.
return_tensors (`str` or `TensorType`, *optional*):
Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
@@ -237,7 +237,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
video: "torch.Tensor",
metadata: Optional[Union[VideoMetadata, dict]] = None,
num_frames: Optional[int] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
):
"""
Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
@@ -251,7 +251,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
fps (`int` or `float`, *optional*):
Target frames to sample per second. Defaults to `self.fps`.
Returns:
@@ -369,7 +369,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
do_sample_frames: Optional[bool] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
num_frames: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
device: Optional["torch.Tensor"] = None,

View File

@@ -227,7 +227,7 @@ def default_sample_indices_fn(metadata: VideoMetadata, num_frames=None, fps=None
`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
num_frames (`int`, *optional*):
Number of frames to sample uniformly.
fps (`int`, *optional*):
fps (`int` or `float`, *optional*):
Desired frames per second. Takes priority over num_frames if both are provided.
Returns:
@@ -514,7 +514,7 @@ VIDEO_DECODERS = {
def load_video(
video: Union[str, "VideoInput"],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
fps: Optional[Union[int, float]] = None,
backend: str = "pyav",
sample_indices_fn: Optional[Callable] = None,
**kwargs,
@@ -527,7 +527,7 @@ def load_video(
The video to convert to the numpy array format. Can be a link to video or local path.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not passed, the whole video is loaded.
fps (`int`, *optional*):
fps (`int` or `float`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
backend (`str`, *optional*, defaults to `"pyav"`):